rasdaemon package update
Signed-off-by: basebuilder_pel7ppc64bebuilder0 <basebuilder@powerel.org>master
parent
63332ce77a
commit
9ece713570
|
@ -0,0 +1,38 @@
|
|||
From 5e8fb95e2f6dd3f427e0ae5d7d066aeb6d61fd0f Mon Sep 17 00:00:00 2001
|
||||
From: Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||
Date: Wed, 29 May 2013 21:53:58 -0300
|
||||
Subject: [PATCH 01/32] ras-mc-ctl: Improve error summary to show label and mc
|
||||
|
||||
Both pieces of information are useful to users, even in the summary.
|
||||
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||
---
|
||||
util/ras-mc-ctl.in | 6 +++---
|
||||
1 files changed, 3 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index 32c4edb..5b1ca4d 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -827,15 +827,15 @@ sub summary
|
||||
|
||||
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
|
||||
|
||||
- my $query = "select top_layer,middle_layer,lower_layer, count(*) from mc_event group by top_layer,middle_layer,lower_layer";
|
||||
+ my $query = "select label, mc, top_layer,middle_layer,lower_layer, count(*) from mc_event group by label,mc,top_layer,middle_layer,lower_layer";
|
||||
my $query_handle = $dbh->prepare($query);
|
||||
$query_handle->execute();
|
||||
|
||||
- $query_handle->bind_columns(\my($top, $mid, $low, $count));
|
||||
+ $query_handle->bind_columns(\my($label, $mc, $top, $mid, $low, $count));
|
||||
|
||||
print "Memory controller events summary:\n";
|
||||
while($query_handle->fetch()) {
|
||||
- print "location: $top:$mid:$low errors: $count\n";
|
||||
+ print "DIMM Label(s): '$label' location: $mc:$top:$mid:$low errors: $count\n";
|
||||
}
|
||||
|
||||
$query_handle->finish;
|
||||
--
|
||||
1.7.1
|
||||
|
|
@ -0,0 +1,240 @@
|
|||
From 002238dff53b284c9455554f146176ee8de2de4a Mon Sep 17 00:00:00 2001
|
||||
From: Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||
Date: Fri, 31 May 2013 12:41:01 -0300
|
||||
Subject: [PATCH 02/32] ras-record: make the code more generic
|
||||
|
||||
Now that we're ready to add more tables to the database, make
|
||||
the code that creates and inserts data into the table more
|
||||
generic.
|
||||
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||
---
|
||||
ras-record.c | 173 +++++++++++++++++++++++++++++++++++++---------------------
|
||||
1 files changed, 110 insertions(+), 63 deletions(-)
|
||||
|
||||
diff --git a/ras-record.c b/ras-record.c
|
||||
index 8995c9e..3af0791 100644
|
||||
--- a/ras-record.c
|
||||
+++ b/ras-record.c
|
||||
@@ -28,80 +28,128 @@
|
||||
#include "ras-mc-handler.h"
|
||||
#include "ras-logger.h"
|
||||
|
||||
+/* #define DEBUG_SQL 1 */
|
||||
+
|
||||
#define SQLITE_RAS_DB RASSTATEDIR "/" RAS_DB_FNAME
|
||||
|
||||
-const char *mc_event_db = " mc_event ";
|
||||
-const char *mc_event_db_create_fields = "("
|
||||
- "id INTEGER PRIMARY KEY"
|
||||
- ", timestamp TEXT"
|
||||
- ", err_count INTEGER"
|
||||
- ", err_type TEXT"
|
||||
- ", err_msg TEXT" /* 5 */
|
||||
- ", label TEXT"
|
||||
- ", mc INTEGER"
|
||||
- ", top_layer INTEGER"
|
||||
- ", middle_layer INTEGER"
|
||||
- ", lower_layer INTEGER" /* 10 */
|
||||
- ", address INTEGER"
|
||||
- ", grain INTEGER"
|
||||
- ", syndrome INTEGER"
|
||||
- ", driver_detail TEXT" /* 14 */
|
||||
- ")";
|
||||
-
|
||||
-const char *mc_event_db_fields = "("
|
||||
- "id"
|
||||
- ", timestamp"
|
||||
- ", err_count"
|
||||
- ", err_type"
|
||||
- ", err_msg" /* 5 */
|
||||
- ", label"
|
||||
- ", mc"
|
||||
- ", top_layer"
|
||||
- ", middle_layer"
|
||||
- ", lower_layer" /* 10 */
|
||||
- ", address"
|
||||
- ", grain"
|
||||
- ", syndrome"
|
||||
- ", driver_detail" /* 14 */
|
||||
- ")";
|
||||
-
|
||||
-#define NUM_MC_EVENT_DB_VALUES 14
|
||||
-
|
||||
-const char *createdb = "CREATE TABLE IF NOT EXISTS";
|
||||
+
|
||||
+#define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x)))
|
||||
+
|
||||
+struct db_fields {
|
||||
+ char *name;
|
||||
+ char *type;
|
||||
+};
|
||||
+
|
||||
+struct db_table_descriptor {
|
||||
+ char *name;
|
||||
+ const struct db_fields *fields;
|
||||
+ size_t num_fields;
|
||||
+};
|
||||
+
|
||||
+static const struct db_fields mc_event_fields[] = {
|
||||
+ { .name="id", .type="INTEGER PRIMARY KEY" },
|
||||
+ { .name="timestamp", .type="TEXT" },
|
||||
+ { .name="err_count", .type="INTEGER" },
|
||||
+ { .name="err_type", .type="TEXT" },
|
||||
+ { .name="err_msg", .type="TEXT" },
|
||||
+ { .name="label", .type="TEXT" },
|
||||
+ { .name="mc", .type="INTEGER" },
|
||||
+ { .name="top_layer", .type="INTEGER" },
|
||||
+ { .name="middle_layer", .type="INTEGER" },
|
||||
+ { .name="lower_layer", .type="INTEGER" },
|
||||
+ { .name="address", .type="INTEGER" },
|
||||
+ { .name="grain", .type="INTEGER" },
|
||||
+ { .name="syndrome", .type="INTEGER" },
|
||||
+ { .name="driver_detail", .type="TEXT" },
|
||||
+};
|
||||
+
|
||||
+static const struct db_table_descriptor mc_event_tab = {
|
||||
+ .name = "mc_event",
|
||||
+ .fields = mc_event_fields,
|
||||
+ .num_fields = ARRAY_SIZE(mc_event_fields),
|
||||
+};
|
||||
+
|
||||
const char *insertdb = "INSERT INTO";
|
||||
const char *valuesdb = " VALUES ";
|
||||
|
||||
-static int ras_mc_prepare_stmt(struct sqlite3_priv *priv)
|
||||
+static int ras_mc_prepare_stmt(struct sqlite3_priv *priv,
|
||||
+ sqlite3_stmt **stmt,
|
||||
+ const struct db_table_descriptor *db_tab)
|
||||
+
|
||||
{
|
||||
int i, rc;
|
||||
- char sql[1024];
|
||||
+ char sql[1024], *p = sql, *end = sql + sizeof(sql);
|
||||
+ const struct db_fields *field;
|
||||
+
|
||||
+ p += snprintf(p, end - p, "INSERT INTO %s (",
|
||||
+ db_tab->name);
|
||||
+
|
||||
+ for (i = 0; i < db_tab->num_fields; i++) {
|
||||
+ field = &db_tab->fields[i];
|
||||
+ p += snprintf(p, end - p, "%s", field->name);
|
||||
+
|
||||
+ if (i < db_tab->num_fields - 1)
|
||||
+ p += snprintf(p, end - p, ", ");
|
||||
+ }
|
||||
|
||||
- strcpy(sql, insertdb);
|
||||
- strcat(sql, mc_event_db);
|
||||
- strcat(sql, mc_event_db_fields);
|
||||
- strcat(sql, valuesdb);
|
||||
+ p += snprintf(p, end - p, ") VALUES ( NULL, ");
|
||||
|
||||
- strcat(sql, "(NULL, "); /* Auto-increment field */
|
||||
- for (i = 1; i < NUM_MC_EVENT_DB_VALUES; i++) {
|
||||
- if (i < NUM_MC_EVENT_DB_VALUES - 1)
|
||||
+ for (i = 1; i < db_tab->num_fields; i++) {
|
||||
+ if (i < db_tab->num_fields - 1)
|
||||
strcat(sql, "?, ");
|
||||
else
|
||||
strcat(sql, "?)");
|
||||
}
|
||||
|
||||
- rc = sqlite3_prepare_v2(priv->db, sql, -1, &priv->stmt, NULL);
|
||||
+#ifdef DEBUG_SQL
|
||||
+ log(TERM, LOG_INFO, "SQL: %s\n", sql);
|
||||
+#endif
|
||||
+
|
||||
+ rc = sqlite3_prepare_v2(priv->db, sql, -1, stmt, NULL);
|
||||
if (rc != SQLITE_OK)
|
||||
- log(TERM, LOG_ERR, "Failed to prepare insert db on %s: error = %s\n",
|
||||
- SQLITE_RAS_DB, sqlite3_errmsg(priv->db));
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "Failed to prepare insert db at table %s (db %s): error = %s\n",
|
||||
+ db_tab->name, SQLITE_RAS_DB, sqlite3_errmsg(priv->db));
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
+static int ras_mc_create_table(struct sqlite3_priv *priv,
|
||||
+ const struct db_table_descriptor *db_tab)
|
||||
+{
|
||||
+ const struct db_fields *field;
|
||||
+ char sql[1024], *p = sql, *end = sql + sizeof(sql);
|
||||
+ int i,rc;
|
||||
+
|
||||
+ p += snprintf(p, end - p, "CREATE TABLE IF NOT EXISTS %s (",
|
||||
+ db_tab->name);
|
||||
+
|
||||
+ for (i = 0; i < db_tab->num_fields; i++) {
|
||||
+ field = &db_tab->fields[i];
|
||||
+ p += snprintf(p, end - p, "%s %s", field->name, field->type);
|
||||
+
|
||||
+ if (i < db_tab->num_fields - 1)
|
||||
+ p += snprintf(p, end - p, ", ");
|
||||
+ }
|
||||
+ p += snprintf(p, end - p, ")");
|
||||
+
|
||||
+#ifdef DEBUG_SQL
|
||||
+ log(TERM, LOG_INFO, "SQL: %s\n", sql);
|
||||
+#endif
|
||||
+
|
||||
+ rc = sqlite3_exec(priv->db, sql, NULL, NULL, NULL);
|
||||
+ if (rc != SQLITE_OK) {
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "Failed to create table %s on %s: error = %d\n",
|
||||
+ db_tab->name, SQLITE_RAS_DB, rc);
|
||||
+ }
|
||||
+ return rc;
|
||||
+}
|
||||
+
|
||||
int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras)
|
||||
{
|
||||
int rc;
|
||||
sqlite3 *db;
|
||||
- char sql[1024];
|
||||
struct sqlite3_priv *priv;
|
||||
|
||||
printf("Calling %s()\n", __FUNCTION__);
|
||||
@@ -137,27 +185,26 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras)
|
||||
free(priv);
|
||||
return -1;
|
||||
}
|
||||
+ priv->db = db;
|
||||
|
||||
- strcpy(sql, createdb);
|
||||
- strcat(sql, mc_event_db);
|
||||
- strcat(sql, mc_event_db_create_fields);
|
||||
- rc = sqlite3_exec(db, sql, NULL, NULL, NULL);
|
||||
+ rc = ras_mc_create_table(priv, &mc_event_tab);
|
||||
if (rc != SQLITE_OK) {
|
||||
- log(TERM, LOG_ERR,
|
||||
- "cpu %u: Failed to create db on %s: error = %d\n",
|
||||
- cpu, SQLITE_RAS_DB, rc);
|
||||
+ sqlite3_close(db);
|
||||
free(priv);
|
||||
return -1;
|
||||
}
|
||||
|
||||
- priv->db = db;
|
||||
- ras->db_priv = priv;
|
||||
-
|
||||
- rc = ras_mc_prepare_stmt(priv);
|
||||
- if (rc == SQLITE_OK)
|
||||
+ rc = ras_mc_prepare_stmt(priv, &priv->stmt, &mc_event_tab);
|
||||
+ if (rc == SQLITE_OK) {
|
||||
log(TERM, LOG_INFO,
|
||||
"cpu %u: Recording events at %s\n",
|
||||
cpu, SQLITE_RAS_DB);
|
||||
+ ras->db_priv = priv;
|
||||
+ } else {
|
||||
+ sqlite3_close(db);
|
||||
+ free(priv);
|
||||
+ return -1;
|
||||
+ }
|
||||
|
||||
return 0;
|
||||
}
|
||||
--
|
||||
1.7.1
|
||||
|
|
@ -0,0 +1,97 @@
|
|||
From 016802f4093e80971a52c590c661a04924cb9aa3 Mon Sep 17 00:00:00 2001
|
||||
From: Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||
Date: Fri, 31 May 2013 13:10:16 -0300
|
||||
Subject: [PATCH 03/32] ras-record: rename stmt to stmt_mc_event
|
||||
|
||||
This stmt is used only for mc_event. So, rename it, as we'll be
|
||||
adding other stmts for the other tables.
|
||||
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||
---
|
||||
ras-record.c | 46 ++++++++++++++++++++++++----------------------
|
||||
ras-record.h | 2 +-
|
||||
2 files changed, 25 insertions(+), 23 deletions(-)
|
||||
|
||||
diff --git a/ras-record.c b/ras-record.c
|
||||
index 3af0791..efcd78f 100644
|
||||
--- a/ras-record.c
|
||||
+++ b/ras-record.c
|
||||
@@ -194,7 +194,7 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras)
|
||||
return -1;
|
||||
}
|
||||
|
||||
- rc = ras_mc_prepare_stmt(priv, &priv->stmt, &mc_event_tab);
|
||||
+ rc = ras_mc_prepare_stmt(priv, &priv->stmt_mc_event, &mc_event_tab);
|
||||
if (rc == SQLITE_OK) {
|
||||
log(TERM, LOG_INFO,
|
||||
"cpu %u: Recording events at %s\n",
|
||||
@@ -214,30 +214,32 @@ int ras_store_mc_event(struct ras_events *ras, struct ras_mc_event *ev)
|
||||
int rc;
|
||||
struct sqlite3_priv *priv = ras->db_priv;
|
||||
|
||||
- if (!priv || !priv->stmt)
|
||||
+ if (!priv || !priv->stmt_mc_event)
|
||||
return 0;
|
||||
- log(TERM, LOG_INFO, "mc_event store: %p\n", priv->stmt);
|
||||
-
|
||||
- sqlite3_bind_text(priv->stmt, 1, ev->timestamp, -1, NULL);
|
||||
- sqlite3_bind_int (priv->stmt, 2, ev->error_count);
|
||||
- sqlite3_bind_text(priv->stmt, 3, ev->error_type, -1, NULL);
|
||||
- sqlite3_bind_text(priv->stmt, 4, ev->msg, -1, NULL);
|
||||
- sqlite3_bind_text(priv->stmt, 5, ev->label, -1, NULL);
|
||||
- sqlite3_bind_int (priv->stmt, 6, ev->mc_index);
|
||||
- sqlite3_bind_int (priv->stmt, 7, ev->top_layer);
|
||||
- sqlite3_bind_int (priv->stmt, 8, ev->middle_layer);
|
||||
- sqlite3_bind_int (priv->stmt, 9, ev->lower_layer);
|
||||
- sqlite3_bind_int (priv->stmt, 10, ev->address);
|
||||
- sqlite3_bind_int (priv->stmt, 11, ev->grain);
|
||||
- sqlite3_bind_int (priv->stmt, 12, ev->syndrome);
|
||||
- sqlite3_bind_text(priv->stmt, 13, ev->driver_detail, -1, NULL);
|
||||
- rc = sqlite3_step(priv->stmt);
|
||||
+ log(TERM, LOG_INFO, "mc_event store: %p\n", priv->stmt_mc_event);
|
||||
+
|
||||
+ sqlite3_bind_text(priv->stmt_mc_event, 1, ev->timestamp, -1, NULL);
|
||||
+ sqlite3_bind_int (priv->stmt_mc_event, 2, ev->error_count);
|
||||
+ sqlite3_bind_text(priv->stmt_mc_event, 3, ev->error_type, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_mc_event, 4, ev->msg, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_mc_event, 5, ev->label, -1, NULL);
|
||||
+ sqlite3_bind_int (priv->stmt_mc_event, 6, ev->mc_index);
|
||||
+ sqlite3_bind_int (priv->stmt_mc_event, 7, ev->top_layer);
|
||||
+ sqlite3_bind_int (priv->stmt_mc_event, 8, ev->middle_layer);
|
||||
+ sqlite3_bind_int (priv->stmt_mc_event, 9, ev->lower_layer);
|
||||
+ sqlite3_bind_int (priv->stmt_mc_event, 10, ev->address);
|
||||
+ sqlite3_bind_int (priv->stmt_mc_event, 11, ev->grain);
|
||||
+ sqlite3_bind_int (priv->stmt_mc_event, 12, ev->syndrome);
|
||||
+ sqlite3_bind_text(priv->stmt_mc_event, 13, ev->driver_detail, -1, NULL);
|
||||
+ rc = sqlite3_step(priv->stmt_mc_event);
|
||||
if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
- log(TERM, LOG_ERR, "Failed to do mc_event step on sqlite: error = %d\n", rc);
|
||||
- rc = sqlite3_reset(priv->stmt);
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "Failed to do mc_event step on sqlite: error = %d\n", rc);
|
||||
+ rc = sqlite3_reset(priv->stmt_mc_event);
|
||||
if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
- log(TERM, LOG_ERR, "Failed reset mc_event on sqlite: error = %d\n",
|
||||
- rc);
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "Failed reset mc_event on sqlite: error = %d\n",
|
||||
+ rc);
|
||||
log(TERM, LOG_INFO, "register inserted at db\n");
|
||||
|
||||
return rc;
|
||||
diff --git a/ras-record.h b/ras-record.h
|
||||
index 20c327f..9791185 100644
|
||||
--- a/ras-record.h
|
||||
+++ b/ras-record.h
|
||||
@@ -46,7 +46,7 @@ struct ras_aer_event {
|
||||
|
||||
struct sqlite3_priv {
|
||||
sqlite3 *db;
|
||||
- sqlite3_stmt *stmt;
|
||||
+ sqlite3_stmt *stmt_mc_event;
|
||||
};
|
||||
|
||||
int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras);
|
||||
--
|
||||
1.7.1
|
||||
|
|
@ -0,0 +1,114 @@
|
|||
From 4474f696c9207ceb21d55a0047ab6871879afe5a Mon Sep 17 00:00:00 2001
|
||||
From: Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||
Date: Fri, 31 May 2013 13:51:55 -0300
|
||||
Subject: [PATCH 04/32] ras-record: reorder functions
|
||||
|
||||
No functional changes
|
||||
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||
---
|
||||
ras-record.c | 77 +++++++++++++++++++++++++++++----------------------------
|
||||
1 files changed, 39 insertions(+), 38 deletions(-)
|
||||
|
||||
diff --git a/ras-record.c b/ras-record.c
|
||||
index efcd78f..298977e 100644
|
||||
--- a/ras-record.c
|
||||
+++ b/ras-record.c
|
||||
@@ -46,6 +46,10 @@ struct db_table_descriptor {
|
||||
size_t num_fields;
|
||||
};
|
||||
|
||||
+/*
|
||||
+ * Table and functions to handle ras:mc_event
|
||||
+ */
|
||||
+
|
||||
static const struct db_fields mc_event_fields[] = {
|
||||
{ .name="id", .type="INTEGER PRIMARY KEY" },
|
||||
{ .name="timestamp", .type="TEXT" },
|
||||
@@ -69,8 +73,41 @@ static const struct db_table_descriptor mc_event_tab = {
|
||||
.num_fields = ARRAY_SIZE(mc_event_fields),
|
||||
};
|
||||
|
||||
-const char *insertdb = "INSERT INTO";
|
||||
-const char *valuesdb = " VALUES ";
|
||||
+int ras_store_mc_event(struct ras_events *ras, struct ras_mc_event *ev)
|
||||
+{
|
||||
+ int rc;
|
||||
+ struct sqlite3_priv *priv = ras->db_priv;
|
||||
+
|
||||
+ if (!priv || !priv->stmt_mc_event)
|
||||
+ return 0;
|
||||
+ log(TERM, LOG_INFO, "mc_event store: %p\n", priv->stmt_mc_event);
|
||||
+
|
||||
+ sqlite3_bind_text(priv->stmt_mc_event, 1, ev->timestamp, -1, NULL);
|
||||
+ sqlite3_bind_int (priv->stmt_mc_event, 2, ev->error_count);
|
||||
+ sqlite3_bind_text(priv->stmt_mc_event, 3, ev->error_type, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_mc_event, 4, ev->msg, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_mc_event, 5, ev->label, -1, NULL);
|
||||
+ sqlite3_bind_int (priv->stmt_mc_event, 6, ev->mc_index);
|
||||
+ sqlite3_bind_int (priv->stmt_mc_event, 7, ev->top_layer);
|
||||
+ sqlite3_bind_int (priv->stmt_mc_event, 8, ev->middle_layer);
|
||||
+ sqlite3_bind_int (priv->stmt_mc_event, 9, ev->lower_layer);
|
||||
+ sqlite3_bind_int (priv->stmt_mc_event, 10, ev->address);
|
||||
+ sqlite3_bind_int (priv->stmt_mc_event, 11, ev->grain);
|
||||
+ sqlite3_bind_int (priv->stmt_mc_event, 12, ev->syndrome);
|
||||
+ sqlite3_bind_text(priv->stmt_mc_event, 13, ev->driver_detail, -1, NULL);
|
||||
+ rc = sqlite3_step(priv->stmt_mc_event);
|
||||
+ if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "Failed to do mc_event step on sqlite: error = %d\n", rc);
|
||||
+ rc = sqlite3_reset(priv->stmt_mc_event);
|
||||
+ if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "Failed reset mc_event on sqlite: error = %d\n",
|
||||
+ rc);
|
||||
+ log(TERM, LOG_INFO, "register inserted at db\n");
|
||||
+
|
||||
+ return rc;
|
||||
+}
|
||||
|
||||
static int ras_mc_prepare_stmt(struct sqlite3_priv *priv,
|
||||
sqlite3_stmt **stmt,
|
||||
@@ -208,39 +245,3 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras)
|
||||
|
||||
return 0;
|
||||
}
|
||||
-
|
||||
-int ras_store_mc_event(struct ras_events *ras, struct ras_mc_event *ev)
|
||||
-{
|
||||
- int rc;
|
||||
- struct sqlite3_priv *priv = ras->db_priv;
|
||||
-
|
||||
- if (!priv || !priv->stmt_mc_event)
|
||||
- return 0;
|
||||
- log(TERM, LOG_INFO, "mc_event store: %p\n", priv->stmt_mc_event);
|
||||
-
|
||||
- sqlite3_bind_text(priv->stmt_mc_event, 1, ev->timestamp, -1, NULL);
|
||||
- sqlite3_bind_int (priv->stmt_mc_event, 2, ev->error_count);
|
||||
- sqlite3_bind_text(priv->stmt_mc_event, 3, ev->error_type, -1, NULL);
|
||||
- sqlite3_bind_text(priv->stmt_mc_event, 4, ev->msg, -1, NULL);
|
||||
- sqlite3_bind_text(priv->stmt_mc_event, 5, ev->label, -1, NULL);
|
||||
- sqlite3_bind_int (priv->stmt_mc_event, 6, ev->mc_index);
|
||||
- sqlite3_bind_int (priv->stmt_mc_event, 7, ev->top_layer);
|
||||
- sqlite3_bind_int (priv->stmt_mc_event, 8, ev->middle_layer);
|
||||
- sqlite3_bind_int (priv->stmt_mc_event, 9, ev->lower_layer);
|
||||
- sqlite3_bind_int (priv->stmt_mc_event, 10, ev->address);
|
||||
- sqlite3_bind_int (priv->stmt_mc_event, 11, ev->grain);
|
||||
- sqlite3_bind_int (priv->stmt_mc_event, 12, ev->syndrome);
|
||||
- sqlite3_bind_text(priv->stmt_mc_event, 13, ev->driver_detail, -1, NULL);
|
||||
- rc = sqlite3_step(priv->stmt_mc_event);
|
||||
- if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
- log(TERM, LOG_ERR,
|
||||
- "Failed to do mc_event step on sqlite: error = %d\n", rc);
|
||||
- rc = sqlite3_reset(priv->stmt_mc_event);
|
||||
- if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
- log(TERM, LOG_ERR,
|
||||
- "Failed reset mc_event on sqlite: error = %d\n",
|
||||
- rc);
|
||||
- log(TERM, LOG_INFO, "register inserted at db\n");
|
||||
-
|
||||
- return rc;
|
||||
-}
|
||||
--
|
||||
1.7.1
|
||||
|
|
@ -0,0 +1,60 @@
|
|||
From 93217061a4b1dc7f287f2715aadc621d2c00425d Mon Sep 17 00:00:00 2001
|
||||
From: Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||
Date: Fri, 31 May 2013 13:53:18 -0300
|
||||
Subject: [PATCH 05/32] ras-record: Make the code easier to add support for other tables
|
||||
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||
---
|
||||
ras-record.c | 25 ++++++++-----------------
|
||||
1 files changed, 8 insertions(+), 17 deletions(-)
|
||||
|
||||
diff --git a/ras-record.c b/ras-record.c
|
||||
index 298977e..36b3373 100644
|
||||
--- a/ras-record.c
|
||||
+++ b/ras-record.c
|
||||
@@ -143,10 +143,14 @@ static int ras_mc_prepare_stmt(struct sqlite3_priv *priv,
|
||||
#endif
|
||||
|
||||
rc = sqlite3_prepare_v2(priv->db, sql, -1, stmt, NULL);
|
||||
- if (rc != SQLITE_OK)
|
||||
+ if (rc != SQLITE_OK) {
|
||||
log(TERM, LOG_ERR,
|
||||
"Failed to prepare insert db at table %s (db %s): error = %s\n",
|
||||
db_tab->name, SQLITE_RAS_DB, sqlite3_errmsg(priv->db));
|
||||
+ stmt = NULL;
|
||||
+ } else {
|
||||
+ log(TERM, LOG_INFO, "Recording %s events\n", db_tab->name);
|
||||
+ }
|
||||
|
||||
return rc;
|
||||
}
|
||||
@@ -225,23 +229,10 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras)
|
||||
priv->db = db;
|
||||
|
||||
rc = ras_mc_create_table(priv, &mc_event_tab);
|
||||
- if (rc != SQLITE_OK) {
|
||||
- sqlite3_close(db);
|
||||
- free(priv);
|
||||
- return -1;
|
||||
- }
|
||||
+ if (rc == SQLITE_OK)
|
||||
+ rc = ras_mc_prepare_stmt(priv, &priv->stmt_mc_event, &mc_event_tab);
|
||||
|
||||
- rc = ras_mc_prepare_stmt(priv, &priv->stmt_mc_event, &mc_event_tab);
|
||||
- if (rc == SQLITE_OK) {
|
||||
- log(TERM, LOG_INFO,
|
||||
- "cpu %u: Recording events at %s\n",
|
||||
- cpu, SQLITE_RAS_DB);
|
||||
- ras->db_priv = priv;
|
||||
- } else {
|
||||
- sqlite3_close(db);
|
||||
- free(priv);
|
||||
- return -1;
|
||||
- }
|
||||
|
||||
+ ras->db_priv = priv;
|
||||
return 0;
|
||||
}
|
||||
--
|
||||
1.7.1
|
||||
|
|
@ -0,0 +1,141 @@
|
|||
From 11004aaa98865dd7c0ee28b4af8d6ba6b6f11507 Mon Sep 17 00:00:00 2001
|
||||
From: Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||
Date: Fri, 31 May 2013 13:54:11 -0300
|
||||
Subject: [PATCH 06/32] Add support to record AER events
|
||||
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||
---
|
||||
ras-aer-handler.c | 4 ++-
|
||||
ras-record.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
|
||||
ras-record.h | 6 +++++
|
||||
3 files changed, 68 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ras-aer-handler.c b/ras-aer-handler.c
|
||||
index ec63e2a..e5abaca 100644
|
||||
--- a/ras-aer-handler.c
|
||||
+++ b/ras-aer-handler.c
|
||||
@@ -111,7 +111,9 @@ int ras_aer_event_handler(struct trace_seq *s,
|
||||
trace_seq_puts(s, ev.error_type);
|
||||
|
||||
/* Insert data into the SGBD */
|
||||
-// ras_store_aer_event(ras, &ev);
|
||||
+#ifdef HAVE_SQLITE3
|
||||
+ ras_store_aer_event(ras, &ev);
|
||||
+#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
diff --git a/ras-record.c b/ras-record.c
|
||||
index 36b3373..cb302ce 100644
|
||||
--- a/ras-record.c
|
||||
+++ b/ras-record.c
|
||||
@@ -26,6 +26,7 @@
|
||||
#include <unistd.h>
|
||||
#include "ras-events.h"
|
||||
#include "ras-mc-handler.h"
|
||||
+#include "ras-aer-handler.h"
|
||||
#include "ras-logger.h"
|
||||
|
||||
/* #define DEBUG_SQL 1 */
|
||||
@@ -109,6 +110,56 @@ int ras_store_mc_event(struct ras_events *ras, struct ras_mc_event *ev)
|
||||
return rc;
|
||||
}
|
||||
|
||||
+/*
|
||||
+ * Table and functions to handle ras:aer
|
||||
+ */
|
||||
+
|
||||
+#ifdef HAVE_AER
|
||||
+static const struct db_fields aer_event_fields[] = {
|
||||
+ { .name="id", .type="INTEGER PRIMARY KEY" },
|
||||
+ { .name="timestamp", .type="TEXT" },
|
||||
+ { .name="err_type", .type="TEXT" },
|
||||
+ { .name="err_msg", .type="TEXT" },
|
||||
+};
|
||||
+
|
||||
+static const struct db_table_descriptor aer_event_tab = {
|
||||
+ .name = "aer_event",
|
||||
+ .fields = aer_event_fields,
|
||||
+ .num_fields = ARRAY_SIZE(aer_event_fields),
|
||||
+};
|
||||
+
|
||||
+int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev)
|
||||
+{
|
||||
+ int rc;
|
||||
+ struct sqlite3_priv *priv = ras->db_priv;
|
||||
+
|
||||
+ if (!priv || !priv->stmt_aer_event)
|
||||
+ return 0;
|
||||
+ log(TERM, LOG_INFO, "mc_event store: %p\n", priv->stmt_aer_event);
|
||||
+
|
||||
+ sqlite3_bind_text(priv->stmt_aer_event, 1, ev->timestamp, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_aer_event, 3, ev->error_type, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_aer_event, 4, ev->msg, -1, NULL);
|
||||
+
|
||||
+ rc = sqlite3_step(priv->stmt_aer_event);
|
||||
+ if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "Failed to do aer_event step on sqlite: error = %d\n", rc);
|
||||
+ rc = sqlite3_reset(priv->stmt_aer_event);
|
||||
+ if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "Failed reset aer_event on sqlite: error = %d\n",
|
||||
+ rc);
|
||||
+ log(TERM, LOG_INFO, "register inserted at db\n");
|
||||
+
|
||||
+ return rc;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
+/*
|
||||
+ * Generic code
|
||||
+ */
|
||||
+
|
||||
static int ras_mc_prepare_stmt(struct sqlite3_priv *priv,
|
||||
sqlite3_stmt **stmt,
|
||||
const struct db_table_descriptor *db_tab)
|
||||
@@ -230,8 +281,15 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras)
|
||||
|
||||
rc = ras_mc_create_table(priv, &mc_event_tab);
|
||||
if (rc == SQLITE_OK)
|
||||
- rc = ras_mc_prepare_stmt(priv, &priv->stmt_mc_event, &mc_event_tab);
|
||||
+ rc = ras_mc_prepare_stmt(priv, &priv->stmt_mc_event,
|
||||
+ &mc_event_tab);
|
||||
|
||||
+#ifdef HAVE_AER
|
||||
+ rc = ras_mc_create_table(priv, &aer_event_tab);
|
||||
+ if (rc == SQLITE_OK)
|
||||
+ rc = ras_mc_prepare_stmt(priv, &priv->stmt_aer_event,
|
||||
+ &aer_event_tab);
|
||||
+#endif
|
||||
|
||||
ras->db_priv = priv;
|
||||
return 0;
|
||||
diff --git a/ras-record.h b/ras-record.h
|
||||
index 9791185..5008906 100644
|
||||
--- a/ras-record.h
|
||||
+++ b/ras-record.h
|
||||
@@ -47,14 +47,20 @@ struct ras_aer_event {
|
||||
struct sqlite3_priv {
|
||||
sqlite3 *db;
|
||||
sqlite3_stmt *stmt_mc_event;
|
||||
+#ifdef HAVE_AER
|
||||
+ sqlite3_stmt *stmt_aer_event;
|
||||
+#endif
|
||||
};
|
||||
|
||||
int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras);
|
||||
int ras_store_mc_event(struct ras_events *ras, struct ras_mc_event *ev);
|
||||
+int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev);
|
||||
|
||||
#else
|
||||
static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; };
|
||||
static inline int ras_store_mc_event(struct ras_events *ras, struct ras_mc_event *ev) { return 0; };
|
||||
+static inline int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev) { return 0; };
|
||||
+
|
||||
#endif
|
||||
|
||||
#endif
|
||||
--
|
||||
1.7.1
|
||||
|
|
@ -0,0 +1,202 @@
|
|||
From 0a31d938cf29e065e96de1206a7d35042962e02a Mon Sep 17 00:00:00 2001
|
||||
From: Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||
Date: Fri, 31 May 2013 14:18:24 -0300
|
||||
Subject: [PATCH 07/32] Add support to store MCE events at the database
|
||||
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||
---
|
||||
ras-mce-handler.c | 5 +++
|
||||
ras-record.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++-
|
||||
ras-record.h | 9 +++++
|
||||
3 files changed, 116 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ras-mce-handler.c b/ras-mce-handler.c
|
||||
index 614a0eb..59e8d05 100644
|
||||
--- a/ras-mce-handler.c
|
||||
+++ b/ras-mce-handler.c
|
||||
@@ -396,5 +396,10 @@ int ras_mce_event_handler(struct trace_seq *s,
|
||||
return rc;
|
||||
|
||||
report_mce_event(ras, record, s, &e);
|
||||
+
|
||||
+#ifdef HAVE_SQLITE3
|
||||
+ ras_store_mce_record(ras, &e);
|
||||
+#endif
|
||||
+
|
||||
return 0;
|
||||
}
|
||||
diff --git a/ras-record.c b/ras-record.c
|
||||
index cb302ce..daa3cb1 100644
|
||||
--- a/ras-record.c
|
||||
+++ b/ras-record.c
|
||||
@@ -27,6 +27,7 @@
|
||||
#include "ras-events.h"
|
||||
#include "ras-mc-handler.h"
|
||||
#include "ras-aer-handler.h"
|
||||
+#include "ras-mce-handler.h"
|
||||
#include "ras-logger.h"
|
||||
|
||||
/* #define DEBUG_SQL 1 */
|
||||
@@ -135,7 +136,7 @@ int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev)
|
||||
|
||||
if (!priv || !priv->stmt_aer_event)
|
||||
return 0;
|
||||
- log(TERM, LOG_INFO, "mc_event store: %p\n", priv->stmt_aer_event);
|
||||
+ log(TERM, LOG_INFO, "aer_event store: %p\n", priv->stmt_aer_event);
|
||||
|
||||
sqlite3_bind_text(priv->stmt_aer_event, 1, ev->timestamp, -1, NULL);
|
||||
sqlite3_bind_text(priv->stmt_aer_event, 3, ev->error_type, -1, NULL);
|
||||
@@ -156,6 +157,98 @@ int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev)
|
||||
}
|
||||
#endif
|
||||
|
||||
+
|
||||
+/*
|
||||
+ * Table and functions to handle mce:mce_record
|
||||
+ */
|
||||
+
|
||||
+#ifdef HAVE_MCE
|
||||
+static const struct db_fields mce_record_fields[] = {
|
||||
+ { .name="id", .type="INTEGER PRIMARY KEY" },
|
||||
+ { .name="timestamp", .type="TEXT" },
|
||||
+
|
||||
+ /* MCE registers */
|
||||
+ { .name="mcgcap", .type="INTEGER" },
|
||||
+ { .name="mcgstatus", .type="INTEGER" },
|
||||
+ { .name="status", .type="INTEGER" },
|
||||
+ { .name="addr", .type="INTEGER" }, // 5
|
||||
+ { .name="misc", .type="INTEGER" },
|
||||
+ { .name="ip", .type="INTEGER" },
|
||||
+ { .name="tsc", .type="INTEGER" },
|
||||
+ { .name="walltime", .type="INTEGER" },
|
||||
+ { .name="cpu", .type="INTEGER" }, // 10
|
||||
+ { .name="cpuid", .type="INTEGER" },
|
||||
+ { .name="apicid", .type="INTEGER" },
|
||||
+ { .name="socketid", .type="INTEGER" },
|
||||
+ { .name="cs", .type="INTEGER" },
|
||||
+ { .name="bank", .type="INTEGER" }, //15
|
||||
+ { .name="cpuvendor", .type="INTEGER" },
|
||||
+
|
||||
+ /* Parsed data - will likely change */
|
||||
+ { .name="bank_name", .type="TEXT" },
|
||||
+ { .name="error_msg", .type="TEXT" },
|
||||
+ { .name="mcgstatus_msg", .type="TEXT" },
|
||||
+ { .name="mcistatus_msg", .type="TEXT" }, // 20
|
||||
+ { .name="user_action", .type="TEXT" },
|
||||
+ { .name="mc_location", .type="TEXT" },
|
||||
+};
|
||||
+
|
||||
+static const struct db_table_descriptor mce_record_tab = {
|
||||
+ .name = "mce_record",
|
||||
+ .fields = mce_record_fields,
|
||||
+ .num_fields = ARRAY_SIZE(mce_record_fields),
|
||||
+};
|
||||
+
|
||||
+int ras_store_mce_record(struct ras_events *ras, struct mce_event *ev)
|
||||
+{
|
||||
+ int rc;
|
||||
+ struct sqlite3_priv *priv = ras->db_priv;
|
||||
+
|
||||
+ if (!priv || !priv->stmt_mce_record)
|
||||
+ return 0;
|
||||
+ log(TERM, LOG_INFO, "mce_record store: %p\n", priv->stmt_mce_record);
|
||||
+
|
||||
+ sqlite3_bind_text(priv->stmt_mce_record, 1, ev->timestamp, -1, NULL);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 2, ev->mcgcap);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 3, ev->mcgstatus);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 4, ev->status);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 5, ev->addr);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 6, ev->misc);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 7, ev->ip);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 8, ev->tsc);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 9, ev->walltime);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 10, ev->cpu);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 11, ev->cpuid);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 12, ev->apicid);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 13, ev->socketid);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 14, ev->cs);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 15, ev->bank);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 16, ev->cpuvendor);
|
||||
+
|
||||
+ sqlite3_bind_text(priv->stmt_mce_record, 17, ev->bank_name, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_mce_record, 18, ev->error_msg, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_mce_record, 19, ev->mcgstatus_msg, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_mce_record, 20, ev->mcistatus_msg, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_mce_record, 21, ev->mcastatus_msg, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_mce_record, 22, ev->user_action, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_mce_record, 23, ev->mc_location, -1, NULL);
|
||||
+
|
||||
+ rc = sqlite3_step(priv->stmt_mce_record);
|
||||
+ if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "Failed to do mce_record step on sqlite: error = %d\n", rc);
|
||||
+ rc = sqlite3_reset(priv->stmt_mce_record);
|
||||
+ if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "Failed reset mce_record on sqlite: error = %d\n",
|
||||
+ rc);
|
||||
+ log(TERM, LOG_INFO, "register inserted at db\n");
|
||||
+
|
||||
+ return rc;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
+
|
||||
/*
|
||||
* Generic code
|
||||
*/
|
||||
@@ -291,6 +384,13 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras)
|
||||
&aer_event_tab);
|
||||
#endif
|
||||
|
||||
- ras->db_priv = priv;
|
||||
+#ifdef HAVE_MCE
|
||||
+ rc = ras_mc_create_table(priv, &mce_record_tab);
|
||||
+ if (rc == SQLITE_OK)
|
||||
+ rc = ras_mc_prepare_stmt(priv, &priv->stmt_mce_record,
|
||||
+ &mce_record_tab);
|
||||
+#endif
|
||||
+
|
||||
+ ras->db_priv = priv;
|
||||
return 0;
|
||||
}
|
||||
diff --git a/ras-record.h b/ras-record.h
|
||||
index 5008906..6f146a8 100644
|
||||
--- a/ras-record.h
|
||||
+++ b/ras-record.h
|
||||
@@ -40,6 +40,10 @@ struct ras_aer_event {
|
||||
const char *msg;
|
||||
};
|
||||
|
||||
+struct ras_mc_event;
|
||||
+struct ras_aer_event;
|
||||
+struct mce_event;
|
||||
+
|
||||
#ifdef HAVE_SQLITE3
|
||||
|
||||
#include <sqlite3.h>
|
||||
@@ -50,16 +54,21 @@ struct sqlite3_priv {
|
||||
#ifdef HAVE_AER
|
||||
sqlite3_stmt *stmt_aer_event;
|
||||
#endif
|
||||
+#ifdef HAVE_MCE
|
||||
+ sqlite3_stmt *stmt_mce_record;
|
||||
+#endif
|
||||
};
|
||||
|
||||
int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras);
|
||||
int ras_store_mc_event(struct ras_events *ras, struct ras_mc_event *ev);
|
||||
int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev);
|
||||
+int ras_store_mce_record(struct ras_events *ras, struct mce_event *ev);
|
||||
|
||||
#else
|
||||
static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; };
|
||||
static inline int ras_store_mc_event(struct ras_events *ras, struct ras_mc_event *ev) { return 0; };
|
||||
static inline int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev) { return 0; };
|
||||
+static inline int ras_store_mce_record(struct ras_events *ras, struct mce_event *ev) { return 0; };
|
||||
|
||||
#endif
|
||||
|
||||
--
|
||||
1.7.1
|
||||
|
|
@ -0,0 +1,85 @@
|
|||
From 2925cc92d73065dab3bbf7de83404d6e0e141dc6 Mon Sep 17 00:00:00 2001
|
||||
From: Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||
Date: Fri, 31 May 2013 14:57:54 -0300
|
||||
Subject: [PATCH 08/32] ras-mc-ctl: add summary for MCE and PCIe AER errors
|
||||
|
||||
Report the summary also for MCE and PCIe errors.
|
||||
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||
---
|
||||
util/ras-mc-ctl.in | 50 ++++++++++++++++++++++++++++++++++++++++++++------
|
||||
1 files changed, 44 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index 5b1ca4d..118af7b 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -824,21 +824,59 @@ sub find_prog
|
||||
sub summary
|
||||
{
|
||||
require DBI;
|
||||
+ my ($query, $query_handle, $out);
|
||||
+ my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg);
|
||||
|
||||
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
|
||||
|
||||
- my $query = "select label, mc, top_layer,middle_layer,lower_layer, count(*) from mc_event group by label,mc,top_layer,middle_layer,lower_layer";
|
||||
- my $query_handle = $dbh->prepare($query);
|
||||
+ # Memory controller mc_event errors
|
||||
+ $query = "select err_type, label, mc, top_layer,middle_layer,lower_layer, count(*) from mc_event group by err_type, label, mc, top_layer, middle_layer, lower_layer";
|
||||
+ $query_handle = $dbh->prepare($query);
|
||||
$query_handle->execute();
|
||||
+ $query_handle->bind_columns(\($err_type, $label, $mc, $top, $mid, $low, $count));
|
||||
+ $out = "";
|
||||
+ while($query_handle->fetch()) {
|
||||
+ $out .= "\t$err_type on DIMM Label(s): '$label' location: $mc:$top:$mid:$low errors: $count\n";
|
||||
+ }
|
||||
+ if ($out ne "") {
|
||||
+ print "Memory controller events summary:\n$out\n";
|
||||
+ } else {
|
||||
+ print "No Memory errors.\n\n";
|
||||
+ }
|
||||
+ $query_handle->finish;
|
||||
|
||||
- $query_handle->bind_columns(\my($label, $mc, $top, $mid, $low, $count));
|
||||
-
|
||||
- print "Memory controller events summary:\n";
|
||||
+ # PCIe AER aer_event errors
|
||||
+ $query = "select err_type, err_msg, count(*) from aer_event group by err_type, err_msg";
|
||||
+ $query_handle = $dbh->prepare($query);
|
||||
+ $query_handle->execute();
|
||||
+ $query_handle->bind_columns(\($err_type, $msg, $count));
|
||||
+ $out = "";
|
||||
while($query_handle->fetch()) {
|
||||
- print "DIMM Label(s): '$label' location: $mc:$top:$mid:$low errors: $count\n";
|
||||
+ $out .= "\t$count $err_type errors: $msg\n";
|
||||
}
|
||||
+ if ($out ne "") {
|
||||
+ print "PCIe AER events summary:\n$out\n";
|
||||
+ } else {
|
||||
+ print "No PCIe AER errors.\n\n";
|
||||
+ }
|
||||
+ $query_handle->finish;
|
||||
|
||||
+ # MCE mce_record errors
|
||||
+ $query = "select error_msg, count(*) from mce_record group by error_msg";
|
||||
+ $query_handle = $dbh->prepare($query);
|
||||
+ $query_handle->execute();
|
||||
+ $query_handle->bind_columns(\($msg, $count));
|
||||
+ $out = "";
|
||||
+ while($query_handle->fetch()) {
|
||||
+ $out .= "\t$count $msg errors\n";
|
||||
+ }
|
||||
+ if ($out ne "") {
|
||||
+ print "MCE records summary:\n$out";
|
||||
+ } else {
|
||||
+ print "No MCE errors.\n";
|
||||
+ }
|
||||
$query_handle->finish;
|
||||
+
|
||||
undef($dbh);
|
||||
}
|
||||
|
||||
--
|
||||
1.7.1
|
||||
|
|
@ -0,0 +1,108 @@
|
|||
From 4b64649eb5740027f58377f6c29d1554d9792b97 Mon Sep 17 00:00:00 2001
|
||||
From: Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||
Date: Fri, 31 May 2013 16:16:44 -0300
|
||||
Subject: [PATCH 09/32] ras-mc-ctl: report errors also for PCIe AER and MCE
|
||||
|
||||
Show also PCIe AER and MCE when used with --errors parameter.
|
||||
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||
---
|
||||
util/ras-mc-ctl.in | 73 +++++++++++++++++++++++++++++++++++++++++++++++-----
|
||||
1 files changed, 66 insertions(+), 7 deletions(-)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index 118af7b..30d3078 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -883,22 +883,81 @@ sub summary
|
||||
sub errors
|
||||
{
|
||||
require DBI;
|
||||
+ my ($query, $query_handle, $id, $time, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out);
|
||||
+ my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location);
|
||||
|
||||
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
|
||||
|
||||
- my $query = "select id, timestamp, err_count, err_type, err_msg, label, mc, top_layer,middle_layer,lower_layer, address, grain, syndrome, driver_detail from mc_event order by id";
|
||||
-
|
||||
- my $query_handle = $dbh->prepare($query);
|
||||
+ # Memory controller mc_event errors
|
||||
+ $query = "select id, timestamp, err_count, err_type, err_msg, label, mc, top_layer,middle_layer,lower_layer, address, grain, syndrome, driver_detail from mc_event order by id";
|
||||
+ $query_handle = $dbh->prepare($query);
|
||||
$query_handle->execute();
|
||||
+ $query_handle->bind_columns(\($id, $time, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail));
|
||||
+ $out = "";
|
||||
+ while($query_handle->fetch()) {
|
||||
+ $out .= "$id $time $count $type error(s): $msg at $label location: $mc:$top:$mid:$low, addr $addr, grain $grain, syndrome $syndrome $detail\n";
|
||||
+ }
|
||||
+ if ($out ne "") {
|
||||
+ print "PCIe AER events:\n$out\n";
|
||||
+ } else {
|
||||
+ print "No PCIe AER errors.\n\n";
|
||||
+ }
|
||||
+ $query_handle->finish;
|
||||
|
||||
- $query_handle->bind_columns(\my($id, $time, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail));
|
||||
-
|
||||
- print "Memory controller events:\n";
|
||||
+ # PCIe AER aer_event errors
|
||||
+ $query = "select id, timestamp, err_type, err_msg from aer_event order by id";
|
||||
+ $query_handle = $dbh->prepare($query);
|
||||
+ $query_handle->execute();
|
||||
+ $query_handle->bind_columns(\($id, $time, $type, $msg));
|
||||
+ $out = "";
|
||||
while($query_handle->fetch()) {
|
||||
- print "$id $time $count $type error(s): $msg at $label location: $mc:$top:$mid:$low, addr $addr, grain $grain, syndrome $syndrome $detail\n";
|
||||
+ $out .= "$id $time $type error: $msg\n";
|
||||
}
|
||||
+ if ($out ne "") {
|
||||
+ print "MCE events:\n$out\n";
|
||||
+ } else {
|
||||
+ print "No MCE errors.\n\n";
|
||||
+ }
|
||||
+ $query_handle->finish;
|
||||
|
||||
+ # MCE mce_record errors
|
||||
+ $query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id";
|
||||
+ $query_handle = $dbh->prepare($query);
|
||||
+ $query_handle->execute();
|
||||
+ $query_handle->bind_columns(\($id, $time, $mcgcap,$mcgstatus, $status, $addr, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $msg, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location));
|
||||
+ $out = "";
|
||||
+ while($query_handle->fetch()) {
|
||||
+ $out .= "$id $time error: $msg";
|
||||
+ $out .= ", CPU $cpuvendor" if ($cpuvendor);
|
||||
+ $out .= ", bank $bank_name" if ($bank_name);
|
||||
+ $out .= ", mcg $mcgstatus_msg" if ($mcgstatus_msg);
|
||||
+ $out .= ", mci $mcistatus_msg" if ($mcistatus_msg);
|
||||
+ $out .= ", $mc_location" if ($mc_location);
|
||||
+ $out .= ", $user_action" if ($user_action);
|
||||
+ $out .= sprintf ", mcgcap=0x%08x", $mcgcap if ($mcgcap);
|
||||
+ $out .= sprintf ", mcgstatus=0x%08x", $mcgstatus if ($mcgstatus);
|
||||
+ $out .= sprintf ", status=0x%08x", $status if ($status);
|
||||
+ $out .= sprintf ", addr=0x%08x", $addr if ($addr);
|
||||
+ $out .= sprintf ", misc=0x%08x", $misc if ($misc);
|
||||
+ $out .= sprintf ", ip=0x%08x", $ip if ($ip);
|
||||
+ $out .= sprintf ", tsc=0x%08x", $tsc if ($tsc);
|
||||
+ $out .= sprintf ", walltime=0x%08x", $walltime if ($walltime);
|
||||
+ $out .= sprintf ", cpu=0x%08x", $cpu if ($cpu);
|
||||
+ $out .= sprintf ", cpuid=0x%08x", $cpuid if ($cpuid);
|
||||
+ $out .= sprintf ", apicid=0x%08x", $apicid if ($apicid);
|
||||
+ $out .= sprintf ", socketid=0x%08x", $socketid if ($socketid);
|
||||
+ $out .= sprintf ", cs=0x%08x", $cs if ($cs);
|
||||
+ $out .= sprintf ", bank=0x%08x", $bank if ($bank);
|
||||
+
|
||||
+ $out .= "\n";
|
||||
+ }
|
||||
+ if ($out ne "") {
|
||||
+ print "Memory controller events:\n$out\n";
|
||||
+ } else {
|
||||
+ print "No Memory errors.\n\n";
|
||||
+ }
|
||||
$query_handle->finish;
|
||||
+
|
||||
undef($dbh);
|
||||
}
|
||||
|
||||
--
|
||||
1.7.1
|
||||
|
|
@ -0,0 +1,53 @@
|
|||
From dc811f88b1bd5ac33faa1606c3a3ce4d3bc0b7ed Mon Sep 17 00:00:00 2001
|
||||
From: Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||
Date: Fri, 31 May 2013 16:40:40 -0300
|
||||
Subject: [PATCH 10/32] ras-mc-ctl: Fix the name of the error table data
|
||||
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||
---
|
||||
util/ras-mc-ctl.in | 12 ++++++------
|
||||
1 files changed, 6 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index 30d3078..48d9b00 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -898,9 +898,9 @@ sub errors
|
||||
$out .= "$id $time $count $type error(s): $msg at $label location: $mc:$top:$mid:$low, addr $addr, grain $grain, syndrome $syndrome $detail\n";
|
||||
}
|
||||
if ($out ne "") {
|
||||
- print "PCIe AER events:\n$out\n";
|
||||
+ print "Memory controller events:\n$out\n";
|
||||
} else {
|
||||
- print "No PCIe AER errors.\n\n";
|
||||
+ print "No Memory errors.\n\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
|
||||
@@ -914,9 +914,9 @@ sub errors
|
||||
$out .= "$id $time $type error: $msg\n";
|
||||
}
|
||||
if ($out ne "") {
|
||||
- print "MCE events:\n$out\n";
|
||||
+ print "PCIe AER events:\n$out\n";
|
||||
} else {
|
||||
- print "No MCE errors.\n\n";
|
||||
+ print "No PCIe AER errors.\n\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
|
||||
@@ -952,9 +952,9 @@ sub errors
|
||||
$out .= "\n";
|
||||
}
|
||||
if ($out ne "") {
|
||||
- print "Memory controller events:\n$out\n";
|
||||
+ print "MCE events:\n$out\n";
|
||||
} else {
|
||||
- print "No Memory errors.\n\n";
|
||||
+ print "No MCE errors.\n\n";
|
||||
}
|
||||
$query_handle->finish;
|
||||
|
||||
--
|
||||
1.7.1
|
||||
|
|
@ -0,0 +1,36 @@
|
|||
From 099af4056912faa28bf1385fffa77e7bbb468b93 Mon Sep 17 00:00:00 2001
|
||||
From: Mauro Carvalho Chehab <m.chehab@samsung.com>
|
||||
Date: Thu, 15 Aug 2013 12:43:02 -0300
|
||||
Subject: [PATCH 13/32] ras-mc-ctl: Improve parser
|
||||
|
||||
Accept either . or : as layers separator at config files.
|
||||
|
||||
Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com>
|
||||
---
|
||||
util/ras-mc-ctl.in | 4 ++--
|
||||
1 files changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index 48d9b00..f5a8ce5 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -481,14 +481,14 @@ sub parse_dimm_labels_file
|
||||
|
||||
next unless (my ($label, $info) = ($str =~ /^(.*)\s*:\s*(.*)$/i));
|
||||
|
||||
- unless ($info =~ /\d+(?:\.\d+)*/) {
|
||||
+ unless ($info =~ /\d+(?:[\.\:]\d+)*/) {
|
||||
log_error ("$file: $line: Invalid syntax, ignoring: \"$_\"\n");
|
||||
next;
|
||||
}
|
||||
|
||||
for my $target (split (/[, ]+/, $info)) {
|
||||
my $n;
|
||||
- my ($mc, $top, $mid, $low, $extra) = ($target =~ /(\d+)(?:\.(\d+)){0,1}(?:\.(\d+)){0,1}(?:\.(\d+)){0,1}(?:\.(\d+)){0,1}/);
|
||||
+ my ($mc, $top, $mid, $low, $extra) = ($target =~ /(\d+)(?:[\.\:](\d+)){0,1}(?:[\.\:](\d+)){0,1}(?:[\.\:](\d+)){0,1}(?:[\.\:](\d+)){0,1}/);
|
||||
|
||||
if (defined($extra)) {
|
||||
die ("Error: Only up to 3 layers are currently supported on label db \"$file\"\n");
|
||||
--
|
||||
1.7.1
|
||||
|
|
@ -0,0 +1,77 @@
|
|||
From 0d53728f9cbdca5a1bd32c51a121dd1162f50e95 Mon Sep 17 00:00:00 2001
|
||||
From: Mauro Carvalho Chehab <m.chehab@samsung.com>
|
||||
Date: Thu, 15 Aug 2013 12:45:18 -0300
|
||||
Subject: [PATCH 14/32] ras-mc-ctl: Fix label register with 2 layers
|
||||
|
||||
When there aren't 3 layers, label print/register weren't working.
|
||||
|
||||
Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com>
|
||||
---
|
||||
util/ras-mc-ctl.in | 19 +++++++++++++------
|
||||
1 files changed, 13 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index f5a8ce5..a7137be 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -508,7 +508,6 @@ sub parse_dimm_labels_file
|
||||
}
|
||||
map { $lh->{$vendor}{lc $_}{$mc}{$top}{$mid}{$low} = $label }
|
||||
@models;
|
||||
- $n = 3;
|
||||
}
|
||||
if (!$num) {
|
||||
$num = $n;
|
||||
@@ -542,9 +541,13 @@ sub parse_dimm_labels
|
||||
|
||||
sub read_dimm_label
|
||||
{
|
||||
- my ($mc, $top, $mid, $low) = @_;
|
||||
+ my ($num_layers, $mc, $top, $mid, $low) = @_;
|
||||
my $sysfs = "/sys/devices/system/edac/mc";
|
||||
- my $pos = "$mc:$top:$mid:$low";
|
||||
+ my $pos;
|
||||
+
|
||||
+ $pos = "$mc:$top:$mid:$low" if ($num_layers == 3);
|
||||
+ $pos = "$mc:$top:$mid" if ($num_layers == 2);
|
||||
+ $pos = "$mc:$top" if ($num_layers == 1);
|
||||
|
||||
if (!defined($dimm_node{$pos})) {
|
||||
my $label = "$pos missing";
|
||||
@@ -574,10 +577,14 @@ sub read_dimm_label
|
||||
|
||||
sub get_dimm_label_node
|
||||
{
|
||||
- my ($mc, $top, $mid, $low) = @_;
|
||||
+ my ($num_layers, $mc, $top, $mid, $low) = @_;
|
||||
my $sysfs = "/sys/devices/system/edac/mc";
|
||||
my $pos = "$mc:$top:$mid:$low";
|
||||
|
||||
+ $pos = "$mc:$top:$mid:$low" if ($num_layers == 3);
|
||||
+ $pos = "$mc:$top:$mid" if ($num_layers == 2);
|
||||
+ $pos = "$mc:$top" if ($num_layers == 1);
|
||||
+
|
||||
return "" if (!defined($dimm_node{$pos}));
|
||||
|
||||
my $dimm = $dimm_node{$pos};
|
||||
@@ -611,7 +618,7 @@ sub print_dimm_labels
|
||||
for my $mid (sort keys %{$$lref{$vendor}{$model}{$mc}{$top}}) {
|
||||
for my $low (sort keys %{$$lref{$vendor}{$model}{$mc}{$top}{$mid}}) {
|
||||
my $label = $$lref{$vendor}{$model}{$mc}{$top}{$mid}{$low};
|
||||
- my ($rlabel,$loc) = read_dimm_label ($mc, $top, $mid, $low);
|
||||
+ my ($rlabel,$loc) = read_dimm_label ($$num_layers{$vendor}{$model}, $mc, $top, $mid, $low);
|
||||
|
||||
printf $fh $format, $loc, $label, $rlabel;
|
||||
}
|
||||
@@ -645,7 +652,7 @@ sub register_dimm_labels
|
||||
for my $mid (sort keys %{$$lref{$vendor}{$model}{$mc}{$top}}) {
|
||||
for my $low (sort keys %{$$lref{$vendor}{$model}{$mc}{$top}{$mid}}) {
|
||||
|
||||
- my $file = get_dimm_label_node($mc, $top, $mid, $low);
|
||||
+ my $file = get_dimm_label_node($$num_layers{$vendor}{$model}, $mc, $top, $mid, $low);
|
||||
|
||||
# Ignore sysfs files that don't exist. Might just be
|
||||
# unpopulated bank.
|
||||
--
|
||||
1.7.1
|
||||
|
|
@ -0,0 +1,44 @@
|
|||
From 74d84ba18f4f1d7097b47ce1c2e41e332d197dfb Mon Sep 17 00:00:00 2001
|
||||
From: Mauro Carvalho Chehab <m.chehab@samsung.com>
|
||||
Date: Thu, 15 Aug 2013 12:58:02 -0300
|
||||
Subject: [PATCH 15/32] Add an example of labels file
|
||||
|
||||
This is an example of a labels file for a Dell Power Edge T620.
|
||||
|
||||
For now, only DIMMs A1 and B1 are tested here.
|
||||
|
||||
Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com>
|
||||
---
|
||||
labels/dell | 20 ++++++++++++++++++++
|
||||
1 files changed, 20 insertions(+), 0 deletions(-)
|
||||
create mode 100644 labels/dell
|
||||
|
||||
diff --git a/labels/dell b/labels/dell
|
||||
new file mode 100644
|
||||
index 0000000..e1a09a7
|
||||
--- /dev/null
|
||||
+++ b/labels/dell
|
||||
@@ -0,0 +1,20 @@
|
||||
+# RASDAEMON Motherboard DIMM labels Database file.
|
||||
+#
|
||||
+# Vendor-name and model-name are found from the program 'dmidecode'
|
||||
+# labels are found from the silk screen on the motherboard.
|
||||
+#
|
||||
+#Vendor: <vendor-name>
|
||||
+# Model: <model-name>
|
||||
+# <label>: <mc>.<top>.<mid>.<low>
|
||||
+#
|
||||
+
|
||||
+Vendor: Dell Inc.
|
||||
+
|
||||
+ Model: 0F5XM3
|
||||
+ DIMM_A1: 0.0.0; DIMM_A2: 0.0.1; DIMM_A3: 0.0.2; DIMM_A4: 0.0.3;
|
||||
+ DIMM_A5: 0.1.0; DIMM_A6: 0.1.1; DIMM_A7: 0.1.2; DIMM_A8: 0.1.3;
|
||||
+ DIMM_A9: 0.2.0; DIMM_A10: 0.2.1; DIMM_A11: 0.2.2; DIMM_A12: 0.2.3;
|
||||
+
|
||||
+ DIMM_B1: 1.0.0; DIMM_B2: 1.0.1; DIMM_B3: 1.0.2; DIMM_B4: 1.0.3;
|
||||
+ DIMM_B5: 1.1.0; DIMM_B6: 1.1.1; DIMM_B7: 1.1.2; DIMM_B8: 1.1.3;
|
||||
+ DIMM_B9: 1.2.0; DIMM_B10: 1.2.1; DIMM_B11: 1.2.2; DIMM_B12: 1.2.3;
|
||||
--
|
||||
1.7.1
|
||||
|
|
@ -0,0 +1,76 @@
|
|||
From b8bb2ed4a751516d32373e478e5c9ea9f16b524d Mon Sep 17 00:00:00 2001
|
||||
From: Mauro Carvalho Chehab <m.chehab@samsung.com>
|
||||
Date: Thu, 15 Aug 2013 17:13:43 -0300
|
||||
Subject: [PATCH 17/32] ras-mc-ctl: Fix the DIMM layout display
|
||||
|
||||
The items weren't being presented in the right order. Fix it.
|
||||
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||
---
|
||||
util/ras-mc-ctl.in | 26 ++++++++++++++++++++------
|
||||
1 files changed, 20 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index a7137be..196a643 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -673,15 +673,15 @@ sub register_dimm_labels
|
||||
return 1;
|
||||
}
|
||||
|
||||
-sub dimm_display_layer($@);
|
||||
+sub dimm_display_layer_rev($@);
|
||||
|
||||
-sub dimm_display_layer($@)
|
||||
+sub dimm_display_layer_rev($@)
|
||||
{
|
||||
my $layer = shift;
|
||||
my @pos = @_;
|
||||
|
||||
- $layer--;
|
||||
- if ($layer < 0) {
|
||||
+ $layer++;
|
||||
+ if ($layer >= scalar(@pos) - 1) {
|
||||
my $str_loc = join(':', @pos);
|
||||
my $size = $dimm_size{$str_loc};
|
||||
if (!$size) {
|
||||
@@ -695,12 +695,26 @@ sub dimm_display_layer($@)
|
||||
my $s;
|
||||
for (my $i = 0; $i <= $max_pos[$layer]; $i++) {
|
||||
$pos[$layer] = $i;
|
||||
- $s .= dimm_display_layer($layer, @pos);
|
||||
+ $s .= dimm_display_layer_rev($layer, @pos);
|
||||
}
|
||||
|
||||
return $s;
|
||||
}
|
||||
|
||||
+sub dimm_display_layer(@)
|
||||
+{
|
||||
+ my @pos = @_;
|
||||
+
|
||||
+ my $s;
|
||||
+ for (my $i = 0; $i <= $max_pos[0]; $i++) {
|
||||
+ $pos[0] = $i;
|
||||
+ $s .= dimm_display_layer_rev(0, @pos);
|
||||
+ }
|
||||
+
|
||||
+ return $s;
|
||||
+}
|
||||
+
|
||||
+
|
||||
sub dimm_display_layer_header($$)
|
||||
{
|
||||
my $n_items = 1;
|
||||
@@ -753,7 +767,7 @@ sub dimm_display_mem()
|
||||
my $p1 = length($s) - 1;
|
||||
|
||||
$pos[scalar(@pos) - 1] = $d;
|
||||
- $s .= dimm_display_layer(scalar(@pos) - 1, @pos);
|
||||
+ $s .= dimm_display_layer(@pos);
|
||||
$len += length($s);
|
||||
|
||||
$sep = "-" x $p1;
|
||||
--
|
||||
1.7.1
|
||||
|
|
@ -0,0 +1,29 @@
|
|||
From 2afbcd81173822014d6d73e98e9093a140bb1421 Mon Sep 17 00:00:00 2001
|
||||
From: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Date: Fri, 6 Dec 2013 09:45:14 -0500
|
||||
Subject: [PATCH 19/32] ras-mc-ctl: remove completely use of modprobe
|
||||
|
||||
While verifying SELinux policies, this popped up. ras-mc-ctl inherited a
|
||||
modprobe lookup that ends up never being used. This patch gets rid of
|
||||
it.
|
||||
|
||||
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
|
||||
---
|
||||
util/ras-mc-ctl.in | 1 -
|
||||
1 files changed, 0 insertions(+), 1 deletions(-)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index 196a643..ef0d9bc 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -39,7 +39,6 @@ my $dbname = "@RASSTATEDIR@/@RAS_DB_FNAME@";
|
||||
my $prefix = "@prefix@";
|
||||
my $sysconfdir = "@sysconfdir@";
|
||||
my $dmidecode = find_prog ("dmidecode");
|
||||
-my $modprobe = find_prog ("modprobe") or exit (1);
|
||||
|
||||
my %conf = ();
|
||||
my %bus = ();
|
||||
--
|
||||
1.7.1
|
||||
|
|
@ -0,0 +1,42 @@
|
|||
From 78465e5047b226011c1a4c916c79c63fb6e68f71 Mon Sep 17 00:00:00 2001
|
||||
From: Mauro Carvalho Chehab <m.chehab@samsung.com>
|
||||
Date: Fri, 14 Feb 2014 05:11:26 +0900
|
||||
Subject: [PATCH 22/32] mce-amd-k8.c: fix a warning
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
mce-amd-k8.c: In function ‘bank_name’:
|
||||
mce-amd-k8.c:250:22: warning: argument to ‘sizeof’ in ‘snprintf’ call is the same expression as the destination; did you mean to provide an explicit length? [-Wsizeof-pointer-memaccess]
|
||||
snprintf(buf, sizeof(buf), "%s (bank=%d)", s, e->bank);
|
||||
^
|
||||
|
||||
Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com>
|
||||
---
|
||||
mce-amd-k8.c | 3 +--
|
||||
1 files changed, 1 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/mce-amd-k8.c b/mce-amd-k8.c
|
||||
index 5e21b55..8179f74 100644
|
||||
--- a/mce-amd-k8.c
|
||||
+++ b/mce-amd-k8.c
|
||||
@@ -236,7 +236,6 @@ static void decode_k8_threashold(struct mce_event *e)
|
||||
|
||||
static void bank_name(struct mce_event *e)
|
||||
{
|
||||
- char *buf = e->bank_name;
|
||||
const char *s;
|
||||
|
||||
if (e->bank < ARRAY_SIZE(k8bank))
|
||||
@@ -247,7 +246,7 @@ static void bank_name(struct mce_event *e)
|
||||
else
|
||||
return; /* Use the generic parser for bank */
|
||||
|
||||
- snprintf(buf, sizeof(buf), "%s (bank=%d)", s, e->bank);
|
||||
+ mce_snprintf(e->bank_name, "%s (bank=%d)", s, e->bank);
|
||||
}
|
||||
|
||||
int parse_amd_k8_event(struct ras_events *ras, struct mce_event *e)
|
||||
--
|
||||
1.7.1
|
||||
|
|
@ -0,0 +1,641 @@
|
|||
From c6ed1e1af9356cdce1eaa652061dd6e4eb32d283 Mon Sep 17 00:00:00 2001
|
||||
From: Junliang Li <lijunliang.dna@gmail.com>
|
||||
Date: Thu, 13 Feb 2014 10:39:53 +0800
|
||||
Subject: [PATCH 23/32] add abrt support for rasdaemon
|
||||
|
||||
Adds abrt as another error mechanism for the rasdaemon.
|
||||
This patch does:
|
||||
|
||||
1) read ras event (mc,mce and aer)
|
||||
|
||||
2) set up an abrt-server Unix socket
|
||||
|
||||
3) write messages following the ABRT server protocol, putting the event
|
||||
info into backtrace zone.
|
||||
|
||||
4) commit report.
|
||||
|
||||
For now, it depends on ABRT to limit flood reports.
|
||||
|
||||
Signed-off-by: Junliang Li <lijunliang.dna@gmail.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com>
|
||||
---
|
||||
Makefile.am | 5 +-
|
||||
configure.ac | 9 +
|
||||
ras-aer-handler.c | 6 +
|
||||
ras-events.h | 3 +
|
||||
ras-mc-handler.c | 7 +
|
||||
ras-mce-handler.c | 6 +
|
||||
ras-report.c | 429 +++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
ras-report.h | 39 +++++
|
||||
8 files changed, 503 insertions(+), 1 deletions(-)
|
||||
create mode 100644 ras-report.c
|
||||
create mode 100644 ras-report.h
|
||||
|
||||
diff --git a/Makefile.am b/Makefile.am
|
||||
index 473ce98..c1668b4 100644
|
||||
--- a/Makefile.am
|
||||
+++ b/Makefile.am
|
||||
@@ -17,10 +17,13 @@ if WITH_MCE
|
||||
mce-intel-dunnington.c mce-intel-tulsa.c \
|
||||
mce-intel-sb.c mce-intel-ivb.c
|
||||
endif
|
||||
+if WITH_ABRT_REPORT
|
||||
+ rasdaemon_SOURCES += ras-report.c
|
||||
+endif
|
||||
rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a
|
||||
|
||||
include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \
|
||||
- ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h
|
||||
+ ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h
|
||||
|
||||
# This rule can't be called with more than one Makefile job (like make -j8)
|
||||
# I can't figure out a way to fix that
|
||||
diff --git a/configure.ac b/configure.ac
|
||||
index 4fe6ef2..0ea962e 100644
|
||||
--- a/configure.ac
|
||||
+++ b/configure.ac
|
||||
@@ -53,6 +53,15 @@ AS_IF([test "x$enable_mce" = "xyes"], [
|
||||
])
|
||||
AM_CONDITIONAL([WITH_MCE], [test x$enable_mce = xyes])
|
||||
|
||||
+AC_ARG_ENABLE([abrt_report],
|
||||
+ AS_HELP_STRING([--enable-abrt-report], [enable report event to ABRT (currently experimental)]))
|
||||
+
|
||||
+AS_IF([test "x$enable_abrt_report" = "xyes"], [
|
||||
+ AC_DEFINE(HAVE_ABRT_REPORT,1,"have report event to ABRT")
|
||||
+ AC_SUBST([WITH_ABRT_REPORT])
|
||||
+])
|
||||
+AM_CONDITIONAL([WITH_ABRT_REPORT], [test x$enable_abrt_report = xyes])
|
||||
+
|
||||
test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc
|
||||
|
||||
CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes"
|
||||
diff --git a/ras-aer-handler.c b/ras-aer-handler.c
|
||||
index e5abaca..50526af 100644
|
||||
--- a/ras-aer-handler.c
|
||||
+++ b/ras-aer-handler.c
|
||||
@@ -24,6 +24,7 @@
|
||||
#include "ras-record.h"
|
||||
#include "ras-logger.h"
|
||||
#include "bitfield.h"
|
||||
+#include "ras-report.h"
|
||||
|
||||
static const char *aer_errors[32] = {
|
||||
/* Correctable errors */
|
||||
@@ -115,5 +116,10 @@ int ras_aer_event_handler(struct trace_seq *s,
|
||||
ras_store_aer_event(ras, &ev);
|
||||
#endif
|
||||
|
||||
+#ifdef HAVE_ABRT_REPORT
|
||||
+ /* Report event to ABRT */
|
||||
+ ras_report_aer_event(ras, &ev);
|
||||
+#endif
|
||||
+
|
||||
return 0;
|
||||
}
|
||||
diff --git a/ras-events.h b/ras-events.h
|
||||
index 554a95e..64e045a 100644
|
||||
--- a/ras-events.h
|
||||
+++ b/ras-events.h
|
||||
@@ -47,6 +47,9 @@ struct ras_events {
|
||||
|
||||
/* For the mce handler */
|
||||
struct mce_priv *mce_priv;
|
||||
+
|
||||
+ /* For ABRT socket*/
|
||||
+ int socketfd;
|
||||
};
|
||||
|
||||
struct pthread_data {
|
||||
diff --git a/ras-mc-handler.c b/ras-mc-handler.c
|
||||
index 5c24f65..ffb3805 100644
|
||||
--- a/ras-mc-handler.c
|
||||
+++ b/ras-mc-handler.c
|
||||
@@ -23,6 +23,7 @@
|
||||
#include "ras-mc-handler.h"
|
||||
#include "ras-record.h"
|
||||
#include "ras-logger.h"
|
||||
+#include "ras-report.h"
|
||||
|
||||
int ras_mc_event_handler(struct trace_seq *s,
|
||||
struct pevent_record *record,
|
||||
@@ -189,6 +190,12 @@ int ras_mc_event_handler(struct trace_seq *s,
|
||||
/* Insert data into the SGBD */
|
||||
|
||||
ras_store_mc_event(ras, &ev);
|
||||
+
|
||||
+#ifdef HAVE_ABRT_REPORT
|
||||
+ /* Report event to ABRT */
|
||||
+ ras_report_mc_event(ras, &ev);
|
||||
+#endif
|
||||
+
|
||||
return 0;
|
||||
|
||||
parse_error:
|
||||
diff --git a/ras-mce-handler.c b/ras-mce-handler.c
|
||||
index 59e8d05..1431049 100644
|
||||
--- a/ras-mce-handler.c
|
||||
+++ b/ras-mce-handler.c
|
||||
@@ -26,6 +26,7 @@
|
||||
#include "ras-mce-handler.h"
|
||||
#include "ras-record.h"
|
||||
#include "ras-logger.h"
|
||||
+#include "ras-report.h"
|
||||
|
||||
/*
|
||||
* The code below were adapted from Andi Kleen/Intel/SuSe mcelog code,
|
||||
@@ -401,5 +402,10 @@ int ras_mce_event_handler(struct trace_seq *s,
|
||||
ras_store_mce_record(ras, &e);
|
||||
#endif
|
||||
|
||||
+#ifdef HAVE_ABRT_REPORT
|
||||
+ /* Report event to ABRT */
|
||||
+ ras_report_mce_event(ras, &e);
|
||||
+#endif
|
||||
+
|
||||
return 0;
|
||||
}
|
||||
diff --git a/ras-report.c b/ras-report.c
|
||||
new file mode 100644
|
||||
index 0000000..d3e4a79
|
||||
--- /dev/null
|
||||
+++ b/ras-report.c
|
||||
@@ -0,0 +1,429 @@
|
||||
+#include <stdio.h>
|
||||
+#include <string.h>
|
||||
+#include <unistd.h>
|
||||
+#include <sys/types.h>
|
||||
+#include <sys/utsname.h>
|
||||
+#include <sys/socket.h>
|
||||
+#include <sys/un.h>
|
||||
+
|
||||
+#include "ras-report.h"
|
||||
+
|
||||
+static int setup_report_socket(void){
|
||||
+ int sockfd = -1;
|
||||
+ int rc = -1;
|
||||
+ struct sockaddr_un addr;
|
||||
+
|
||||
+ sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
|
||||
+ if (sockfd < 0){
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ memset(&addr, 0, sizeof(struct sockaddr_un));
|
||||
+ addr.sun_family = AF_UNIX;
|
||||
+ strncpy(addr.sun_path, ABRT_SOCKET, strlen(ABRT_SOCKET));
|
||||
+
|
||||
+ rc = connect(sockfd, (struct sockaddr *)&addr, sizeof(struct sockaddr_un));
|
||||
+ if (rc < 0){
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ return sockfd;
|
||||
+}
|
||||
+
|
||||
+static int commit_report_basic(int sockfd){
|
||||
+ char buf[INPUT_BUFFER_SIZE];
|
||||
+ struct utsname un;
|
||||
+ int rc = -1;
|
||||
+
|
||||
+ if(sockfd < 0){
|
||||
+ return rc;
|
||||
+ }
|
||||
+
|
||||
+ memset(buf, 0, INPUT_BUFFER_SIZE);
|
||||
+ memset(&un, 0, sizeof(struct utsname));
|
||||
+
|
||||
+ rc = uname(&un);
|
||||
+ if(rc < 0){
|
||||
+ return rc;
|
||||
+ }
|
||||
+
|
||||
+ /*
|
||||
+ * ABRT server protocol
|
||||
+ */
|
||||
+ sprintf(buf, "PUT / HTTP/1.1\r\n\r\n");
|
||||
+ rc = write(sockfd, buf, strlen(buf) + 1);
|
||||
+ if(rc < strlen(buf) + 1){
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ sprintf(buf, "PID=%d", (int)getpid());
|
||||
+ rc = write(sockfd, buf, strlen(buf) + 1);
|
||||
+ if(rc < strlen(buf) + 1){
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ sprintf(buf, "EXECUTABLE=/boot/vmlinuz-%s", un.release);
|
||||
+ rc = write(sockfd, buf, strlen(buf) + 1);
|
||||
+ if(rc < strlen(buf) + 1){
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ sprintf(buf, "BASENAME=%s", "rasdaemon");
|
||||
+ rc = write(sockfd, buf, strlen(buf) + 1);
|
||||
+ if(rc < strlen(buf) + 1){
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
+ * add "DONE" string to finish message.
|
||||
+ */
|
||||
+static int commit_report_done(int sockfd){
|
||||
+ int rc = -1;
|
||||
+
|
||||
+ if(sockfd < 0){
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ rc = write(sockfd, "DONE\0", strlen("DONE\0"));
|
||||
+ if(rc < strlen("DONE\0")){
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static int set_mc_event_backtrace(char *buf, struct ras_mc_event *ev){
|
||||
+ char bt_buf[MAX_BACKTRACE_SIZE];
|
||||
+
|
||||
+ if(!buf || !ev)
|
||||
+ return -1;
|
||||
+
|
||||
+ sprintf(bt_buf, "BACKTRACE= " \
|
||||
+ "timestamp=%s\n" \
|
||||
+ "error_count=%d\n" \
|
||||
+ "error_type=%s\n" \
|
||||
+ "msg=%s\n" \
|
||||
+ "label=%s\n" \
|
||||
+ "mc_index=%c\n" \
|
||||
+ "top_layer=%c\n" \
|
||||
+ "middle_layer=%c\n" \
|
||||
+ "lower_layer=%c\n" \
|
||||
+ "address=%llu\n" \
|
||||
+ "grain=%llu\n" \
|
||||
+ "syndrome=%llu\n" \
|
||||
+ "driver_detail=%s\n", \
|
||||
+ ev->timestamp, \
|
||||
+ ev->error_count, \
|
||||
+ ev->error_type, \
|
||||
+ ev->msg, \
|
||||
+ ev->label, \
|
||||
+ ev->mc_index, \
|
||||
+ ev->top_layer, \
|
||||
+ ev->middle_layer, \
|
||||
+ ev->lower_layer, \
|
||||
+ ev->address, \
|
||||
+ ev->grain, \
|
||||
+ ev->syndrome, \
|
||||
+ ev->driver_detail);
|
||||
+
|
||||
+ strcat(buf, bt_buf);
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static int set_mce_event_backtrace(char *buf, struct mce_event *ev){
|
||||
+ char bt_buf[MAX_BACKTRACE_SIZE];
|
||||
+
|
||||
+ if(!buf || !ev)
|
||||
+ return -1;
|
||||
+
|
||||
+ sprintf(bt_buf, "BACKTRACE=" \
|
||||
+ "timestamp=%s\n" \
|
||||
+ "bank_name=%s\n" \
|
||||
+ "error_msg=%s\n" \
|
||||
+ "mcgstatus_msg=%s\n" \
|
||||
+ "mcistatus_msg=%s\n" \
|
||||
+ "mcastatus_msg=%s\n" \
|
||||
+ "user_action=%s\n" \
|
||||
+ "mc_location=%s\n" \
|
||||
+ "mcgcap=%lu\n" \
|
||||
+ "mcgstatus=%lu\n" \
|
||||
+ "status=%lu\n" \
|
||||
+ "addr=%lu\n" \
|
||||
+ "misc=%lu\n" \
|
||||
+ "ip=%lu\n" \
|
||||
+ "tsc=%lu\n" \
|
||||
+ "walltime=%lu\n" \
|
||||
+ "cpu=%u\n" \
|
||||
+ "cpuid=%u\n" \
|
||||
+ "apicid=%u\n" \
|
||||
+ "socketid=%u\n" \
|
||||
+ "cs=%d\n" \
|
||||
+ "bank=%d\n" \
|
||||
+ "cpuvendor=%d\n", \
|
||||
+ ev->timestamp, \
|
||||
+ ev->bank_name, \
|
||||
+ ev->error_msg, \
|
||||
+ ev->mcgstatus_msg, \
|
||||
+ ev->mcistatus_msg, \
|
||||
+ ev->mcastatus_msg, \
|
||||
+ ev->user_action, \
|
||||
+ ev->mc_location, \
|
||||
+ ev->mcgcap, \
|
||||
+ ev->mcgstatus, \
|
||||
+ ev->status, \
|
||||
+ ev->addr, \
|
||||
+ ev->misc, \
|
||||
+ ev->ip, \
|
||||
+ ev->tsc, \
|
||||
+ ev->walltime, \
|
||||
+ ev->cpu, \
|
||||
+ ev->cpuid, \
|
||||
+ ev->apicid, \
|
||||
+ ev->socketid, \
|
||||
+ ev->cs, \
|
||||
+ ev->bank, \
|
||||
+ ev->cpuvendor);
|
||||
+
|
||||
+ strcat(buf, bt_buf);
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static int set_aer_event_backtrace(char *buf, struct ras_aer_event *ev){
|
||||
+ char bt_buf[MAX_BACKTRACE_SIZE];
|
||||
+
|
||||
+ if(!buf || !ev)
|
||||
+ return -1;
|
||||
+
|
||||
+ sprintf(bt_buf, "BACKTRACE=" \
|
||||
+ "timestamp=%s\n" \
|
||||
+ "error_type=%s\n" \
|
||||
+ "dev_name=%s\n" \
|
||||
+ "msg=%s\n", \
|
||||
+ ev->timestamp, \
|
||||
+ ev->error_type, \
|
||||
+ ev->dev_name, \
|
||||
+ ev->msg);
|
||||
+
|
||||
+ strcat(buf, bt_buf);
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static int commit_report_backtrace(int sockfd, int type, void *ev){
|
||||
+ char buf[MAX_BACKTRACE_SIZE];
|
||||
+ char *pbuf = buf;
|
||||
+ int rc = -1;
|
||||
+ int buf_len = 0;
|
||||
+
|
||||
+ if(sockfd < 0 || !ev){
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ memset(buf, 0, MAX_BACKTRACE_SIZE);
|
||||
+
|
||||
+ switch(type){
|
||||
+ case MC_EVENT:
|
||||
+ rc = set_mc_event_backtrace(buf, (struct ras_mc_event *)ev);
|
||||
+ break;
|
||||
+ case AER_EVENT:
|
||||
+ rc = set_aer_event_backtrace(buf, (struct ras_aer_event *)ev);
|
||||
+ break;
|
||||
+ case MCE_EVENT:
|
||||
+ rc = set_mce_event_backtrace(buf, (struct mce_event *)ev);
|
||||
+ break;
|
||||
+ default:
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ if(rc < 0){
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ buf_len = strlen(buf);
|
||||
+
|
||||
+ for(;buf_len > INPUT_BUFFER_SIZE - 1; buf_len -= (INPUT_BUFFER_SIZE - 1)){
|
||||
+ rc = write(sockfd, pbuf, INPUT_BUFFER_SIZE - 1);
|
||||
+ if(rc < INPUT_BUFFER_SIZE - 1){
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ pbuf = pbuf + INPUT_BUFFER_SIZE - 1;
|
||||
+ }
|
||||
+
|
||||
+ rc = write(sockfd, pbuf, buf_len + 1);
|
||||
+ if(rc < buf_len){
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev){
|
||||
+ char buf[MAX_MESSAGE_SIZE];
|
||||
+ int sockfd = -1;
|
||||
+ int done = 0;
|
||||
+ int rc = -1;
|
||||
+
|
||||
+ memset(buf, 0, sizeof(buf));
|
||||
+
|
||||
+ sockfd = setup_report_socket();
|
||||
+ if(sockfd < 0){
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ rc = commit_report_basic(sockfd);
|
||||
+ if(rc < 0){
|
||||
+ goto mc_fail;
|
||||
+ }
|
||||
+
|
||||
+ rc = commit_report_backtrace(sockfd, MC_EVENT, ev);
|
||||
+ if(rc < 0){
|
||||
+ goto mc_fail;
|
||||
+ }
|
||||
+
|
||||
+ sprintf(buf, "ANALYZER=%s", "rasdaemon-mc");
|
||||
+ rc = write(sockfd, buf, strlen(buf) + 1);
|
||||
+ if(rc < strlen(buf) + 1){
|
||||
+ goto mc_fail;
|
||||
+ }
|
||||
+
|
||||
+ sprintf(buf, "REASON=%s", "EDAC driver report problem");
|
||||
+ rc = write(sockfd, buf, strlen(buf) + 1);
|
||||
+ if(rc < strlen(buf) + 1){
|
||||
+ goto mc_fail;
|
||||
+ }
|
||||
+
|
||||
+ rc = commit_report_done(sockfd);
|
||||
+ if(rc < 0){
|
||||
+ goto mc_fail;
|
||||
+ }
|
||||
+
|
||||
+ done = 1;
|
||||
+
|
||||
+mc_fail:
|
||||
+
|
||||
+ if(sockfd > 0){
|
||||
+ close(sockfd);
|
||||
+ }
|
||||
+
|
||||
+ if(done){
|
||||
+ return 0;
|
||||
+ }else{
|
||||
+ return -1;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev){
|
||||
+ char buf[MAX_MESSAGE_SIZE];
|
||||
+ int sockfd = 0;
|
||||
+ int done = 0;
|
||||
+ int rc = -1;
|
||||
+
|
||||
+ memset(buf, 0, sizeof(buf));
|
||||
+
|
||||
+ sockfd = setup_report_socket();
|
||||
+ if(sockfd < 0){
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ rc = commit_report_basic(sockfd);
|
||||
+ if(rc < 0){
|
||||
+ goto aer_fail;
|
||||
+ }
|
||||
+
|
||||
+ rc = commit_report_backtrace(sockfd, AER_EVENT, ev);
|
||||
+ if(rc < 0){
|
||||
+ goto aer_fail;
|
||||
+ }
|
||||
+
|
||||
+ sprintf(buf, "ANALYZER=%s", "rasdaemon-aer");
|
||||
+ rc = write(sockfd, buf, strlen(buf) + 1);
|
||||
+ if(rc < strlen(buf) + 1){
|
||||
+ goto aer_fail;
|
||||
+ }
|
||||
+
|
||||
+ sprintf(buf, "REASON=%s", "PCIe AER driver report problem");
|
||||
+ rc = write(sockfd, buf, strlen(buf) + 1);
|
||||
+ if(rc < strlen(buf) + 1){
|
||||
+ goto aer_fail;
|
||||
+ }
|
||||
+
|
||||
+ rc = commit_report_done(sockfd);
|
||||
+ if(rc < 0){
|
||||
+ goto aer_fail;
|
||||
+ }
|
||||
+
|
||||
+ done = 1;
|
||||
+
|
||||
+aer_fail:
|
||||
+
|
||||
+ if(sockfd > 0){
|
||||
+ close(sockfd);
|
||||
+ }
|
||||
+
|
||||
+ if(done){
|
||||
+ return 0;
|
||||
+ }else{
|
||||
+ return -1;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev){
|
||||
+ char buf[MAX_MESSAGE_SIZE];
|
||||
+ int sockfd = 0;
|
||||
+ int done = 0;
|
||||
+ int rc = -1;
|
||||
+
|
||||
+ memset(buf, 0, sizeof(buf));
|
||||
+
|
||||
+ sockfd = setup_report_socket();
|
||||
+ if(sockfd < 0){
|
||||
+ return -1;
|
||||
+ }
|
||||
+
|
||||
+ rc = commit_report_basic(sockfd);
|
||||
+ if(rc < 0){
|
||||
+ goto mce_fail;
|
||||
+ }
|
||||
+
|
||||
+ rc = commit_report_backtrace(sockfd, MCE_EVENT, ev);
|
||||
+ if(rc < 0){
|
||||
+ goto mce_fail;
|
||||
+ }
|
||||
+
|
||||
+ sprintf(buf, "ANALYZER=%s", "rasdaemon-mce");
|
||||
+ rc = write(sockfd, buf, strlen(buf) + 1);
|
||||
+ if(rc < strlen(buf) + 1){
|
||||
+ goto mce_fail;
|
||||
+ }
|
||||
+
|
||||
+ sprintf(buf, "REASON=%s", "Machine Check driver report problem");
|
||||
+ rc = write(sockfd, buf, strlen(buf) + 1);
|
||||
+ if(rc < strlen(buf) + 1){
|
||||
+ goto mce_fail;
|
||||
+ }
|
||||
+
|
||||
+ rc = commit_report_done(sockfd);
|
||||
+ if(rc < 0){
|
||||
+ goto mce_fail;
|
||||
+ }
|
||||
+
|
||||
+ done = 1;
|
||||
+
|
||||
+mce_fail:
|
||||
+
|
||||
+ if(sockfd > 0){
|
||||
+ close(sockfd);
|
||||
+ }
|
||||
+
|
||||
+ if(done){
|
||||
+ return 0;
|
||||
+ }else{
|
||||
+ return -1;
|
||||
+ }
|
||||
+}
|
||||
diff --git a/ras-report.h b/ras-report.h
|
||||
new file mode 100644
|
||||
index 0000000..7920cdf
|
||||
--- /dev/null
|
||||
+++ b/ras-report.h
|
||||
@@ -0,0 +1,39 @@
|
||||
+#ifndef __RAS_REPORT_H
|
||||
+#define __RAS_REPORT_H
|
||||
+
|
||||
+#include "ras-record.h"
|
||||
+#include "ras-events.h"
|
||||
+#include "ras-mc-handler.h"
|
||||
+#include "ras-mce-handler.h"
|
||||
+#include "ras-aer-handler.h"
|
||||
+
|
||||
+/* Maximal length of backtrace. */
|
||||
+#define MAX_BACKTRACE_SIZE (1024*1024)
|
||||
+/* Amount of data received from one client for a message before reporting error. */
|
||||
+#define MAX_MESSAGE_SIZE (4*MAX_BACKTRACE_SIZE)
|
||||
+/* Maximal number of characters read from socket at once. */
|
||||
+#define INPUT_BUFFER_SIZE (8*1024)
|
||||
+/* ABRT socket file */
|
||||
+#define ABRT_SOCKET "/var/run/abrt/abrt.socket"
|
||||
+
|
||||
+enum {
|
||||
+ MC_EVENT,
|
||||
+ MCE_EVENT,
|
||||
+ AER_EVENT
|
||||
+};
|
||||
+
|
||||
+#ifdef HAVE_ABRT_REPORT
|
||||
+
|
||||
+int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev);
|
||||
+int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev);
|
||||
+int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev);
|
||||
+
|
||||
+#else
|
||||
+
|
||||
+static inline int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev) { return 0; };
|
||||
+static inline int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev) { return 0; };
|
||||
+static inline int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev) { return 0; };
|
||||
+
|
||||
+#endif
|
||||
+
|
||||
+#endif
|
||||
--
|
||||
1.7.1
|
||||
|
|
@ -0,0 +1,50 @@
|
|||
From d1b81490639f2608ecaf8fa50c24ac78c053fc2b Mon Sep 17 00:00:00 2001
|
||||
From: Betty Dall <betty.dall@hp.com>
|
||||
Date: Wed, 19 Mar 2014 14:59:47 -0600
|
||||
Subject: [PATCH 26/32] rasdaemon: Add record option to rasdaemon man page
|
||||
|
||||
Add the already existing rasdaemon option 'record' to the rasdaemon man
|
||||
page. This option records events via sqlite3.
|
||||
|
||||
Signed-off-by: Betty Dall <betty.dall@hp.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com>
|
||||
---
|
||||
man/rasdaemon.1.in | 14 +++++++++++---
|
||||
1 files changed, 11 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/man/rasdaemon.1.in b/man/rasdaemon.1.in
|
||||
index 5349fa3..7a8b60f 100644
|
||||
--- a/man/rasdaemon.1.in
|
||||
+++ b/man/rasdaemon.1.in
|
||||
@@ -29,8 +29,10 @@ rasdaemon \- RAS daemon to log the RAS events.
|
||||
|
||||
.SH DESCRIPTION
|
||||
|
||||
-The \fBrasdaemon\fR program is a daemon with monitors the RAS trace events
|
||||
-from /sys/kernel/debug/tracing, reporting them via syslog/journald.
|
||||
+The \fBrasdaemon\fR program is a daemon which monitors the platform
|
||||
+Reliability, Availability and Serviceability (RAS) reports from the
|
||||
+Linux kernel trace events. These trace events are logged in
|
||||
+/sys/kernel/debug/tracing, reporting them via syslog/journald.
|
||||
|
||||
.SH OPTIONS
|
||||
.TP
|
||||
@@ -51,8 +53,14 @@ Executes in foreground, printing the events at console. Useful for testing it,
|
||||
and to be used by systemd or Unix System V respan.
|
||||
If not specified, the program runs in daemon mode.
|
||||
.TP
|
||||
+.BI "--record"
|
||||
+Record RAS events via Sqlite3. The Sqlite3 database has the benefit of
|
||||
+keeping a persistent record of the RAS events. This feature is used with
|
||||
+the ras-mc-ctl utility. Note that rasdaemon may be compiled without this
|
||||
+feature.
|
||||
+.TP
|
||||
.BI "--version"
|
||||
-Prints the program version and exit.
|
||||
+Print the program version and exit.
|
||||
|
||||
.SH SEE ALSO
|
||||
\fBras-mc-ctl\fR(8)
|
||||
--
|
||||
1.7.1
|
||||
|
|
@ -0,0 +1,45 @@
|
|||
From caa44c3946ddc900896830297c28b90ce5b9034b Mon Sep 17 00:00:00 2001
|
||||
From: Betty Dall <betty.dall@hp.com>
|
||||
Date: Wed, 19 Mar 2014 15:54:56 -0600
|
||||
Subject: [PATCH 27/32] ras-mc-ctl: Print useful message when run without rasdaemon -r
|
||||
|
||||
The utility script ras-mc-ctl requires that rasdaemon --record be run
|
||||
to create the mc_event table in the SQLite database. The current behaviour
|
||||
is this:
|
||||
[root@sa1 util]# ras-mc-ctl --errors
|
||||
DBD::SQLite::db prepare failed: no such table: mc_event at
|
||||
/usr/local/sbin/ras-mc-ctl line 914.
|
||||
Can't call method "execute" on an undefined value at
|
||||
/usr/local/sbin/ras-mc-ctl line 915.
|
||||
|
||||
With this change, the user sees:
|
||||
[root@sa1 util]# ras-mc-ctl --errors
|
||||
DBD::SQLite::db prepare failed: no such table: mc_event at
|
||||
/usr/local/sbin/ras-mc-ctl line 914.
|
||||
ras-mc-ctl: Error: mc_event table missing from
|
||||
/usr/local/var/lib/rasdaemon/ras-mc_event.db. Run 'rasdaemon --record'.
|
||||
|
||||
Signed-off-by: Betty Dall <betty.dall@hp.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com>
|
||||
---
|
||||
util/ras-mc-ctl.in | 4 ++++
|
||||
1 files changed, 4 insertions(+), 0 deletions(-)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index 196a643..e9f9c59 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -912,6 +912,10 @@ sub errors
|
||||
# Memory controller mc_event errors
|
||||
$query = "select id, timestamp, err_count, err_type, err_msg, label, mc, top_layer,middle_layer,lower_layer, address, grain, syndrome, driver_detail from mc_event order by id";
|
||||
$query_handle = $dbh->prepare($query);
|
||||
+ if (!$query_handle) {
|
||||
+ log_error ("mc_event table missing from $dbname. Run 'rasdaemon --record'.\n");
|
||||
+ exit -1
|
||||
+ }
|
||||
$query_handle->execute();
|
||||
$query_handle->bind_columns(\($id, $time, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail));
|
||||
$out = "";
|
||||
--
|
||||
1.7.1
|
||||
|
|
@ -0,0 +1,114 @@
|
|||
From 4bfa45f56e1500f1cfc8de3fd8d1228b11011e95 Mon Sep 17 00:00:00 2001
|
||||
From: Jakub Filak <jfilak@redhat.com>
|
||||
Date: Fri, 21 Feb 2014 15:54:09 +0100
|
||||
Subject: [PATCH 28/32] Make paths in the systemd services configurable
|
||||
|
||||
The path to a binary depends on configuration, therefore it is better to
|
||||
not use hard coded strings.
|
||||
|
||||
Signed-off-by: Jakub Filak <jfilak@redhat.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com>
|
||||
---
|
||||
Makefile.am | 15 ++++++++++++++-
|
||||
misc/ras-mc-ctl.service | 10 ----------
|
||||
misc/ras-mc-ctl.service.in | 10 ++++++++++
|
||||
misc/rasdaemon.service | 10 ----------
|
||||
misc/rasdaemon.service.in | 10 ++++++++++
|
||||
5 files changed, 34 insertions(+), 21 deletions(-)
|
||||
delete mode 100644 misc/ras-mc-ctl.service
|
||||
create mode 100644 misc/ras-mc-ctl.service.in
|
||||
delete mode 100644 misc/rasdaemon.service
|
||||
create mode 100644 misc/rasdaemon.service.in
|
||||
|
||||
diff --git a/Makefile.am b/Makefile.am
|
||||
index c1668b4..0fa615f 100644
|
||||
--- a/Makefile.am
|
||||
+++ b/Makefile.am
|
||||
@@ -1,6 +1,19 @@
|
||||
ACLOCAL_AMFLAGS=-I m4
|
||||
SUBDIRS = libtrace util man
|
||||
-EXTRA_DIST = misc/rasdaemon.service misc/ras-mc-ctl.service
|
||||
+SYSTEMD_SERVICES_IN = misc/rasdaemon.service.in misc/ras-mc-ctl.service.in
|
||||
+SYSTEMD_SERVICES = $(SYSTEMD_SERVICES_IN:.service.in=.service)
|
||||
+EXTRA_DIST = $(SYSTEMD_SERVICES_IN)
|
||||
+
|
||||
+# This rule is needed because \@sbindir\@ is expanded to \${exec_prefix\}/sbin
|
||||
+# during ./configure phase, therefore it is not possible to add .service.in
|
||||
+# files to AC_CONFIG_FILES in configure.ac
|
||||
+SUFFIXES = .service.in .service
|
||||
+.service.in.service:
|
||||
+ sed -e s,\@sbindir\@,$(sbindir),g $< > $@
|
||||
+
|
||||
+# This rule is needed because the service files must be generated on target
|
||||
+# system after ./configure phase
|
||||
+all-local: $(SYSTEMD_SERVICES)
|
||||
|
||||
sbin_PROGRAMS = rasdaemon
|
||||
rasdaemon_SOURCES = rasdaemon.c ras-events.c ras-mc-handler.c \
|
||||
diff --git a/misc/ras-mc-ctl.service b/misc/ras-mc-ctl.service
|
||||
deleted file mode 100644
|
||||
index 8a09508..0000000
|
||||
--- a/misc/ras-mc-ctl.service
|
||||
+++ /dev/null
|
||||
@@ -1,10 +0,0 @@
|
||||
-[Unit]
|
||||
-Description=Initialize EDAC v3.0.0 Drivers For Machine Hardware
|
||||
-
|
||||
-[Service]
|
||||
-Type=oneshot
|
||||
-ExecStart=/usr/sbin/ras-mc-ctl --register-labels
|
||||
-RemainAfterExit=yes
|
||||
-
|
||||
-[Install]
|
||||
-WantedBy=multi-user.target
|
||||
diff --git a/misc/ras-mc-ctl.service.in b/misc/ras-mc-ctl.service.in
|
||||
new file mode 100644
|
||||
index 0000000..8cb3651
|
||||
--- /dev/null
|
||||
+++ b/misc/ras-mc-ctl.service.in
|
||||
@@ -0,0 +1,10 @@
|
||||
+[Unit]
|
||||
+Description=Initialize EDAC v3.0.0 Drivers For Machine Hardware
|
||||
+
|
||||
+[Service]
|
||||
+Type=oneshot
|
||||
+ExecStart=@sbindir@/ras-mc-ctl --register-labels
|
||||
+RemainAfterExit=yes
|
||||
+
|
||||
+[Install]
|
||||
+WantedBy=multi-user.target
|
||||
diff --git a/misc/rasdaemon.service b/misc/rasdaemon.service
|
||||
deleted file mode 100644
|
||||
index 36cdef5..0000000
|
||||
--- a/misc/rasdaemon.service
|
||||
+++ /dev/null
|
||||
@@ -1,10 +0,0 @@
|
||||
-[Unit]
|
||||
-Description=RAS daemon to log the RAS events
|
||||
-After=syslog.target
|
||||
-
|
||||
-[Service]
|
||||
-ExecStart=/usr/local/sbin/rasdaemon -f
|
||||
-Restart=on-abort
|
||||
-
|
||||
-[Install]
|
||||
-WantedBy=multi-user.target
|
||||
diff --git a/misc/rasdaemon.service.in b/misc/rasdaemon.service.in
|
||||
new file mode 100644
|
||||
index 0000000..5e1f375
|
||||
--- /dev/null
|
||||
+++ b/misc/rasdaemon.service.in
|
||||
@@ -0,0 +1,10 @@
|
||||
+[Unit]
|
||||
+Description=RAS daemon to log the RAS events
|
||||
+After=syslog.target
|
||||
+
|
||||
+[Service]
|
||||
+ExecStart=@sbindir@/rasdaemon -f
|
||||
+Restart=on-abort
|
||||
+
|
||||
+[Install]
|
||||
+WantedBy=multi-user.target
|
||||
--
|
||||
1.7.1
|
||||
|
|
@ -0,0 +1,118 @@
|
|||
From d7453479e96693ebb5e17b285adf915b67095aad Mon Sep 17 00:00:00 2001
|
||||
From: Jakub Filak <jfilak@redhat.com>
|
||||
Date: Wed, 2 Apr 2014 15:03:44 +0200
|
||||
Subject: [PATCH 31/32] Correct ABRT report data
|
||||
|
||||
Remove '\0' byte from 'PUT' message because this was superfluous.
|
||||
|
||||
Replaced 'BASENAME' item with 'TYPE' item because the first one is no
|
||||
longer supported by abrtd and the second one is required. Basically the
|
||||
latter is a substitute for the first one.
|
||||
|
||||
Removed the closing message which is not supported by abrtd. abrtd
|
||||
considers that message as a part of the problem report.
|
||||
|
||||
Removed a superfluous space from 'Backtrace'.
|
||||
|
||||
Signed-off-by: Jakub Filak <jfilak@redhat.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com>
|
||||
---
|
||||
ras-report.c | 41 ++++-------------------------------------
|
||||
1 files changed, 4 insertions(+), 37 deletions(-)
|
||||
|
||||
diff --git a/ras-report.c b/ras-report.c
|
||||
index d3e4a79..0a05732 100644
|
||||
--- a/ras-report.c
|
||||
+++ b/ras-report.c
|
||||
@@ -51,8 +51,8 @@ static int commit_report_basic(int sockfd){
|
||||
* ABRT server protocol
|
||||
*/
|
||||
sprintf(buf, "PUT / HTTP/1.1\r\n\r\n");
|
||||
- rc = write(sockfd, buf, strlen(buf) + 1);
|
||||
- if(rc < strlen(buf) + 1){
|
||||
+ rc = write(sockfd, buf, strlen(buf));
|
||||
+ if(rc < strlen(buf)){
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -68,7 +68,7 @@ static int commit_report_basic(int sockfd){
|
||||
return -1;
|
||||
}
|
||||
|
||||
- sprintf(buf, "BASENAME=%s", "rasdaemon");
|
||||
+ sprintf(buf, "TYPE=%s", "ras");
|
||||
rc = write(sockfd, buf, strlen(buf) + 1);
|
||||
if(rc < strlen(buf) + 1){
|
||||
return -1;
|
||||
@@ -77,31 +77,13 @@ static int commit_report_basic(int sockfd){
|
||||
return 0;
|
||||
}
|
||||
|
||||
-/*
|
||||
- * add "DONE" string to finish message.
|
||||
- */
|
||||
-static int commit_report_done(int sockfd){
|
||||
- int rc = -1;
|
||||
-
|
||||
- if(sockfd < 0){
|
||||
- return -1;
|
||||
- }
|
||||
-
|
||||
- rc = write(sockfd, "DONE\0", strlen("DONE\0"));
|
||||
- if(rc < strlen("DONE\0")){
|
||||
- return -1;
|
||||
- }
|
||||
-
|
||||
- return 0;
|
||||
-}
|
||||
-
|
||||
static int set_mc_event_backtrace(char *buf, struct ras_mc_event *ev){
|
||||
char bt_buf[MAX_BACKTRACE_SIZE];
|
||||
|
||||
if(!buf || !ev)
|
||||
return -1;
|
||||
|
||||
- sprintf(bt_buf, "BACKTRACE= " \
|
||||
+ sprintf(bt_buf, "BACKTRACE=" \
|
||||
"timestamp=%s\n" \
|
||||
"error_count=%d\n" \
|
||||
"error_type=%s\n" \
|
||||
@@ -298,11 +280,6 @@ int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev){
|
||||
goto mc_fail;
|
||||
}
|
||||
|
||||
- rc = commit_report_done(sockfd);
|
||||
- if(rc < 0){
|
||||
- goto mc_fail;
|
||||
- }
|
||||
-
|
||||
done = 1;
|
||||
|
||||
mc_fail:
|
||||
@@ -353,11 +330,6 @@ int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev){
|
||||
goto aer_fail;
|
||||
}
|
||||
|
||||
- rc = commit_report_done(sockfd);
|
||||
- if(rc < 0){
|
||||
- goto aer_fail;
|
||||
- }
|
||||
-
|
||||
done = 1;
|
||||
|
||||
aer_fail:
|
||||
@@ -408,11 +380,6 @@ int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev){
|
||||
goto mce_fail;
|
||||
}
|
||||
|
||||
- rc = commit_report_done(sockfd);
|
||||
- if(rc < 0){
|
||||
- goto mce_fail;
|
||||
- }
|
||||
-
|
||||
done = 1;
|
||||
|
||||
mce_fail:
|
||||
--
|
||||
1.7.1
|
||||
|
|
@ -0,0 +1,43 @@
|
|||
From 59f6c44864f914a189cb924dd8fea14cc314bf3f Mon Sep 17 00:00:00 2001
|
||||
From: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Date: Mon, 23 Jun 2014 15:43:41 -0400
|
||||
Subject: [PATCH 1/2] rasdaemon: handle failures of snprintf()
|
||||
|
||||
Florian Weimer found that in bitfield_msg() the return value of
|
||||
snprintf() is used to calculate length ignoring that it can return a
|
||||
negative number. This patch makes bitfield_msg() stop writing in such a
|
||||
case.
|
||||
|
||||
Reference: https://bugzilla.redhat.com/show_bug.cgi?id=1035741
|
||||
|
||||
Reported-by: Florian Weimer <fweimer@redhat.com>
|
||||
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
|
||||
---
|
||||
bitfield.c | 4 ++++
|
||||
1 files changed, 4 insertions(+), 0 deletions(-)
|
||||
|
||||
diff --git a/bitfield.c b/bitfield.c
|
||||
index b2895b4..1690f15 100644
|
||||
--- a/bitfield.c
|
||||
+++ b/bitfield.c
|
||||
@@ -41,6 +41,8 @@ unsigned bitfield_msg(char *buf, size_t len, const char **bitarray,
|
||||
if (status & (1 << (i + bit_offset))) {
|
||||
if (p != buf) {
|
||||
n = snprintf(p, len, ", ");
|
||||
+ if (n < 0)
|
||||
+ break;
|
||||
len -= n;
|
||||
p += n;
|
||||
}
|
||||
@@ -48,6 +50,8 @@ unsigned bitfield_msg(char *buf, size_t len, const char **bitarray,
|
||||
n = snprintf(p, len, "BIT%d", i + bit_offset);
|
||||
else
|
||||
n = snprintf(p, len, "%s", bitarray[i]);
|
||||
+ if (n < 0)
|
||||
+ break;
|
||||
len -= n;
|
||||
p += n;
|
||||
}
|
||||
--
|
||||
1.7.1
|
||||
|
|
@ -0,0 +1,46 @@
|
|||
From 5ba31285710e85c7d3688e536cd54180321964e4 Mon Sep 17 00:00:00 2001
|
||||
From: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Date: Mon, 23 Jun 2014 16:31:50 -0400
|
||||
Subject: [PATCH 2/2] rasdaemon: correct range while parsing top, middle and lower layers
|
||||
|
||||
{top,middle,lower}_layer are signed char, therefore will never be 255.
|
||||
|
||||
Reference: https://bugzilla.redhat.com/show_bug.cgi?id=1035746
|
||||
|
||||
Reported-by: Florian Weimer <fweimer@redhat.com>
|
||||
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
|
||||
---
|
||||
ras-mc-handler.c | 14 +++-----------
|
||||
1 file changed, 3 insertions(+), 11 deletions(-)
|
||||
|
||||
--- upstream.orig/ras-mc-handler.c 2014-06-26 16:09:30.000000000 -0400
|
||||
+++ upstream/ras-mc-handler.c 2014-06-26 16:09:32.000000000 -0400
|
||||
@@ -120,25 +120,17 @@ if (pevent_get_field_val(s, event, "mc_
|
||||
if (pevent_get_field_val(s, event, "top_layer", record, &val, 1) < 0)
|
||||
goto parse_error;
|
||||
parsed_fields++;
|
||||
+ ev.top_layer = (signed char) val;
|
||||
|
||||
- ev.top_layer = (int) val;
|
||||
if (pevent_get_field_val(s, event, "middle_layer", record, &val, 1) < 0)
|
||||
goto parse_error;
|
||||
parsed_fields++;
|
||||
+ ev.middle_layer = (signed char) val;
|
||||
|
||||
- ev.middle_layer = (int) val;
|
||||
if (pevent_get_field_val(s, event, "lower_layer", record, &val, 1) < 0)
|
||||
goto parse_error;
|
||||
parsed_fields++;
|
||||
-
|
||||
- ev.lower_layer = (int) val;
|
||||
-
|
||||
- if (ev.top_layer == 255)
|
||||
- ev.top_layer = -1;
|
||||
- if (ev.middle_layer == 255)
|
||||
- ev.middle_layer = -1;
|
||||
- if (ev.lower_layer == 255)
|
||||
- ev.lower_layer = -1;
|
||||
+ ev.lower_layer = (signed char) val;
|
||||
|
||||
if (ev.top_layer >= 0 || ev.middle_layer >= 0 || ev.lower_layer >= 0) {
|
||||
if (ev.lower_layer >= 0)
|
|
@ -0,0 +1,17 @@
|
|||
---
|
||||
misc/rasdaemon.service.in | 4 +++-
|
||||
1 file changed, 3 insertions(+), 1 deletion(-)
|
||||
|
||||
--- upstream.orig/misc/rasdaemon.service.in 2014-06-04 14:25:13.000000000 -0400
|
||||
+++ upstream/misc/rasdaemon.service.in 2014-07-08 14:37:26.421395520 -0400
|
||||
@@ -3,7 +3,9 @@ Description=RAS daemon to log the RAS ev
|
||||
After=syslog.target
|
||||
|
||||
[Service]
|
||||
-ExecStart=@sbindir@/rasdaemon -f
|
||||
+ExecStart=@sbindir@/rasdaemon -f -r
|
||||
+ExecStartPost=@sbindir@/rasdaemon --enable
|
||||
+ExecStop=@sbindir@/rasdaemon --disable
|
||||
Restart=on-abort
|
||||
|
||||
[Install]
|
|
@ -0,0 +1,807 @@
|
|||
commit 38d48ed48f9d0baa20786d98abe2b4085fca7d5d
|
||||
Author: Luck, Tony <tony.luck@intel.com>
|
||||
Date: Mon Aug 4 13:29:01 2014 -0700
|
||||
|
||||
rasdaemon: Add support for extlog trace events
|
||||
|
||||
Linux kernel 3.17 includes a new trace event to pick up extended
|
||||
error logs produced by BIOS in the Common Platform Error Record
|
||||
format described in appendix N of the UEFI standard. This patch
|
||||
adds support to collect that information and log it both in
|
||||
readable ASCII and into the sqlite3 database that rasdaemon
|
||||
uses to store all error information. In addition ras-mc-ctl
|
||||
is updated to query that database for both detailed and summary
|
||||
reports.
|
||||
|
||||
Big thanks to Aristeu for pretty much all the sqlite3 pieces,
|
||||
plus testing and fixing miscellaneous issues elsewhere.
|
||||
|
||||
Signed-off-by: Tony Luck <tony.luck@intel.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com>
|
||||
|
||||
diff --git a/Makefile.am b/Makefile.am
|
||||
index 0fa615f..117c970 100644
|
||||
--- a/Makefile.am
|
||||
+++ b/Makefile.am
|
||||
@@ -30,13 +30,17 @@ if WITH_MCE
|
||||
mce-intel-dunnington.c mce-intel-tulsa.c \
|
||||
mce-intel-sb.c mce-intel-ivb.c
|
||||
endif
|
||||
+if WITH_EXTLOG
|
||||
+ rasdaemon_SOURCES += ras-extlog-handler.c
|
||||
+endif
|
||||
if WITH_ABRT_REPORT
|
||||
rasdaemon_SOURCES += ras-report.c
|
||||
endif
|
||||
rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a
|
||||
|
||||
include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \
|
||||
- ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h
|
||||
+ ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \
|
||||
+ ras-extlog-handler.h
|
||||
|
||||
# This rule can't be called with more than one Makefile job (like make -j8)
|
||||
# I can't figure out a way to fix that
|
||||
diff --git a/configure.ac b/configure.ac
|
||||
index 64a5b13..9495491 100644
|
||||
--- a/configure.ac
|
||||
+++ b/configure.ac
|
||||
@@ -53,6 +53,15 @@ AS_IF([test "x$enable_mce" = "xyes"], [
|
||||
])
|
||||
AM_CONDITIONAL([WITH_MCE], [test x$enable_mce = xyes])
|
||||
|
||||
+AC_ARG_ENABLE([extlog],
|
||||
+ AS_HELP_STRING([--enable-extlog], [enable EXTLOG events (currently experimental)]))
|
||||
+
|
||||
+AS_IF([test "x$enable_extlog" = "xyes"], [
|
||||
+ AC_DEFINE(HAVE_EXTLOG,1,"have EXTLOG events collect")
|
||||
+ AC_SUBST([WITH_EXTLOG])
|
||||
+])
|
||||
+AM_CONDITIONAL([WITH_EXTLOG], [test x$enable_extlog = xyes])
|
||||
+
|
||||
AC_ARG_ENABLE([abrt_report],
|
||||
AS_HELP_STRING([--enable-abrt-report], [enable report event to ABRT (currently experimental)]))
|
||||
|
||||
diff --git a/ras-aer-handler.c b/ras-aer-handler.c
|
||||
index 50526af..bb7c0b9 100644
|
||||
--- a/ras-aer-handler.c
|
||||
+++ b/ras-aer-handler.c
|
||||
@@ -70,7 +70,7 @@ int ras_aer_event_handler(struct trace_seq *s,
|
||||
*/
|
||||
|
||||
if (ras->use_uptime)
|
||||
- now = record->ts/1000000000L + ras->uptime_diff;
|
||||
+ now = record->ts/user_hz + ras->uptime_diff;
|
||||
else
|
||||
now = time(NULL);
|
||||
|
||||
diff --git a/ras-events.c b/ras-events.c
|
||||
index ecbbd3a..0be7c3f 100644
|
||||
--- a/ras-events.c
|
||||
+++ b/ras-events.c
|
||||
@@ -30,6 +30,7 @@
|
||||
#include "ras-mc-handler.h"
|
||||
#include "ras-aer-handler.h"
|
||||
#include "ras-mce-handler.h"
|
||||
+#include "ras-extlog-handler.h"
|
||||
#include "ras-record.h"
|
||||
#include "ras-logger.h"
|
||||
|
||||
@@ -203,6 +204,10 @@ int toggle_ras_mc_event(int enable)
|
||||
rc |= __toggle_ras_mc_event(ras, "mce", "mce_record", enable);
|
||||
#endif
|
||||
|
||||
+#ifdef HAVE_EXTLOG
|
||||
+ rc |= __toggle_ras_mc_event(ras, "ras", "extlog_mem_event", enable);
|
||||
+#endif
|
||||
+
|
||||
free_ras:
|
||||
free(ras);
|
||||
return rc;
|
||||
@@ -688,6 +693,19 @@ int handle_ras_events(int record_events)
|
||||
"mce", "mce_record");
|
||||
}
|
||||
#endif
|
||||
+
|
||||
+#ifdef HAVE_EXTLOG
|
||||
+ rc = add_event_handler(ras, pevent, page_size, "ras", "extlog_mem_event",
|
||||
+ ras_extlog_mem_event_handler);
|
||||
+ if (!rc) {
|
||||
+ /* tell kernel we are listening, so don't printk to console */
|
||||
+ (void)open("/sys/kernel/debug/ras/daemon_active", 0);
|
||||
+ num_events++;
|
||||
+ } else
|
||||
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
|
||||
+ "ras", "aer_event");
|
||||
+#endif
|
||||
+
|
||||
if (!num_events) {
|
||||
log(ALL, LOG_INFO,
|
||||
"Failed to trace all supported RAS events. Aborting.\n");
|
||||
diff --git a/ras-extlog-handler.c b/ras-extlog-handler.c
|
||||
new file mode 100644
|
||||
index 0000000..5fd3580
|
||||
--- /dev/null
|
||||
+++ b/ras-extlog-handler.c
|
||||
@@ -0,0 +1,246 @@
|
||||
+/*
|
||||
+ * Copyright (C) 2014 Tony Luck <tony.luck@intel.com>
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or modify
|
||||
+ * it under the terms of the GNU General Public License as published by
|
||||
+ * the Free Software Foundation; either version 2 of the License, or
|
||||
+ * (at your option) any later version.
|
||||
+ *
|
||||
+ * This program is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
+ * GNU General Public License for more details.
|
||||
+ *
|
||||
+ * You should have received a copy of the GNU General Public License
|
||||
+ * along with this program; if not, write to the Free Software
|
||||
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
+*/
|
||||
+#include <ctype.h>
|
||||
+#include <errno.h>
|
||||
+#include <stdio.h>
|
||||
+#include <stdlib.h>
|
||||
+#include <string.h>
|
||||
+#include <unistd.h>
|
||||
+#include <stdint.h>
|
||||
+#include "libtrace/kbuffer.h"
|
||||
+#include "ras-extlog-handler.h"
|
||||
+#include "ras-record.h"
|
||||
+#include "ras-logger.h"
|
||||
+#include "ras-report.h"
|
||||
+
|
||||
/*
 * Map a memory error type code to its display name.
 *
 * Codes 0..15 have fixed names; every other value, negative ones
 * included, is reported as "unknown-type".
 */
static char *err_type(int etype)
{
	static char *const type_names[] = {
		"unknown",
		"no error",
		"single-bit ECC",
		"multi-bit ECC",
		"single-symbol chipkill ECC",
		"multi-symbol chipkill ECC",
		"master abort",
		"target abort",
		"parity error",
		"watchdog timeout",
		"invalid address",
		"mirror Broken",
		"memory sparing",
		"scrub corrected error",
		"scrub uncorrected error",
		"physical memory map-out event",
	};
	const int known = (int)(sizeof(type_names) / sizeof(type_names[0]));

	if (etype < 0 || etype >= known)
		return "unknown-type";

	return type_names[etype];
}
|
||||
+
|
||||
/*
 * Map an error severity code (0..3) to its display name.
 * Any other value is reported as "unknown-severity".
 */
static char *err_severity(int severity)
{
	static char *const severity_names[] = {
		"recoverable",
		"fatal",
		"corrected",
		"informational",
	};
	const int known = (int)(sizeof(severity_names) / sizeof(severity_names[0]));

	if (severity >= 0 && severity < known)
		return severity_names[severity];

	return "unknown-severity";
}
|
||||
+
|
||||
/*
 * Build the physical address mask for an error whose lowest valid
 * address bit is @lsb: all bits from @lsb upwards are set, the bits
 * below it are cleared.
 *
 * Only 0..63 are usable shift counts. Everything else is treated as
 * "no mask information" and yields an all-ones mask. That covers the
 * 0xff marker, and also the -1 that 0xff turns into when the caller
 * keeps the value in a signed 8-bit field; shifting by a negative or
 * too-large count would be undefined behavior.
 */
static unsigned long long err_mask(int lsb)
{
	if (lsb < 0 || lsb >= 64)
		return ~0ull;

	return ~((1ull << lsb) - 1);
}
|
||||
+
|
||||
/* Flags in cper_mem_err_compact.validation_bits: which members are valid */
#define CPER_MEM_VALID_NODE			0x0008
#define CPER_MEM_VALID_CARD			0x0010
#define CPER_MEM_VALID_MODULE			0x0020
#define CPER_MEM_VALID_BANK			0x0040
#define CPER_MEM_VALID_DEVICE			0x0080
#define CPER_MEM_VALID_ROW			0x0100
#define CPER_MEM_VALID_COLUMN			0x0200
#define CPER_MEM_VALID_BIT_POSITION		0x0400
#define CPER_MEM_VALID_REQUESTOR_ID		0x0800
#define CPER_MEM_VALID_RESPONDER_ID		0x1000
#define CPER_MEM_VALID_TARGET_ID		0x2000
#define CPER_MEM_VALID_RANK_NUMBER		0x8000
#define CPER_MEM_VALID_CARD_HANDLE		0x10000
#define CPER_MEM_VALID_MODULE_HANDLE		0x20000

/*
 * Layout used to interpret the raw "data" field of an extlog memory
 * event. A member is only meaningful when its CPER_MEM_VALID_* flag is
 * set in validation_bits.
 */
struct cper_mem_err_compact {
	unsigned long long	validation_bits;
	unsigned short		node;
	unsigned short		card;
	unsigned short		module;
	unsigned short		bank;
	unsigned short		device;
	unsigned short		row;
	unsigned short		column;
	unsigned short		bit_pos;
	unsigned long long	requestor_id;
	unsigned long long	responder_id;
	unsigned long long	target_id;
	unsigned short		rank;
	unsigned short		mem_array_handle;
	unsigned short		mem_dev_handle;
};

/*
 * Format the valid location members of a compact memory error record
 * as " (name: value name: value ...)".
 *
 * @c: raw record bytes, interpreted as struct cper_mem_err_compact;
 *     may be NULL.
 *
 * Returns "" when @c is NULL or when none of the members printed here
 * is flagged valid (validation_bits may carry other flags only).
 * Otherwise returns a pointer to a static buffer that is overwritten
 * by the next call.
 */
static char *err_cper_data(const char *c)
{
	const struct cper_mem_err_compact *cpd =
		(const struct cper_mem_err_compact *)c;
	static char buf[256];
	size_t len;

/* Append one "name: value " item if its flag is set and room is left */
#define CPER_ITEM(flag, ...)						\
	do {								\
		if ((cpd->validation_bits & (flag)) && len < sizeof(buf)) { \
			int w_ = snprintf(buf + len, sizeof(buf) - len,	\
					  __VA_ARGS__);			\
			if (w_ > 0)					\
				len += (size_t)w_;			\
		}							\
	} while (0)

	if (!cpd || cpd->validation_bits == 0)
		return "";

	len = (size_t)snprintf(buf, sizeof(buf), " (");

	CPER_ITEM(CPER_MEM_VALID_NODE, "node: %d ", cpd->node);
	CPER_ITEM(CPER_MEM_VALID_CARD, "card: %d ", cpd->card);
	CPER_ITEM(CPER_MEM_VALID_MODULE, "module: %d ", cpd->module);
	CPER_ITEM(CPER_MEM_VALID_BANK, "bank: %d ", cpd->bank);
	CPER_ITEM(CPER_MEM_VALID_DEVICE, "device: %d ", cpd->device);
	CPER_ITEM(CPER_MEM_VALID_ROW, "row: %d ", cpd->row);
	CPER_ITEM(CPER_MEM_VALID_COLUMN, "column: %d ", cpd->column);
	CPER_ITEM(CPER_MEM_VALID_BIT_POSITION, "bit_pos: %d ", cpd->bit_pos);
	CPER_ITEM(CPER_MEM_VALID_REQUESTOR_ID, "req_id: 0x%llx ", cpd->requestor_id);
	CPER_ITEM(CPER_MEM_VALID_RESPONDER_ID, "resp_id: 0x%llx ", cpd->responder_id);
	CPER_ITEM(CPER_MEM_VALID_TARGET_ID, "tgt_id: 0x%llx ", cpd->target_id);
	CPER_ITEM(CPER_MEM_VALID_RANK_NUMBER, "rank: %d ", cpd->rank);
	CPER_ITEM(CPER_MEM_VALID_CARD_HANDLE, "card_handle: %d ", cpd->mem_array_handle);
	CPER_ITEM(CPER_MEM_VALID_MODULE_HANDLE, "module_handle: %d ", cpd->mem_dev_handle);

#undef CPER_ITEM

	/* Nothing followed the " (" prefix: do not emit an empty " )" */
	if (len == 2)
		return "";

	/* snprintf reports the untruncated length; clamp to what is stored */
	if (len >= sizeof(buf))
		len = sizeof(buf) - 1;

	/* Turn the separator left by the last item into the closing paren */
	buf[len - 1] = ')';

	return buf;
}
|
||||
+
|
||||
/*
 * Format 16 raw UUID bytes as "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx".
 *
 * The first three groups are printed byte-swapped (see the le[] table
 * below); the remaining bytes are printed in storage order.
 *
 * @uu: pointer to 16 bytes; may be NULL, in which case "" is returned.
 *
 * Returns a pointer to a static buffer that is overwritten by the
 * next call.
 */
static char *uuid_le(const char *uu)
{
	static char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")];
	static const unsigned char le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15};
	char *p = uuid;
	int i;

	if (!uu)
		return "";

	for (i = 0; i < 16; i++) {
		/*
		 * Go through unsigned char: where plain char is signed, a
		 * byte >= 0x80 would be sign-extended and printed as eight
		 * hex digits, running past the end of uuid[].
		 */
		p += snprintf(p, sizeof(uuid) - (size_t)(p - uuid), "%.2x",
			      (unsigned char)uu[le[i]]);
		switch (i) {
		case 3:
		case 5:
		case 7:
		case 9:
			*p++ = '-';
			break;
		default:
			break;
		}
	}

	*p = 0;

	return uuid;
}
|
||||
+
|
||||
+
|
||||
+static void report_extlog_mem_event(struct ras_events *ras,
|
||||
+ struct pevent_record *record,
|
||||
+ struct trace_seq *s,
|
||||
+ struct ras_extlog_event *ev)
|
||||
+{
|
||||
+ trace_seq_printf(s, "%d %s error: %s physical addr: 0x%llx mask: 0x%llx%s %s %s",
|
||||
+ ev->error_seq, err_severity(ev->severity),
|
||||
+ err_type(ev->etype), ev->address,
|
||||
+ err_mask(ev->pa_mask_lsb),
|
||||
+ err_cper_data(ev->cper_data),
|
||||
+ ev->fru_text,
|
||||
+ uuid_le(ev->fru_id));
|
||||
+}
|
||||
+
|
||||
+int ras_extlog_mem_event_handler(struct trace_seq *s,
|
||||
+ struct pevent_record *record,
|
||||
+ struct event_format *event, void *context)
|
||||
+{
|
||||
+ int len;
|
||||
+ unsigned long long val;
|
||||
+ struct ras_events *ras = context;
|
||||
+ time_t now;
|
||||
+ struct tm *tm;
|
||||
+ struct ras_extlog_event ev;
|
||||
+
|
||||
+ /*
|
||||
+ * Newer kernels (3.10-rc1 or upper) provide an uptime clock.
|
||||
+ * On previous kernels, the way to properly generate an event would
|
||||
+ * be to inject a fake one, measure its timestamp and diff it against
|
||||
+ * gettimeofday. We won't do it here. Instead, let's use uptime,
|
||||
+ * falling-back to the event report's time, if "uptime" clock is
|
||||
+ * not available (legacy kernels).
|
||||
+ */
|
||||
+
|
||||
+ if (ras->use_uptime)
|
||||
+ now = record->ts/user_hz + ras->uptime_diff;
|
||||
+ else
|
||||
+ now = time(NULL);
|
||||
+
|
||||
+ tm = localtime(&now);
|
||||
+ if (tm)
|
||||
+ strftime(ev.timestamp, sizeof(ev.timestamp),
|
||||
+ "%Y-%m-%d %H:%M:%S %z", tm);
|
||||
+ trace_seq_printf(s, "%s ", ev.timestamp);
|
||||
+
|
||||
+ if (pevent_get_field_val(s, event, "etype", record, &val, 1) < 0)
|
||||
+ return -1;
|
||||
+ ev.etype = val;
|
||||
+ if (pevent_get_field_val(s, event, "err_seq", record, &val, 1) < 0)
|
||||
+ return -1;
|
||||
+ ev.error_seq = val;
|
||||
+ if (pevent_get_field_val(s, event, "sev", record, &val, 1) < 0)
|
||||
+ return -1;
|
||||
+ ev.severity = val;
|
||||
+ if (pevent_get_field_val(s, event, "pa", record, &val, 1) < 0)
|
||||
+ return -1;
|
||||
+ ev.address = val;
|
||||
+ if (pevent_get_field_val(s, event, "pa_mask_lsb", record, &val, 1) < 0)
|
||||
+ return -1;
|
||||
+ ev.pa_mask_lsb = val;
|
||||
+
|
||||
+ ev.cper_data = pevent_get_field_raw(s, event, "data",
|
||||
+ record, &len, 1);
|
||||
+ ev.cper_data_length = len;
|
||||
+ ev.fru_text = pevent_get_field_raw(s, event, "fru_text",
|
||||
+ record, &len, 1);
|
||||
+ ev.fru_id = pevent_get_field_raw(s, event, "fru_id",
|
||||
+ record, &len, 1);
|
||||
+
|
||||
+ report_extlog_mem_event(ras, record, s, &ev);
|
||||
+
|
||||
+ ras_store_extlog_mem_record(ras, &ev);
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
diff --git a/ras-extlog-handler.h b/ras-extlog-handler.h
|
||||
new file mode 100644
|
||||
index 0000000..54e8cec
|
||||
--- /dev/null
|
||||
+++ b/ras-extlog-handler.h
|
||||
@@ -0,0 +1,31 @@
|
||||
+/*
|
||||
+ * Copyright (C) 2014 Tony Luck <tony.luck@intel.com>
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or modify
|
||||
+ * it under the terms of the GNU General Public License as published by
|
||||
+ * the Free Software Foundation; either version 2 of the License, or
|
||||
+ * (at your option) any later version.
|
||||
+ *
|
||||
+ * This program is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
+ * GNU General Public License for more details.
|
||||
+ *
|
||||
+ * You should have received a copy of the GNU General Public License
|
||||
+ * along with this program; if not, write to the Free Software
|
||||
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
+*/
|
||||
+
|
||||
+#ifndef __RAS_EXTLOG_HANDLER_H
|
||||
+#define __RAS_EXTLOG_HANDLER_H
|
||||
+
|
||||
+#include <stdint.h>
|
||||
+
|
||||
+#include "ras-events.h"
|
||||
+#include "libtrace/event-parse.h"
|
||||
+
|
||||
+extern int ras_extlog_mem_event_handler(struct trace_seq *s,
|
||||
+ struct pevent_record *record,
|
||||
+ struct event_format *event, void *context);
|
||||
+
|
||||
+#endif
|
||||
diff --git a/ras-mc-handler.c b/ras-mc-handler.c
|
||||
index ffb3805..704a41c 100644
|
||||
--- a/ras-mc-handler.c
|
||||
+++ b/ras-mc-handler.c
|
||||
@@ -47,7 +47,7 @@ int ras_mc_event_handler(struct trace_seq *s,
|
||||
*/
|
||||
|
||||
if (ras->use_uptime)
|
||||
- now = record->ts/1000000000L + ras->uptime_diff;
|
||||
+ now = record->ts/user_hz + ras->uptime_diff;
|
||||
else
|
||||
now = time(NULL);
|
||||
|
||||
diff --git a/ras-mce-handler.c b/ras-mce-handler.c
|
||||
index 1431049..a1d0b5d 100644
|
||||
--- a/ras-mce-handler.c
|
||||
+++ b/ras-mce-handler.c
|
||||
@@ -237,7 +237,7 @@ static void report_mce_event(struct ras_events *ras,
|
||||
*/
|
||||
|
||||
if (ras->use_uptime)
|
||||
- now = record->ts/1000000000L + ras->uptime_diff;
|
||||
+ now = record->ts/user_hz + ras->uptime_diff;
|
||||
else
|
||||
now = time(NULL);
|
||||
|
||||
diff --git a/ras-record.c b/ras-record.c
|
||||
index e5150ad..3dc4493 100644
|
||||
--- a/ras-record.c
|
||||
+++ b/ras-record.c
|
||||
@@ -157,6 +157,57 @@ int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev)
|
||||
}
|
||||
#endif
|
||||
|
||||
+#ifdef HAVE_EXTLOG
|
||||
+static const struct db_fields extlog_event_fields[] = {
|
||||
+ { .name="id", .type="INTEGER PRIMARY KEY" },
|
||||
+ { .name="timestamp", .type="TEXT" },
|
||||
+ { .name="etype", .type="INTEGER" },
|
||||
+ { .name="error_count", .type="INTEGER" },
|
||||
+ { .name="severity", .type="INTEGER" },
|
||||
+ { .name="address", .type="INTEGER" },
|
||||
+ { .name="fru_id", .type="BLOB" },
|
||||
+ { .name="fru_text", .type="TEXT" },
|
||||
+ { .name="cper_data", .type="BLOB" },
|
||||
+};
|
||||
+
|
||||
+static const struct db_table_descriptor extlog_event_tab = {
|
||||
+ .name = "extlog_event",
|
||||
+ .fields = extlog_event_fields,
|
||||
+ .num_fields = ARRAY_SIZE(extlog_event_fields),
|
||||
+};
|
||||
+
|
||||
+int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev)
|
||||
+{
|
||||
+ int rc;
|
||||
+ struct sqlite3_priv *priv = ras->db_priv;
|
||||
+
|
||||
+ if (!priv || !priv->stmt_extlog_record)
|
||||
+ return 0;
|
||||
+ log(TERM, LOG_INFO, "extlog_record store: %p\n", priv->stmt_extlog_record);
|
||||
+
|
||||
+ sqlite3_bind_text (priv->stmt_extlog_record, 1, ev->timestamp, -1, NULL);
|
||||
+ sqlite3_bind_int (priv->stmt_extlog_record, 2, ev->etype);
|
||||
+ sqlite3_bind_int (priv->stmt_extlog_record, 3, ev->error_seq);
|
||||
+ sqlite3_bind_int (priv->stmt_extlog_record, 4, ev->severity);
|
||||
+ sqlite3_bind_int64 (priv->stmt_extlog_record, 5, ev->address);
|
||||
+ sqlite3_bind_blob (priv->stmt_extlog_record, 6, ev->fru_id, 16, NULL);
|
||||
+ sqlite3_bind_text (priv->stmt_extlog_record, 7, ev->fru_text, -1, NULL);
|
||||
+ sqlite3_bind_blob (priv->stmt_extlog_record, 8, ev->cper_data, ev->cper_data_length, NULL);
|
||||
+
|
||||
+ rc = sqlite3_step(priv->stmt_extlog_record);
|
||||
+ if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "Failed to do extlog_mem_record step on sqlite: error = %d\n", rc);
|
||||
+ rc = sqlite3_reset(priv->stmt_extlog_record);
|
||||
+ if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "Failed reset extlog_mem_record on sqlite: error = %d\n",
|
||||
+ rc);
|
||||
+ log(TERM, LOG_INFO, "register inserted at db\n");
|
||||
+
|
||||
+ return rc;
|
||||
+}
|
||||
+#endif
|
||||
|
||||
/*
|
||||
* Table and functions to handle mce:mce_record
|
||||
@@ -385,6 +436,13 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras)
|
||||
&aer_event_tab);
|
||||
#endif
|
||||
|
||||
+#ifdef HAVE_EXTLOG
|
||||
+ rc = ras_mc_create_table(priv, &extlog_event_tab);
|
||||
+ if (rc == SQLITE_OK)
|
||||
+ rc = ras_mc_prepare_stmt(priv, &priv->stmt_extlog_record,
|
||||
+ &extlog_event_tab);
|
||||
+#endif
|
||||
+
|
||||
#ifdef HAVE_MCE
|
||||
rc = ras_mc_create_table(priv, &mce_record_tab);
|
||||
if (rc == SQLITE_OK)
|
||||
diff --git a/ras-record.h b/ras-record.h
|
||||
index 6f146a8..5d84297 100644
|
||||
--- a/ras-record.h
|
||||
+++ b/ras-record.h
|
||||
@@ -19,8 +19,11 @@
|
||||
#ifndef __RAS_RECORD_H
|
||||
#define __RAS_RECORD_H
|
||||
|
||||
+#include <stdint.h>
|
||||
#include "config.h"
|
||||
|
||||
+extern long user_hz;
|
||||
+
|
||||
struct ras_events *ras;
|
||||
|
||||
struct ras_mc_event {
|
||||
@@ -40,8 +43,22 @@ struct ras_aer_event {
|
||||
const char *msg;
|
||||
};
|
||||
|
||||
/*
 * Decoded extlog memory event, as reported and stored by rasdaemon.
 * The three pointer members reference raw event data owned by the
 * trace record; nothing here is allocated or freed by this struct.
 */
struct ras_extlog_event {
	char timestamp[64];		/* formatted event time */
	int32_t error_seq;		/* event sequence number */
	int8_t etype;			/* error type code */
	int8_t severity;		/* error severity code */
	unsigned long long address;	/* physical address */
	/*
	 * Unsigned on purpose: the value is compared against 0xff,
	 * which a signed 8-bit member cannot represent.
	 */
	uint8_t pa_mask_lsb;
	const char *fru_id;		/* 16 raw id bytes */
	const char *fru_text;
	const char *cper_data;		/* raw location record */
	unsigned short cper_data_length;
};
|
||||
+
|
||||
struct ras_mc_event;
|
||||
struct ras_aer_event;
|
||||
+struct ras_extlog_event;
|
||||
struct mce_event;
|
||||
|
||||
#ifdef HAVE_SQLITE3
|
||||
@@ -57,18 +74,23 @@ struct sqlite3_priv {
|
||||
#ifdef HAVE_MCE
|
||||
sqlite3_stmt *stmt_mce_record;
|
||||
#endif
|
||||
+#ifdef HAVE_EXTLOG
|
||||
+ sqlite3_stmt *stmt_extlog_record;
|
||||
+#endif
|
||||
};
|
||||
|
||||
int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras);
|
||||
int ras_store_mc_event(struct ras_events *ras, struct ras_mc_event *ev);
|
||||
int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev);
|
||||
int ras_store_mce_record(struct ras_events *ras, struct mce_event *ev);
|
||||
+int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev);
|
||||
|
||||
#else
|
||||
static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; };
|
||||
static inline int ras_store_mc_event(struct ras_events *ras, struct ras_mc_event *ev) { return 0; };
|
||||
static inline int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev) { return 0; };
|
||||
static inline int ras_store_mce_record(struct ras_events *ras, struct mce_event *ev) { return 0; };
|
||||
+static inline int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev) { return 0; };
|
||||
|
||||
#endif
|
||||
|
||||
diff --git a/rasdaemon.c b/rasdaemon.c
|
||||
index 85ac2d4..41022ef 100644
|
||||
--- a/rasdaemon.c
|
||||
+++ b/rasdaemon.c
|
||||
@@ -68,6 +68,8 @@ static error_t parse_opt(int k, char *arg, struct argp_state *state)
|
||||
return 0;
|
||||
}
|
||||
|
||||
+long user_hz;
|
||||
+
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
struct arguments args;
|
||||
@@ -91,6 +93,8 @@ int main(int argc, char *argv[])
|
||||
};
|
||||
memset (&args, 0, sizeof(args));
|
||||
|
||||
+ user_hz = sysconf(_SC_CLK_TCK);
|
||||
+
|
||||
argp_parse(&argp, argc, argv, 0, &idx, &args);
|
||||
|
||||
if (idx < 0) {
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index e9f9c59..110262f 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -842,11 +842,141 @@ sub find_prog
|
||||
return "";
|
||||
}
|
||||
|
||||
# Translate an extlog error type code (0..15) into its display name.
# Any other value yields "unknown-type".
sub get_extlog_type
{
    my ($code) = @_;
    my @names = (
        "unknown",
        "no error",
        "single-bit ECC",
        "multi-bit ECC",
        "single-symbol chipkill ECC",
        "multi-symbol chipkill ECC",
        "master abort",
        "target abort",
        "parity error",
        "watchdog timeout",
        "invalid address",
        "mirror Broken",
        "memory sparing",
        "scrub corrected error",
        "scrub uncorrected error",
        "physical memory map-out event",
    );

    return "unknown-type" if ($code < 0 || $code > 15);
    return $names[$code];
}
|
||||
+
|
||||
# Translate an extlog severity code (0..3) into its display name.
# Any other value yields "unknown-severity".
sub get_extlog_severity
{
    my ($code) = @_;
    my @names = ("recoverable", "fatal", "corrected", "informational");

    return "unknown-severity" if ($code < 0 || $code > 3);
    return $names[$code];
}
|
||||
+
|
||||
+use constant {
|
||||
+ CPER_MEM_VALID_NODE => 0x0008,
|
||||
+ CPER_MEM_VALID_CARD => 0x0010,
|
||||
+ CPER_MEM_VALID_MODULE => 0x0020,
|
||||
+ CPER_MEM_VALID_BANK => 0x0040,
|
||||
+ CPER_MEM_VALID_DEVICE => 0x0080,
|
||||
+ CPER_MEM_VALID_ROW => 0x0100,
|
||||
+ CPER_MEM_VALID_COLUMN => 0x0200,
|
||||
+ CPER_MEM_VALID_BIT_POSITION => 0x0400,
|
||||
+ CPER_MEM_VALID_REQUESTOR_ID => 0x0800,
|
||||
+ CPER_MEM_VALID_RESPONDER_ID => 0x1000,
|
||||
+ CPER_MEM_VALID_TARGET_ID => 0x2000,
|
||||
+ CPER_MEM_VALID_ERROR_TYPE => 0x4000,
|
||||
+ CPER_MEM_VALID_RANK_NUMBER => 0x8000,
|
||||
+ CPER_MEM_VALID_CARD_HANDLE => 0x10000,
|
||||
+ CPER_MEM_VALID_MODULE_HANDLE => 0x20000,
|
||||
+};
|
||||
+
|
||||
# Decode a stored cper_data blob and return its valid members as a
# comma-separated "name=value" list.
#
# The blob is unpacked as one 64-bit validation bitmap followed by the
# location members; a member is only printed when its CPER_MEM_VALID_*
# flag is set in the bitmap. Returns "" when no printed member is valid.
sub get_cper_data_text
{
    my $cper_data = $_[0];
    my ($validation_bits, $node, $card, $module, $bank, $device, $row, $column, $bit_pos, $requestor_id, $responder_id, $target_id, $rank, $mem_array_handle, $mem_dev_handle) = unpack 'QSSSSSSSSQQQSSS', $cper_data;

    # [flag, format, value], in output order. Every entry carries its
    # name, so the three hex ids can be told apart in the output.
    my @fields = (
        [CPER_MEM_VALID_NODE,          "node=%d",               $node],
        [CPER_MEM_VALID_CARD,          "card=%d",               $card],
        [CPER_MEM_VALID_MODULE,        "module=%d",             $module],
        [CPER_MEM_VALID_BANK,          "bank=%d",               $bank],
        [CPER_MEM_VALID_DEVICE,        "device=%d",             $device],
        [CPER_MEM_VALID_ROW,           "row=%d",                $row],
        [CPER_MEM_VALID_COLUMN,        "column=%d",             $column],
        [CPER_MEM_VALID_BIT_POSITION,  "bit_position=%d",       $bit_pos],
        [CPER_MEM_VALID_REQUESTOR_ID,  "requestor_id=0x%08x",   $requestor_id],
        [CPER_MEM_VALID_RESPONDER_ID,  "responder_id=0x%08x",   $responder_id],
        [CPER_MEM_VALID_TARGET_ID,     "target_id=0x%08x",      $target_id],
        [CPER_MEM_VALID_RANK_NUMBER,   "rank=%d",               $rank],
        [CPER_MEM_VALID_CARD_HANDLE,   "mem_array_handle=%d",   $mem_array_handle],
        [CPER_MEM_VALID_MODULE_HANDLE, "mem_dev_handle=%d",     $mem_dev_handle],
    );

    my @out;
    foreach my $field (@fields) {
        my ($flag, $format, $value) = @$field;
        push @out, (sprintf $format, $value) if ($validation_bits & $flag);
    }

    return join (", ", @out);
}
|
||||
+
|
||||
# Format a 16-byte UUID blob as "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx".
# The first three groups are printed byte-swapped, following @order.
sub get_uuid_le
{
    my @octets = unpack "C*", $_[0];
    my @order = (3, 2, 1, 0, 5, 4, 7, 6, 8, 9, 10, 11, 12, 13, 14, 15);
    my %dash_after = map { $_ => 1 } (3, 5, 7, 9);
    my $text = "";

    foreach my $pos (0 .. 15) {
        $text .= sprintf "%.2x", $octets[$order[$pos]];
        $text .= "-" if ($dash_after{$pos});
    }
    return $text;
}
|
||||
+
|
||||
sub summary
|
||||
{
|
||||
require DBI;
|
||||
my ($query, $query_handle, $out);
|
||||
my ($err_type, $label, $mc, $top, $mid, $low, $count, $msg);
|
||||
+ my ($etype, $severity, $etype_string, $severity_string);
|
||||
|
||||
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
|
||||
|
||||
@@ -882,6 +1012,24 @@ sub summary
|
||||
}
|
||||
$query_handle->finish;
|
||||
|
||||
+ # extlog errors
|
||||
+ $query = "select etype, severity, count(*) from extlog_event group by etype, severity";
|
||||
+ $query_handle = $dbh->prepare($query);
|
||||
+ $query_handle->execute();
|
||||
+ $query_handle->bind_columns(\($etype, $severity, $count));
|
||||
+ $out = "";
|
||||
+ while($query_handle->fetch()) {
|
||||
+ $etype_string = get_extlog_type($etype);
|
||||
+ $severity_string = get_extlog_severity($severity);
|
||||
+ $out .= "\t$count $etype_string $severity_string errors\n";
|
||||
+ }
|
||||
+ if ($out ne "") {
|
||||
+ print "Extlog records summary:\n$out";
|
||||
+ } else {
|
||||
+ print "No Extlog errors.\n";
|
||||
+ }
|
||||
+ $query_handle->finish;
|
||||
+
|
||||
# MCE mce_record errors
|
||||
$query = "select error_msg, count(*) from mce_record group by error_msg";
|
||||
$query_handle = $dbh->prepare($query);
|
||||
@@ -906,6 +1054,7 @@ sub errors
|
||||
require DBI;
|
||||
my ($query, $query_handle, $id, $time, $count, $type, $msg, $label, $mc, $top, $mid, $low, $addr, $grain, $syndrome, $detail, $out);
|
||||
my ($mcgcap,$mcgstatus, $status, $misc, $ip, $tsc, $walltime, $cpu, $cpuid, $apicid, $socketid, $cs, $bank, $cpuvendor, $bank_name, $mcgstatus_msg, $mcistatus_msg, $user_action, $mc_location);
|
||||
+ my ($timestamp, $etype, $severity, $etype_string, $severity_string, $fru_id, $fru_text, $cper_data);
|
||||
|
||||
my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", {});
|
||||
|
||||
@@ -945,6 +1094,31 @@ sub errors
|
||||
}
|
||||
$query_handle->finish;
|
||||
|
||||
+ # Extlog errors
|
||||
+ $query = "select id, timestamp, etype, severity, address, fru_id, fru_text, cper_data from extlog_event order by id";
|
||||
+ $query_handle = $dbh->prepare($query);
|
||||
+ $query_handle->execute();
|
||||
+ $query_handle->bind_columns(\($id, $timestamp, $etype, $severity, $addr, $fru_id, $fru_text, $cper_data));
|
||||
+ $out = "";
|
||||
+ while($query_handle->fetch()) {
|
||||
+ $etype_string = get_extlog_type($etype);
|
||||
+ $severity_string = get_extlog_severity($severity);
|
||||
+ $out .= "$id $timestamp error: ";
|
||||
+ $out .= "type=$etype_string, ";
|
||||
+ $out .= "severity=$severity_string, ";
|
||||
+ $out .= sprintf "address=0x%08x, ", $addr;
|
||||
+ $out .= sprintf "fru_id=%s, ", get_uuid_le($fru_id);
|
||||
+ $out .= "fru_text='$fru_text', ";
|
||||
+ $out .= get_cper_data_text($cper_data) if ($cper_data);
|
||||
+ $out .= "\n";
|
||||
+ }
|
||||
+ if ($out ne "") {
|
||||
+ print "Extlog events:\n$out\n";
|
||||
+ } else {
|
||||
+ print "No Extlog errors.\n\n";
|
||||
+ }
|
||||
+ $query_handle->finish;
|
||||
+
|
||||
# MCE mce_record errors
|
||||
$query = "select id, timestamp, mcgcap, mcgstatus, status, addr, misc, ip, tsc, walltime, cpu, cpuid, apicid, socketid, cs, bank, cpuvendor, bank_name, error_msg, mcgstatus_msg, mcistatus_msg, user_action, mc_location from mce_record order by id";
|
||||
$query_handle = $dbh->prepare($query);
|
|
@ -0,0 +1,37 @@
|
|||
commit d3d336471119f16368e40b68643d9dd928be5385
|
||||
Author: Luck, Tony <tony.luck@intel.com>
|
||||
Date: Mon Apr 7 12:23:25 2014 -0700
|
||||
|
||||
rasdaemon: fix some typos and cut/paste errors in sqlite bits
|
||||
|
||||
aer event has the error_type as field 2 and msg as field 3 - but the calls
|
||||
to sqlite3_bind_text use 3 and 4.
|
||||
|
||||
mce event forgot to declare the "mcastatus_msg"
|
||||
|
||||
Signed-off-by: Tony Luck <tony.luck@intel.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com>
|
||||
|
||||
diff --git a/ras-record.c b/ras-record.c
|
||||
index daa3cb1..e602edb 100644
|
||||
--- a/ras-record.c
|
||||
+++ b/ras-record.c
|
||||
@@ -139,8 +139,8 @@ int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev)
|
||||
log(TERM, LOG_INFO, "aer_event store: %p\n", priv->stmt_aer_event);
|
||||
|
||||
sqlite3_bind_text(priv->stmt_aer_event, 1, ev->timestamp, -1, NULL);
|
||||
- sqlite3_bind_text(priv->stmt_aer_event, 3, ev->error_type, -1, NULL);
|
||||
- sqlite3_bind_text(priv->stmt_aer_event, 4, ev->msg, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_aer_event, 2, ev->error_type, -1, NULL);
|
||||
+ sqlite3_bind_text(priv->stmt_aer_event, 3, ev->msg, -1, NULL);
|
||||
|
||||
rc = sqlite3_step(priv->stmt_aer_event);
|
||||
if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
@@ -189,6 +189,7 @@ static const struct db_fields mce_record_fields[] = {
|
||||
{ .name="error_msg", .type="TEXT" },
|
||||
{ .name="mcgstatus_msg", .type="TEXT" },
|
||||
{ .name="mcistatus_msg", .type="TEXT" }, // 20
|
||||
+ { .name="mcastatus_msg", .type="TEXT" },
|
||||
{ .name="user_action", .type="TEXT" },
|
||||
{ .name="mc_location", .type="TEXT" },
|
||||
};
|
|
@ -0,0 +1,73 @@
|
|||
commit 52e60e3050105a55e1ff2382979d5f370f398200
|
||||
Author: Luck, Tony <tony.luck@intel.com>
|
||||
Date: Mon Apr 7 11:27:47 2014 -0700
|
||||
|
||||
rasdaemon: sqlite truncates some MCE fields to 32-bit
|
||||
|
||||
The sqlite3_bind_int() function takes an "int" as the argument value to
|
||||
save to the database. But some fields are wider than 32-bits. Use
|
||||
sqlite3_bind_int64() for the fields where we know values can exceed
|
||||
4G.
|
||||
|
||||
Before:
|
||||
|
||||
# ./rasdaemon/util/ras-mc-ctl --errors
|
||||
...
|
||||
MCE events:
|
||||
1 2014-04-04 08:50:32 -0700 error: MEMORY CONTROLLER RD_CHANNEL0_ERR Transaction: Memory read error, mcg mcgstatus= 0, mci Corrected_error, mcgcap=0x07000c16, status=0x00010090, addr=0x35fcb9c0, misc=0x5026a686, walltime=0x5342e4f9, cpu=0x0000000e, cpuid=0x000306f1, apicid=0x00000020, socketid=0x00000001, bank=0x00000008
|
||||
2 2014-04-04 08:50:35 -0700 error: MEMORY CONTROLLER RD_CHANNEL0_ERR Transaction: Memory read error, mcg mcgstatus= 0, mci Corrected_error, mcgcap=0x07000c16, status=0x00010090, addr=0x4187adc0, misc=0x4274f486, walltime=0x5342e4fc, cpu=0x0000000e, cpuid=0x000306f1, apicid=0x00000020, socketid=0x00000001, bank=0x00000007
|
||||
3 2014-04-04 08:50:37 -0700 error: MEMORY CONTROLLER RD_CHANNEL0_ERR Transaction: Memory read error, mcg mcgstatus= 0, mci Corrected_error, mcgcap=0x07000c16, status=0x00010090, addr=0x52efc600, misc=0x50028286, walltime=0x5342e4fd, cpu=0x0000000e, cpuid=0x000306f1, apicid=0x00000020, socketid=0x00000001, bank=0x00000008
|
||||
|
||||
After:
|
||||
./rasdaemon/util/ras-mc-ctl --errors
|
||||
...
|
||||
1 2014-04-04 09:00:07 -0700 error: MEMORY CONTROLLER RD_CHANNEL0_ERR Transaction: Memory read error, mcg mcgstatus= 0, mci Corrected_error, mcgcap=0x07000c16, status=0x8c00004000010090, addr=0x45340a180, misc=0x140686886, walltime=0x5342e736, cpuid=0x000306f1, bank=0x00000008
|
||||
2 2014-04-04 09:00:08 -0700 error: MEMORY CONTROLLER RD_CHANNEL0_ERR Transaction: Memory read error, mcg mcgstatus= 0, mci Corrected_error, mcgcap=0x07000c16, status=0x8c00004000010090, addr=0x44d6e4780, misc=0x15060e086, walltime=0x5342e737, cpuid=0x000306f1, bank=0x00000007
|
||||
3 2014-04-04 09:00:10 -0700 error: MEMORY CONTROLLER RD_CHANNEL0_ERR Transaction: Memory read error, mcg mcgstatus= 0, mci Corrected_error, mcgcap=0x07000c16, status=0x8c00004000010090, addr=0x44cb64640, misc=0x140505086, walltime=0x5342e739, cpuid=0x000306f1, bank=0x00000008
|
||||
|
||||
Signed-off-by: Tony Luck <tony.luck@intel.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com>
|
||||
|
||||
diff --git a/ras-record.c b/ras-record.c
|
||||
index e602edb..e5150ad 100644
|
||||
--- a/ras-record.c
|
||||
+++ b/ras-record.c
|
||||
@@ -209,22 +209,22 @@ int ras_store_mce_record(struct ras_events *ras, struct mce_event *ev)
|
||||
return 0;
|
||||
log(TERM, LOG_INFO, "mce_record store: %p\n", priv->stmt_mce_record);
|
||||
|
||||
- sqlite3_bind_text(priv->stmt_mce_record, 1, ev->timestamp, -1, NULL);
|
||||
- sqlite3_bind_int (priv->stmt_mce_record, 2, ev->mcgcap);
|
||||
- sqlite3_bind_int (priv->stmt_mce_record, 3, ev->mcgstatus);
|
||||
- sqlite3_bind_int (priv->stmt_mce_record, 4, ev->status);
|
||||
- sqlite3_bind_int (priv->stmt_mce_record, 5, ev->addr);
|
||||
- sqlite3_bind_int (priv->stmt_mce_record, 6, ev->misc);
|
||||
- sqlite3_bind_int (priv->stmt_mce_record, 7, ev->ip);
|
||||
- sqlite3_bind_int (priv->stmt_mce_record, 8, ev->tsc);
|
||||
- sqlite3_bind_int (priv->stmt_mce_record, 9, ev->walltime);
|
||||
- sqlite3_bind_int (priv->stmt_mce_record, 10, ev->cpu);
|
||||
- sqlite3_bind_int (priv->stmt_mce_record, 11, ev->cpuid);
|
||||
- sqlite3_bind_int (priv->stmt_mce_record, 12, ev->apicid);
|
||||
- sqlite3_bind_int (priv->stmt_mce_record, 13, ev->socketid);
|
||||
- sqlite3_bind_int (priv->stmt_mce_record, 14, ev->cs);
|
||||
- sqlite3_bind_int (priv->stmt_mce_record, 15, ev->bank);
|
||||
- sqlite3_bind_int (priv->stmt_mce_record, 16, ev->cpuvendor);
|
||||
+ sqlite3_bind_text (priv->stmt_mce_record, 1, ev->timestamp, -1, NULL);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 2, ev->mcgcap);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 3, ev->mcgstatus);
|
||||
+ sqlite3_bind_int64 (priv->stmt_mce_record, 4, ev->status);
|
||||
+ sqlite3_bind_int64 (priv->stmt_mce_record, 5, ev->addr);
|
||||
+ sqlite3_bind_int64 (priv->stmt_mce_record, 6, ev->misc);
|
||||
+ sqlite3_bind_int64 (priv->stmt_mce_record, 7, ev->ip);
|
||||
+ sqlite3_bind_int64 (priv->stmt_mce_record, 8, ev->tsc);
|
||||
+ sqlite3_bind_int64 (priv->stmt_mce_record, 9, ev->walltime);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 10, ev->cpu);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 11, ev->cpuid);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 12, ev->apicid);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 13, ev->socketid);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 14, ev->cs);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 15, ev->bank);
|
||||
+ sqlite3_bind_int (priv->stmt_mce_record, 16, ev->cpuvendor);
|
||||
|
||||
sqlite3_bind_text(priv->stmt_mce_record, 17, ev->bank_name, -1, NULL);
|
||||
sqlite3_bind_text(priv->stmt_mce_record, 18, ev->error_msg, -1, NULL);
|
|
@ -0,0 +1,44 @@
|
|||
commit f20a366a9b7a32a1be6fc89e7546cc2b4cb690bf
|
||||
Author: Xie XiuQi <xiexiuqi@huawei.com>
|
||||
Date: Thu May 8 20:07:19 2014 +0800
|
||||
|
||||
rasdaemon: fix mce numfield decoded error
|
||||
|
||||
Some fields are missing in mce decode information, as below:
|
||||
...
|
||||
rasdaemon: register inserted at db
|
||||
<...>-31568 [000] 4023.214080: mce_record:
|
||||
2014-05-07 15:51:16 +0800 bank=2, status= bd000000000000c0, MEMORY
|
||||
CONTROLLER MS_CHANNEL0_ERR Transaction: Memory scrubbing error %s: %Lu
|
||||
%s: %Lx
|
||||
%s: %Lx
|
||||
%s: %Lu
|
||||
%s: %Lu
|
||||
%s: %Lx
|
||||
, mci=Uncorrected_error Error_enabled SRAO, n_errors=0 channel=0,
|
||||
dimm=0, cpu_type= Intel Xeon 5500 series / Core i3/5/7
|
||||
("Nehalem/Westmere"), cpu= 0, socketid= 0, ip= 1eadbabe (INEXACT), cs=
|
||||
73, misc= 8c, addr= 62b000, mcgstatus= 5 RIPV MCIP, mcgcap= 1c09,
|
||||
apicid= 0
|
||||
|
||||
"f->name" and "v" are not passed to the print call in decode_numfield(), so fix it.
|
||||
|
||||
Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com>
|
||||
|
||||
diff --git a/bitfield.c b/bitfield.c
|
||||
index b2895b4..07795a9 100644
|
||||
--- a/bitfield.c
|
||||
+++ b/bitfield.c
|
||||
@@ -92,8 +92,9 @@ void decode_numfield(struct mce_event *e, uint64_t status,
|
||||
uint64_t mask = (1ULL << (f->end - f->start + 1)) - 1;
|
||||
uint64_t v = (status >> f->start) & mask;
|
||||
if (v > 0 || f->force) {
|
||||
- mce_snprintf(e->error_msg, "%%s: %s\n",
|
||||
- f->fmt ? f->fmt : "%Lu");
|
||||
+ char fmt[32] = {0};
|
||||
+ snprintf(fmt, 32, "%%s: %s\n", f->fmt ? f->fmt : "%Lu");
|
||||
+ mce_snprintf(e->error_msg, fmt, f->name, v);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,84 @@
|
|||
From 7e79fa94dc6c294cd731c0c684b277dd4811c5db Mon Sep 17 00:00:00 2001
|
||||
From: Aristeu Rozanski <aris@redhat.com>
|
||||
Date: Fri, 15 Aug 2014 13:50:58 -0400
|
||||
Subject: [PATCH 3/4] rasdaemon: do not assume dimmX/ directories will be
|
||||
present
|
||||
|
||||
While finding the labels, size and location, ras-mc-ctl will search /sys for
|
||||
the files and calculate the location. When it uses the location trying to map
|
||||
back to files to print labels or write labels, it'll just assume dimm*
|
||||
directories exist which is not correct while using drivers like amd64_edac.
|
||||
This patch adds two new hashes to store the location and the label file path
|
||||
so they can be used later.
|
||||
|
||||
Signed-off-by: Aristeu Rozanski <aris@redhat.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <m.chehab@samsung.com>
|
||||
---
|
||||
util/ras-mc-ctl.in | 21 +++++++++++++--------
|
||||
1 file changed, 13 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index 110262f..7b6d798 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -45,6 +45,8 @@ my %conf = ();
|
||||
my %bus = ();
|
||||
my %dimm_size = ();
|
||||
my %dimm_node = ();
|
||||
+my %dimm_label_file = ();
|
||||
+my %dimm_location = ();
|
||||
my %csrow_size = ();
|
||||
my %rank_size = ();
|
||||
my %csrow_ranks = ();
|
||||
@@ -278,6 +280,9 @@ sub parse_dimm_nodes
|
||||
my $str_loc = join(':', $mc, @pos);
|
||||
$dimm_size{$str_loc} = $size;
|
||||
$dimm_node{$str_loc} = $dimm;
|
||||
+ $file =~ s/size/dimm_label/;
|
||||
+ $dimm_label_file{$str_loc} = $file;
|
||||
+ $dimm_location{$str_loc} = $location;
|
||||
|
||||
return;
|
||||
}
|
||||
@@ -557,12 +562,14 @@ sub read_dimm_label
|
||||
|
||||
my $dimm = $dimm_node{$pos};
|
||||
|
||||
- my $file = "$sysfs/mc$mc/dimm$dimm/dimm_label";
|
||||
+ my $dimm_label_file = $dimm_label_file{$pos};
|
||||
|
||||
- return ("$pos missing") unless -f $file;
|
||||
+ my $location = $dimm_location{$pos};
|
||||
|
||||
- if (!open (LABEL, "$file")) {
|
||||
- warn "Failed to open $file: $!\n";
|
||||
+ return ("label missing", "$pos missing") unless -f $dimm_label_file;
|
||||
+
|
||||
+ if (!open (LABEL, "$dimm_label_file")) {
|
||||
+ warn "Failed to open $dimm_label_file: $!\n";
|
||||
return ("Error");
|
||||
}
|
||||
|
||||
@@ -570,7 +577,7 @@ sub read_dimm_label
|
||||
|
||||
close (LABEL);
|
||||
|
||||
- $pos = "mc$mc " . qx(cat $sysfs/mc$mc/dimm$dimm/dimm_location);
|
||||
+ $pos = "mc$mc $location";
|
||||
|
||||
return ($label, $pos);
|
||||
}
|
||||
@@ -587,9 +594,7 @@ sub get_dimm_label_node
|
||||
|
||||
return "" if (!defined($dimm_node{$pos}));
|
||||
|
||||
- my $dimm = $dimm_node{$pos};
|
||||
-
|
||||
- return "$sysfs/mc$mc/dimm$dimm/dimm_label";
|
||||
+ return "$dimm_label_file{$pos}";
|
||||
}
|
||||
|
||||
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,119 @@
|
|||
Hello,
|
||||
|
||||
This patch adds labels for these Dell PowerEdge Servers:
|
||||
|
||||
R610,R/T710, R220, R/T620, R720/xd, R730/xd, M520, M620 and M820.
|
||||
|
||||
The current T610 (0F5XM3) mapping is incorrect. This patch fixes it.
|
||||
|
||||
Request review and inclusion to git repo.
|
||||
|
||||
Acked-by: Aristeu Rozanski <aris@redhat.com>
|
||||
Signed-off-by: Charles Rose <charles.rose.linux@gmail.com>
|
||||
---
|
||||
Changes in v2:
|
||||
- Include T110 II, T20, R/T320, M420, R/T420, R/T630, FC620, FC420
|
||||
- Include additional model numbers for M820 and some 2-socket systems.
|
||||
- Consolidate systems with similar maps.
|
||||
---
|
||||
labels/dell | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
|
||||
1 file changed, 79 insertions(+), 7 deletions(-)
|
||||
|
||||
diff --git a/labels/dell b/labels/dell
|
||||
index e1a09a7..d7e797b 100644
|
||||
--- a/labels/dell
|
||||
+++ b/labels/dell
|
||||
@@ -9,12 +9,84 @@
|
||||
#
|
||||
|
||||
Vendor: Dell Inc.
|
||||
+#### 11G ####
|
||||
+# 2-socket
|
||||
+# PowerEdge R610
|
||||
+ Model: 0K399H, 0F0XJ6
|
||||
+ DIMM_A1: 0.0.0; DIMM_A2: 0.0.1; DIMM_A3: 0.0.2;
|
||||
+ DIMM_A4: 0.1.0; DIMM_A5: 0.1.1; DIMM_A6: 0.1.2;
|
||||
|
||||
- Model: 0F5XM3
|
||||
- DIMM_A1: 0.0.0; DIMM_A2: 0.0.1; DIMM_A3: 0.0.2; DIMM_A4: 0.0.3;
|
||||
- DIMM_A5: 0.1.0; DIMM_A6: 0.1.1; DIMM_A7: 0.1.2; DIMM_A8: 0.1.3;
|
||||
- DIMM_A9: 0.2.0; DIMM_A10: 0.2.1; DIMM_A11: 0.2.2; DIMM_A12: 0.2.3;
|
||||
+ DIMM_B1: 1.0.0; DIMM_B2: 1.0.1; DIMM_B3: 1.0.2;
|
||||
+ DIMM_B4: 1.1.0; DIMM_B5: 1.1.1; DIMM_B6: 1.1.2;
|
||||
|
||||
- DIMM_B1: 1.0.0; DIMM_B2: 1.0.1; DIMM_B3: 1.0.2; DIMM_B4: 1.0.3;
|
||||
- DIMM_B5: 1.1.0; DIMM_B6: 1.1.1; DIMM_B7: 1.1.2; DIMM_B8: 1.1.3;
|
||||
- DIMM_B9: 1.2.0; DIMM_B10: 1.2.1; DIMM_B11: 1.2.2; DIMM_B12: 1.2.3;
|
||||
+# PowerEdge T710 R710
|
||||
+ Model: 01CTXG, 0N0H4P, 0MD99X, 0N047H, 0PV9DG
|
||||
+ DIMM_A3: 0.0.0; DIMM_A2: 0.1.0; DIMM_A1: 0.2.0;
|
||||
+ DIMM_A6: 0.0.1; DIMM_A5: 0.1.1; DIMM_A4: 0.2.1;
|
||||
+ DIMM_A9: 0.0.2; DIMM_A8: 0.1.2; DIMM_A7: 0.2.2;
|
||||
+
|
||||
+ DIMM_B3: 1.0.0; DIMM_B2: 1.1.0; DIMM_B1: 1.2.0;
|
||||
+ DIMM_B6: 1.0.1; DIMM_B5: 1.1.1; DIMM_B4: 1.2.1;
|
||||
+ DIMM_B9: 1.0.2; DIMM_B8: 1.1.2; DIMM_B7: 1.2.2;
|
||||
+
|
||||
+#### 12/13G ####
|
||||
+# 1-socket
|
||||
+# PowerEdge R220
|
||||
+ Model: 081N4V
|
||||
+ DIMM_A1: 0.0.0; DIMM_A2: 0.0.1;
|
||||
+ DIMM_A3: 0.1.0; DIMM_A4: 0.1.1;
|
||||
+
|
||||
+#PowerEdge T110 II, T20
|
||||
+ Model: 0PC2WT, 0PM2CW, 015TH9, 0MDHN4, 0VD5HY
|
||||
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0;
|
||||
+
|
||||
+ DIMM_B1: 0.0.1; DIMM_B2: 0.1.1;
|
||||
+
|
||||
+#PowerEdge R320 T320
|
||||
+ Model: 0YCV59, 0Y97HY, 07DKYR, 0VJ84C, 07MYHN, 04DMNN, 0W7H8C, 0K20G5, 0V719V, 0FDT3J
|
||||
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0;
|
||||
+ DIMM_A4: 0.0.1; DIMM_A5: 0.1.1; DIMM_A6: 0.2.1;
|
||||
+
|
||||
+# 2-socket
|
||||
+# PowerEdge R620/T620 R720/xd R730/xd T630 R730 R630 T620 M620, FC620
|
||||
+ Model: 0VWT90, 07NDJ2, 0F5XM3, 0PXXHP, 0X3D66, 061P35, 0H5J4J, 00W9X3, 0599V5, 0W9WXC, 0599V5, 0H21J3, 0CNCJW, 02CD1V, 0T5TFW, 0F5XM3, 0G1CNH, 05YV77, 0PDCCX, 093MW8, 0NJVT7
|
||||
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0;
|
||||
+ DIMM_A5: 0.0.1; DIMM_A6: 0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1;
|
||||
+ DIMM_A9: 0.0.2; DIMM_A10: 0.1.2; DIMM_A11: 0.2.2; DIMM_A12: 0.3.2;
|
||||
+
|
||||
+ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0;
|
||||
+ DIMM_B5: 1.0.1; DIMM_B6: 1.1.1; DIMM_B7: 1.2.1; DIMM_B8: 1.3.1;
|
||||
+ DIMM_B9: 1.0.2; DIMM_B10: 1.1.2; DIMM_B11: 1.2.2; DIMM_B12: 1.3.2;
|
||||
+
|
||||
+# PowerEdge M520 R420 T420
|
||||
+ Model: 0NRG83, 0DW6GX, 03WPHJ, 06HTRX, 0H1Y24, 02T9N6, 0TT5P2, 0CPKXG, 03015M, 061VPC, 0PC9H0, 0K3G34, 0PC0V5, 08NVYK
|
||||
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0;
|
||||
+ DIMM_A4: 0.0.1; DIMM_A5: 0.1.1; DIMM_A6: 0.2.1;
|
||||
+
|
||||
+ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0;
|
||||
+ DIMM_B4: 1.0.1; DIMM_B5: 1.1.1; DIMM_B6: 1.2.1;
|
||||
+
|
||||
+#PowerEdge FC420, M420
|
||||
+ Model: 0DPJGD, 068CTP, 0MN3VC, 0417VP
|
||||
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0;
|
||||
+
|
||||
+ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0;
|
||||
+
|
||||
+# 4-socket
|
||||
+# # PowerEdge M820
|
||||
+ Model: 0RN9TC, 0YWR73, 066N7P, 0PFG1N, 0JC2W3
|
||||
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0;
|
||||
+ DIMM_A5: 0.0.1; DIMM_A6: 0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1;
|
||||
+ DIMM_A9: 0.0.2; DIMM_A10: 0.1.2; DIMM_A11: 0.2.2; DIMM_A12: 0.3.2;
|
||||
+
|
||||
+ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0;
|
||||
+ DIMM_B5: 1.0.1; DIMM_B6: 1.1.1; DIMM_B7: 1.2.1; DIMM_B8: 1.3.1;
|
||||
+ DIMM_B9: 1.0.2; DIMM_B10: 1.1.2; DIMM_B11: 1.2.2; DIMM_B12: 1.3.2;
|
||||
+
|
||||
+ DIMM_C1: 2.0.0; DIMM_C2: 2.1.0; DIMM_C3: 2.2.0; DIMM_C4: 2.3.0;
|
||||
+ DIMM_C5: 2.0.1; DIMM_C6: 2.1.1; DIMM_C7: 2.2.1; DIMM_C8: 2.3.1;
|
||||
+ DIMM_C9: 2.0.2; DIMM_C10: 2.1.2; DIMM_C11: 2.2.2; DIMM_C12: 2.3.2;
|
||||
+
|
||||
+ DIMM_D1: 3.0.0; DIMM_D2: 3.1.0; DIMM_D3: 3.2.0; DIMM_D4: 3.3.0;
|
||||
+ DIMM_D5: 3.0.1; DIMM_D6: 3.1.1; DIMM_D7: 3.2.1; DIMM_D8: 3.3.1;
|
||||
+ DIMM_D9: 3.0.2; DIMM_D10: 3.1.2; DIMM_D11: 3.2.2; DIMM_D12: 3.3.2;
|
||||
--
|
||||
1.9.3
|
|
@ -0,0 +1,295 @@
|
|||
From 108b124a09512d44cd810d1ef6b823c9d029d5d6 Mon Sep 17 00:00:00 2001
|
||||
From: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Date: Mon, 18 May 2015 14:19:28 -0300
|
||||
Subject: [PATCH 01/13] rasdaemon: add support for Haswell
|
||||
|
||||
Based on mcelog code.
|
||||
|
||||
Acked-by: Tony Luck <tony.luck@intel.com>
|
||||
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
|
||||
---
|
||||
Makefile.am | 2 +-
|
||||
mce-intel-haswell.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
mce-intel.c | 2 +
|
||||
ras-mce-handler.c | 8 +++
|
||||
ras-mce-handler.h | 3 +
|
||||
5 files changed, 208 insertions(+), 1 deletion(-)
|
||||
create mode 100644 mce-intel-haswell.c
|
||||
|
||||
diff --git a/Makefile.am b/Makefile.am
|
||||
index 9c5f007..a6bf18f 100644
|
||||
--- a/Makefile.am
|
||||
+++ b/Makefile.am
|
||||
@@ -28,7 +28,7 @@ if WITH_MCE
|
||||
rasdaemon_SOURCES += ras-mce-handler.c mce-intel.c mce-amd-k8.c \
|
||||
mce-intel-p4-p6.c mce-intel-nehalem.c \
|
||||
mce-intel-dunnington.c mce-intel-tulsa.c \
|
||||
- mce-intel-sb.c mce-intel-ivb.c
|
||||
+ mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c
|
||||
endif
|
||||
if WITH_EXTLOG
|
||||
rasdaemon_SOURCES += ras-extlog-handler.c
|
||||
diff --git a/mce-intel-haswell.c b/mce-intel-haswell.c
|
||||
new file mode 100644
|
||||
index 0000000..c32704c
|
||||
--- /dev/null
|
||||
+++ b/mce-intel-haswell.c
|
||||
@@ -0,0 +1,194 @@
|
||||
+/*
|
||||
+ * The code below came from Tony Luck mcelog code,
|
||||
+ * released under GNU Public General License, v.2
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or modify
|
||||
+ * it under the terms of the GNU General Public License as published by
|
||||
+ * the Free Software Foundation; either version 2 of the License, or
|
||||
+ * (at your option) any later version.
|
||||
+ *
|
||||
+ * This program is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
+ * GNU General Public License for more details.
|
||||
+ *
|
||||
+ * You should have received a copy of the GNU General Public License
|
||||
+ * along with this program; if not, write to the Free Software
|
||||
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
+*/
|
||||
+
|
||||
+#include <string.h>
|
||||
+#include <stdio.h>
|
||||
+
|
||||
+#include "ras-mce-handler.h"
|
||||
+#include "bitfield.h"
|
||||
+
|
||||
+
|
||||
+/* See IA32 SDM Vol3B Table 16-20 */
|
||||
+
|
||||
+static char *pcu_1[] = {
|
||||
+ [0x00] = "No Error",
|
||||
+ [0x09] = "MC_MESSAGE_CHANNEL_TIMEOUT",
|
||||
+ [0x0D] = "MC_IMC_FORCE_SR_S3_TIMEOUT",
|
||||
+ [0x0E] = "MC_CPD_UNCPD_SD_TIMEOUT",
|
||||
+ [0x13] = "MC_DMI_TRAINING_TIMEOUT",
|
||||
+ [0x15] = "MC_DMI_CPU_RESET_ACK_TIMEOUT",
|
||||
+ [0x1E] = "MC_VR_ICC_MAX_LT_FUSED_ICC_MAX",
|
||||
+ [0x25] = "MC_SVID_COMMAN_TIMEOUT",
|
||||
+ [0x29] = "MC_VR_VOUT_MAC_LT_FUSED_SVID",
|
||||
+ [0x2B] = "MC_PKGC_WATCHDOG_HANG_CBZ_DOWN",
|
||||
+ [0x2C] = "MC_PKGC_WATCHDOG_HANG_CBZ_UP",
|
||||
+ [0x39] = "MC_PKGC_WATCHDOG_HANG_C3_UP_SF",
|
||||
+ [0x44] = "MC_CRITICAL_VR_FAILED",
|
||||
+ [0x45] = "MC_ICC_MAX_NOTSUPPORTED",
|
||||
+ [0x46] = "MC_VID_RAMP_DOWN_FAILED",
|
||||
+ [0x47] = "MC_EXCL_MODE_NO_PMREQ_CMP",
|
||||
+ [0x48] = "MC_SVID_READ_REG_ICC_MAX_FAILED",
|
||||
+ [0x49] = "MC_SVID_WRITE_REG_VOUT_MAX_FAILED",
|
||||
+ [0x4B] = "MC_BOOT_VID_TIMEOUT_DRAM_0",
|
||||
+ [0x4C] = "MC_BOOT_VID_TIMEOUT_DRAM_1",
|
||||
+ [0x4D] = "MC_BOOT_VID_TIMEOUT_DRAM_2",
|
||||
+ [0x4E] = "MC_BOOT_VID_TIMEOUT_DRAM_3",
|
||||
+ [0x4F] = "MC_SVID_COMMAND_ERROR",
|
||||
+ [0x52] = "MC_FIVR_CATAS_OVERVOL_FAULT",
|
||||
+ [0x53] = "MC_FIVR_CATAS_OVERCUR_FAULT",
|
||||
+ [0x57] = "MC_SVID_PKGC_REQUEST_FAILED",
|
||||
+ [0x58] = "MC_SVID_IMON_REQUEST_FAILED",
|
||||
+ [0x59] = "MC_SVID_ALERT_REQUEST_FAILED",
|
||||
+ [0x60] = "MC_INVALID_PKGS_REQ_PCH",
|
||||
+ [0x61] = "MC_INVALID_PKGS_REQ_QPI",
|
||||
+ [0x62] = "MC_INVALID_PKGS_RSP_QPI",
|
||||
+ [0x63] = "MC_INVALID_PKGS_RSP_PCH",
|
||||
+ [0x64] = "MC_INVALID_PKG_STATE_CONFIG",
|
||||
+ [0x67] = "MC_HA_IMC_RW_BLOCK_ACK_TIMEOUT",
|
||||
+ [0x68] = "MC_IMC_RW_SMBUS_TIMEOUT",
|
||||
+ [0x69] = "MC_HA_FAILSTS_CHANGE_DETECTED",
|
||||
+ [0x6A] = "MC_MSGCH_PMREQ_CMP_TIMEOUT",
|
||||
+ [0x70] = "MC_WATCHDOG_TIMEOUT_PKGC_SLAVE",
|
||||
+ [0x71] = "MC_WATCHDOG_TIMEOUT_PKGC_MASTER",
|
||||
+ [0x72] = "MC_WATCHDOG_TIMEOUT_PKGS_MASTER",
|
||||
+ [0x7C] = "MC_BIOS_RST_CPL_INVALID_SEQ",
|
||||
+ [0x7D] = "MC_MORE_THAN_ONE_TXT_AGENT",
|
||||
+ [0x81] = "MC_RECOVERABLE_DIE_THERMAL_TOO_HOT"
|
||||
+};
|
||||
+
|
||||
+static struct field pcu_mc4[] = {
|
||||
+ FIELD(24, pcu_1),
|
||||
+ {}
|
||||
+};
|
||||
+
|
||||
+/* See IA32 SDM Vol3B Table 16-21 */
|
||||
+
|
||||
+static char *qpi[] = {
|
||||
+ [0x02] = "Intel QPI physical layer detected drift buffer alarm",
|
||||
+ [0x03] = "Intel QPI physical layer detected latency buffer rollover",
|
||||
+ [0x10] = "Intel QPI link layer detected control error from R3QPI",
|
||||
+ [0x11] = "Rx entered LLR abort state on CRC error",
|
||||
+ [0x12] = "Unsupported or undefined packet",
|
||||
+ [0x13] = "Intel QPI link layer control error",
|
||||
+ [0x15] = "RBT used un-initialized value",
|
||||
+ [0x20] = "Intel QPI physical layer detected a QPI in-band reset but aborted initialization",
|
||||
+ [0x21] = "Link failover data self healing",
|
||||
+ [0x22] = "Phy detected in-band reset (no width change)",
|
||||
+ [0x23] = "Link failover clock failover",
|
||||
+ [0x30] = "Rx detected CRC error - successful LLR after Phy re-init",
|
||||
+ [0x31] = "Rx detected CRC error - successful LLR wihout Phy re-init",
|
||||
+};
|
||||
+
|
||||
+static struct field qpi_mc[] = {
|
||||
+ FIELD(16, qpi),
|
||||
+ {}
|
||||
+};
|
||||
+
|
||||
+/* See IA32 SDM Vol3B Table 16-22 */
|
||||
+
|
||||
+static struct field memctrl_mc9[] = {
|
||||
+ SBITFIELD(16, "DDR3 address parity error"),
|
||||
+ SBITFIELD(17, "Uncorrected HA write data error"),
|
||||
+ SBITFIELD(18, "Uncorrected HA data byte enable error"),
|
||||
+ SBITFIELD(19, "Corrected patrol scrub error"),
|
||||
+ SBITFIELD(20, "Uncorrected patrol scrub error"),
|
||||
+ SBITFIELD(21, "Corrected spare error"),
|
||||
+ SBITFIELD(22, "Uncorrected spare error"),
|
||||
+ SBITFIELD(23, "Corrected memory read error"),
|
||||
+ SBITFIELD(24, "iMC write data buffer parity error"),
|
||||
+ SBITFIELD(25, "DDR4 command address parity error"),
|
||||
+ {}
|
||||
+};
|
||||
+
|
||||
+void hsw_decode_model(struct ras_events *ras, struct mce_event *e)
|
||||
+{
|
||||
+ uint64_t status = e->status;
|
||||
+ uint32_t mca = status & 0xffff;
|
||||
+ unsigned rank0 = -1, rank1 = -1, chan;
|
||||
+
|
||||
+ switch (e->bank) {
|
||||
+ case 4:
|
||||
+ switch (EXTRACT(status, 0, 15) & ~(1ull << 12)) {
|
||||
+ case 0x402: case 0x403:
|
||||
+ /* Internal errors */
|
||||
+ break;
|
||||
+ case 0x406:
|
||||
+ /* Intel TXT errors */
|
||||
+ break;
|
||||
+ case 0x407:
|
||||
+ /* Other UBOX Internal errors */
|
||||
+ break;
|
||||
+ }
|
||||
+ if (EXTRACT(status, 16, 19))
|
||||
+ /* PCU internal error */
|
||||
+ decode_bitfield(e, status, pcu_mc4);
|
||||
+ break;
|
||||
+ case 5:
|
||||
+ case 20:
|
||||
+ case 21:
|
||||
+ decode_bitfield(e, status, qpi_mc);
|
||||
+ break;
|
||||
+ case 9: case 10: case 11: case 12:
|
||||
+ case 13: case 14: case 15: case 16:
|
||||
+ decode_bitfield(e, status, memctrl_mc9);
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ /*
|
||||
+ * Memory error specific code. Returns if the error is not a MC one
|
||||
+ */
|
||||
+
|
||||
+ /* Check if the error is at the memory controller */
|
||||
+ if ((mca >> 7) != 1)
|
||||
+ return;
|
||||
+
|
||||
+ /* Ignore unless this is an corrected extended error from an iMC bank */
|
||||
+ if (e->bank < 9 || e->bank > 16 || (status & MCI_STATUS_UC) ||
|
||||
+ !test_prefix(7, status & 0xefff))
|
||||
+ return;
|
||||
+
|
||||
+ /*
|
||||
+ * Parse the reported channel and ranks
|
||||
+ */
|
||||
+
|
||||
+ chan = EXTRACT(status, 0, 3);
|
||||
+ if (chan == 0xf)
|
||||
+ return;
|
||||
+
|
||||
+ mce_snprintf(e->mc_location, "memory_channel=%d", chan);
|
||||
+
|
||||
+ if (EXTRACT(e->misc, 62, 62))
|
||||
+ rank0 = EXTRACT(e->misc, 46, 50);
|
||||
+
|
||||
+ if (EXTRACT(e->misc, 63, 63))
|
||||
+ rank1 = EXTRACT(e->misc, 51, 55);
|
||||
+
|
||||
+ /*
|
||||
+ * FIXME: The conversion from rank to dimm requires to parse the
|
||||
+ * DMI tables and call failrank2dimm().
|
||||
+ */
|
||||
+ if (rank0 >= 0 && rank1 >= 0)
|
||||
+ mce_snprintf(e->mc_location, "ranks=%d and %d",
|
||||
+ rank0, rank1);
|
||||
+ else if (rank0 >= 0)
|
||||
+ mce_snprintf(e->mc_location, "rank=%d", rank0);
|
||||
+ else
|
||||
+ mce_snprintf(e->mc_location, "rank=%d", rank1);
|
||||
+}
|
||||
+
|
||||
diff --git a/mce-intel.c b/mce-intel.c
|
||||
index 427b98e..1546a1d 100644
|
||||
--- a/mce-intel.c
|
||||
+++ b/mce-intel.c
|
||||
@@ -392,6 +392,8 @@ int parse_intel_event(struct ras_events *ras, struct mce_event *e)
|
||||
case CPU_IVY_BRIDGE_EPEX:
|
||||
ivb_decode_model(ras, e);
|
||||
break;
|
||||
+ case CPU_HASWELL_EPEX:
|
||||
+ hsw_decode_model(ras, e);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
diff --git a/ras-mce-handler.c b/ras-mce-handler.c
|
||||
index a1d0b5d..d2de096 100644
|
||||
--- a/ras-mce-handler.c
|
||||
+++ b/ras-mce-handler.c
|
||||
@@ -47,6 +47,8 @@ static char *cputype_name[] = {
|
||||
[CPU_SANDY_BRIDGE_EP] = "Sandy Bridge EP", /* Fill in better name */
|
||||
[CPU_IVY_BRIDGE] = "Ivy Bridge", /* Fill in better name */
|
||||
[CPU_IVY_BRIDGE_EPEX] = "Ivy Bridge EP/EX", /* Fill in better name */
|
||||
+ [CPU_HASWELL] = "Haswell",
|
||||
+ [CPU_HASWELL_EPEX] = "Intel Xeon v3 (Haswell) EP/EX",
|
||||
};
|
||||
|
||||
static enum cputype select_intel_cputype(struct ras_events *ras)
|
||||
@@ -81,6 +83,12 @@ static enum cputype select_intel_cputype(struct ras_events *ras)
|
||||
return CPU_IVY_BRIDGE;
|
||||
else if (mce->model == 0x3e)
|
||||
return CPU_IVY_BRIDGE_EPEX;
|
||||
+ else if (mce->model == 0x3c || mce->model == 0x45 ||
|
||||
+ mce->model == 0x46)
|
||||
+ return CPU_HASWELL;
|
||||
+ else if (mce->model == 0x3f)
|
||||
+ return CPU_HASWELL_EPEX;
|
||||
+
|
||||
if (mce->model > 0x1a) {
|
||||
log(ALL, LOG_INFO,
|
||||
"Family 6 Model %x CPU: only decoding architectural errors\n",
|
||||
diff --git a/ras-mce-handler.h b/ras-mce-handler.h
|
||||
index 80e9769..b8b3d4f 100644
|
||||
--- a/ras-mce-handler.h
|
||||
+++ b/ras-mce-handler.h
|
||||
@@ -42,6 +42,8 @@ enum cputype {
|
||||
CPU_SANDY_BRIDGE_EP,
|
||||
CPU_IVY_BRIDGE,
|
||||
CPU_IVY_BRIDGE_EPEX,
|
||||
+ CPU_HASWELL,
|
||||
+ CPU_HASWELL_EPEX,
|
||||
};
|
||||
|
||||
struct mce_event {
|
||||
@@ -114,6 +116,7 @@ void xeon75xx_decode_model(struct mce_event *e);
|
||||
void dunnington_decode_model(struct mce_event *e);
|
||||
void snb_decode_model(struct ras_events *ras, struct mce_event *e);
|
||||
void ivb_decode_model(struct ras_events *ras, struct mce_event *e);
|
||||
+void hsw_decode_model(struct ras_events *ras, struct mce_event *e);
|
||||
void tulsa_decode_model(struct mce_event *e);
|
||||
|
||||
/* Software defined banks */
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,40 @@
|
|||
From 85a2ead8f2d6e380be8d8234ba752a558e8027ed Mon Sep 17 00:00:00 2001
|
||||
From: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Date: Mon, 18 May 2015 14:19:29 -0300
|
||||
Subject: [PATCH 02/13] rasdaemon: decode new simple error code number 6
|
||||
|
||||
This patch was based on fa313dd0144596dfa140bd66805367250d6eae9b
|
||||
(mcelog)
|
||||
|
||||
mcelog: Decode new simple error code number 6
|
||||
|
||||
Edition 050 of the Intel SDM released in late February 2014
|
||||
includes a new simple error code in "Table 15-8. IA32_MCi_Status
|
||||
[15:0] Simple Error Code Encoding". Code 6 (0000 0000 0000 0110)
|
||||
has been allocated for the reporting of cases where the BIOS SMM
|
||||
code attempts to execute code outside of the protected SMRR area.
|
||||
|
||||
Signed-off-by: Tony Luck <tony.luck@intel.com>
|
||||
Signed-off-by: Andi Kleen <ak@linux.intel.com>
|
||||
|
||||
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
|
||||
---
|
||||
mce-intel.c | 1 +
|
||||
1 file changed, 1 insertion(+)
|
||||
|
||||
diff --git a/mce-intel.c b/mce-intel.c
|
||||
index 1546a1d..69ea00e 100644
|
||||
--- a/mce-intel.c
|
||||
+++ b/mce-intel.c
|
||||
@@ -115,6 +115,7 @@ static char *mca_msg[] = {
|
||||
[3] = "External error",
|
||||
[4] = "FRC error",
|
||||
[5] = "Internal parity error",
|
||||
+ [6] = "SMM Handler Code Access Violation",
|
||||
};
|
||||
|
||||
static char *tracking_msg[] = {
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,38 @@
|
|||
From 064a74b1202e529b5e16a54218fc17974906af2d Mon Sep 17 00:00:00 2001
|
||||
From: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Date: Mon, 18 May 2015 14:19:30 -0300
|
||||
Subject: [PATCH 03/13] rasdaemon: Add missing entry to Ivy Bridge memory
|
||||
controller decode table
|
||||
|
||||
This patch is based on 2577aeb662374cb87169ee675b2e37c06f1aed99 (mcelog)
|
||||
|
||||
mcelog: Add missing entry to Ivy Bridge memory controller decode table
|
||||
|
||||
September 2013 edition of the software developer manual added an
|
||||
entry that had been inadvertently omitted from earlier editions.
|
||||
Add the 0x80 entry for "Corrected memory read error".
|
||||
|
||||
Signed-off-by: Tony Luck <tony.luck@intel.com>
|
||||
Signed-off-by: Andi Kleen <ak@linux.intel.com>
|
||||
|
||||
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
|
||||
---
|
||||
mce-intel-ivb.c | 1 +
|
||||
1 file changed, 1 insertion(+)
|
||||
|
||||
diff --git a/mce-intel-ivb.c b/mce-intel-ivb.c
|
||||
index f2a133a..0c5bebc 100644
|
||||
--- a/mce-intel-ivb.c
|
||||
+++ b/mce-intel-ivb.c
|
||||
@@ -76,6 +76,7 @@ static char *memctrl_1[] = {
|
||||
[0x010] = "Uncorrected patrol scrub error",
|
||||
[0x020] = "Corrected spare error",
|
||||
[0x040] = "Uncorrected spare error",
|
||||
+ [0x080] = "Corrected memory read error",
|
||||
[0x100] = "iMC, WDB, parity errors",
|
||||
};
|
||||
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,38 @@
|
|||
From 66021c20c92b5df16b5c8dae4fb664788fa40376 Mon Sep 17 00:00:00 2001
|
||||
From: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Date: Mon, 18 May 2015 14:19:31 -0300
|
||||
Subject: [PATCH 04/13] rasdaemon: Identify Ivy Bridge properly
|
||||
|
||||
This patch is based on b29cc4d615cead87cbc163ada0645b10c5b1217d (mcelog)
|
||||
mcelog: Identify Ivy Bridge properly
|
||||
|
||||
Uniquely identify Ivy Bridge even though the machine checks are the same
|
||||
for Sandy Bridge and Ivy Bridge. This makes the output for the processor
|
||||
display "Ivy Bridge".
|
||||
|
||||
Signed-off-by: Prarit Bhargava <prarit@redhat.com>
|
||||
Cc: tony.luck@intel.com
|
||||
Signed-off-by: Andi Kleen <ak@linux.intel.com>
|
||||
|
||||
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
|
||||
---
|
||||
ras-mce-handler.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/ras-mce-handler.c b/ras-mce-handler.c
|
||||
index d2de096..07e298f 100644
|
||||
--- a/ras-mce-handler.c
|
||||
+++ b/ras-mce-handler.c
|
||||
@@ -75,7 +75,7 @@ static enum cputype select_intel_cputype(struct ras_events *ras)
|
||||
return CPU_NEHALEM;
|
||||
else if (mce->model == 0x2e || mce->model == 0x2f)
|
||||
return CPU_XEON75XX;
|
||||
- else if (mce->model == 0x2a || mce->model == 0x3a)
|
||||
+ else if (mce->model == 0x2a)
|
||||
return CPU_SANDY_BRIDGE;
|
||||
else if (mce->model == 0x2d)
|
||||
return CPU_SANDY_BRIDGE_EP;
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,52 @@
|
|||
From a9810094cf838e03102f95333db7ddfe810ccabd Mon Sep 17 00:00:00 2001
|
||||
From: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Date: Mon, 18 May 2015 14:19:32 -0300
|
||||
Subject: [PATCH 05/13] rasdaemon: add support for Broadwell
|
||||
|
||||
Only basic support for now.
|
||||
|
||||
Based on mcelog code.
|
||||
|
||||
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
|
||||
---
|
||||
ras-mce-handler.c | 3 +++
|
||||
ras-mce-handler.h | 1 +
|
||||
2 files changed, 4 insertions(+)
|
||||
|
||||
diff --git a/ras-mce-handler.c b/ras-mce-handler.c
|
||||
index 07e298f..e059b92 100644
|
||||
--- a/ras-mce-handler.c
|
||||
+++ b/ras-mce-handler.c
|
||||
@@ -49,6 +49,7 @@ static char *cputype_name[] = {
|
||||
[CPU_IVY_BRIDGE_EPEX] = "Ivy Bridge EP/EX", /* Fill in better name */
|
||||
[CPU_HASWELL] = "Haswell",
|
||||
[CPU_HASWELL_EPEX] = "Intel Xeon v3 (Haswell) EP/EX",
|
||||
+ [CPU_BROADWELL] = "Broadwell",
|
||||
};
|
||||
|
||||
static enum cputype select_intel_cputype(struct ras_events *ras)
|
||||
@@ -88,6 +89,8 @@ static enum cputype select_intel_cputype(struct ras_events *ras)
|
||||
return CPU_HASWELL;
|
||||
else if (mce->model == 0x3f)
|
||||
return CPU_HASWELL_EPEX;
|
||||
+ else if (mce->model == 0x3d)
|
||||
+ return CPU_BROADWELL;
|
||||
|
||||
if (mce->model > 0x1a) {
|
||||
log(ALL, LOG_INFO,
|
||||
diff --git a/ras-mce-handler.h b/ras-mce-handler.h
|
||||
index b8b3d4f..ba01f55 100644
|
||||
--- a/ras-mce-handler.h
|
||||
+++ b/ras-mce-handler.h
|
||||
@@ -44,6 +44,7 @@ enum cputype {
|
||||
CPU_IVY_BRIDGE_EPEX,
|
||||
CPU_HASWELL,
|
||||
CPU_HASWELL_EPEX,
|
||||
+ CPU_BROADWELL,
|
||||
};
|
||||
|
||||
struct mce_event {
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,50 @@
|
|||
From bd6c78d89f4e934fafb1136a15efc0d6df4635ed Mon Sep 17 00:00:00 2001
|
||||
From: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Date: Mon, 18 May 2015 14:19:33 -0300
|
||||
Subject: [PATCH 06/13] rasdaemon: add support for Knights Landing
|
||||
|
||||
Patch based on mcelog.
|
||||
|
||||
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
|
||||
---
|
||||
ras-mce-handler.c | 3 +++
|
||||
ras-mce-handler.h | 1 +
|
||||
2 files changed, 4 insertions(+)
|
||||
|
||||
diff --git a/ras-mce-handler.c b/ras-mce-handler.c
|
||||
index e059b92..63f14fd 100644
|
||||
--- a/ras-mce-handler.c
|
||||
+++ b/ras-mce-handler.c
|
||||
@@ -50,6 +50,7 @@ static char *cputype_name[] = {
|
||||
[CPU_HASWELL] = "Haswell",
|
||||
[CPU_HASWELL_EPEX] = "Intel Xeon v3 (Haswell) EP/EX",
|
||||
[CPU_BROADWELL] = "Broadwell",
|
||||
+ [CPU_KNIGHTS_LANDING] = "Knights Landing",
|
||||
};
|
||||
|
||||
static enum cputype select_intel_cputype(struct ras_events *ras)
|
||||
@@ -91,6 +92,8 @@ static enum cputype select_intel_cputype(struct ras_events *ras)
|
||||
return CPU_HASWELL_EPEX;
|
||||
else if (mce->model == 0x3d)
|
||||
return CPU_BROADWELL;
|
||||
+ else if (mce->model == 0x57)
|
||||
+ return CPU_KNIGHTS_LANDING;
|
||||
|
||||
if (mce->model > 0x1a) {
|
||||
log(ALL, LOG_INFO,
|
||||
diff --git a/ras-mce-handler.h b/ras-mce-handler.h
|
||||
index ba01f55..28aad00 100644
|
||||
--- a/ras-mce-handler.h
|
||||
+++ b/ras-mce-handler.h
|
||||
@@ -45,6 +45,7 @@ enum cputype {
|
||||
CPU_HASWELL,
|
||||
CPU_HASWELL_EPEX,
|
||||
CPU_BROADWELL,
|
||||
+ CPU_KNIGHTS_LANDING,
|
||||
};
|
||||
|
||||
struct mce_event {
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
From 5dd11c60b84294a3c6ce5ccb0db726b3dce35b10 Mon Sep 17 00:00:00 2001
|
||||
From: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com>
|
||||
Date: Tue, 26 May 2015 11:59:36 -0300
|
||||
Subject: [PATCH 07/13] rasdaemon: properly print message strings in
|
||||
decode_bitfield()
|
||||
|
||||
Fix decode_bitfield() so that it does print message strings from the struct
|
||||
field table.
|
||||
|
||||
Signed-off-by: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com>
|
||||
Signed-off-by: Aristeu Rozanski <aris@redhat.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
|
||||
---
|
||||
bitfield.c | 3 ++-
|
||||
1 file changed, 2 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/bitfield.c b/bitfield.c
|
||||
index 1dda30d..d6931c9 100644
|
||||
--- a/bitfield.c
|
||||
+++ b/bitfield.c
|
||||
@@ -84,7 +84,8 @@ void decode_bitfield(struct mce_event *e, uint64_t status,
|
||||
continue;
|
||||
mce_snprintf(e->error_msg, "<%u:%llx>",
|
||||
f->start_bit, (long long)v);
|
||||
- }
|
||||
+ } else
|
||||
+ mce_snprintf(e->error_msg, "%s", s);
|
||||
}
|
||||
}
|
||||
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
From abf36efe909c4022260cb4016c54d1ec3ec18cb8 Mon Sep 17 00:00:00 2001
|
||||
From: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com>
|
||||
Date: Tue, 26 May 2015 11:59:37 -0300
|
||||
Subject: [PATCH 08/13] rasdaemon: add missing semicolon in hsw_decode_model()
|
||||
|
||||
hsw_decode_model() tries to skip decode_bitfield() if IA32_MC4_STATUS indicates
|
||||
some internal errors. Unfortunately, it behaves opposite to the intention
|
||||
because a semicolon is missing.
|
||||
|
||||
Signed-off-by: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com>
|
||||
Signed-off-by: Aristeu Rozanski <aris@redhat.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
|
||||
---
|
||||
mce-intel-haswell.c | 1 +
|
||||
1 file changed, 1 insertion(+)
|
||||
|
||||
diff --git a/mce-intel-haswell.c b/mce-intel-haswell.c
|
||||
index c32704c..3ac12f2 100644
|
||||
--- a/mce-intel-haswell.c
|
||||
+++ b/mce-intel-haswell.c
|
||||
@@ -137,6 +137,7 @@ void hsw_decode_model(struct ras_events *ras, struct mce_event *e)
|
||||
}
|
||||
if (EXTRACT(status, 16, 19))
|
||||
/* PCU internal error */
|
||||
+ ;
|
||||
decode_bitfield(e, status, pcu_mc4);
|
||||
break;
|
||||
case 5:
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,43 @@
|
|||
From f892a390c55c0b350c57cda9d166a9cf331aa36f Mon Sep 17 00:00:00 2001
|
||||
From: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com>
|
||||
Date: Tue, 26 May 2015 11:59:38 -0300
|
||||
Subject: [PATCH 09/13] rasdaemon: enable IMC status usage for Haswell-E
|
||||
|
||||
Enable IMC status bank for Haswell-E, as described in Intel SDM Vol.3C
|
||||
Table 35-27.
|
||||
|
||||
Signed-off-by: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com>
|
||||
Signed-off-by: Aristeu Rozanski <aris@redhat.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
|
||||
---
|
||||
mce-intel.c | 1 +
|
||||
ras-mce-handler.c | 1 +
|
||||
2 files changed, 2 insertions(+)
|
||||
|
||||
diff --git a/mce-intel.c b/mce-intel.c
|
||||
index 69ea00e..3684602 100644
|
||||
--- a/mce-intel.c
|
||||
+++ b/mce-intel.c
|
||||
@@ -457,6 +457,7 @@ int set_intel_imc_log(enum cputype cputype, unsigned ncpus)
|
||||
switch (cputype) {
|
||||
case CPU_SANDY_BRIDGE_EP:
|
||||
case CPU_IVY_BRIDGE_EPEX:
|
||||
+ case CPU_HASWELL_EPEX:
|
||||
msr = 0x17f; /* MSR_ERROR_CONTROL */
|
||||
bit = 0x2; /* MemError Log Enable */
|
||||
break;
|
||||
diff --git a/ras-mce-handler.c b/ras-mce-handler.c
|
||||
index 63f14fd..fb6db8a 100644
|
||||
--- a/ras-mce-handler.c
|
||||
+++ b/ras-mce-handler.c
|
||||
@@ -221,6 +221,7 @@ int register_mce_handler(struct ras_events *ras, unsigned ncpus)
|
||||
switch (mce->cputype) {
|
||||
case CPU_SANDY_BRIDGE_EP:
|
||||
case CPU_IVY_BRIDGE_EPEX:
|
||||
+ case CPU_HASWELL_EPEX:
|
||||
set_intel_imc_log(mce->cputype, ncpus);
|
||||
default:
|
||||
break;
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,54 @@
|
|||
From 56913e2f2a5a6ddf8ab684c8d528e9ef1d55cfba Mon Sep 17 00:00:00 2001
|
||||
From: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com>
|
||||
Date: Tue, 26 May 2015 11:59:39 -0300
|
||||
Subject: [PATCH 10/13] rasdaemon: make sure the error is valid before handling
|
||||
ranks
|
||||
|
||||
Fix "rank" handling according to the Bit 63 description in Intel SDM Vol.3C
|
||||
Table 16-23, that says "... Use this information only after there is valid
|
||||
first error info indicated by bit 62".
|
||||
Also fix invalid comparisons of unsigned variables "rank0" and "rank1".
|
||||
|
||||
Signed-off-by: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com>
|
||||
Signed-off-by: Aristeu Rozanski <aris@redhat.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
|
||||
---
|
||||
mce-intel-haswell.c | 14 ++++++--------
|
||||
1 file changed, 6 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/mce-intel-haswell.c b/mce-intel-haswell.c
|
||||
index 3ac12f2..0a817bf 100644
|
||||
--- a/mce-intel-haswell.c
|
||||
+++ b/mce-intel-haswell.c
|
||||
@@ -174,22 +174,20 @@ void hsw_decode_model(struct ras_events *ras, struct mce_event *e)
|
||||
|
||||
mce_snprintf(e->mc_location, "memory_channel=%d", chan);
|
||||
|
||||
- if (EXTRACT(e->misc, 62, 62))
|
||||
+ if (EXTRACT(e->misc, 62, 62)) {
|
||||
rank0 = EXTRACT(e->misc, 46, 50);
|
||||
-
|
||||
- if (EXTRACT(e->misc, 63, 63))
|
||||
- rank1 = EXTRACT(e->misc, 51, 55);
|
||||
+ if (EXTRACT(e->misc, 63, 63))
|
||||
+ rank1 = EXTRACT(e->misc, 51, 55);
|
||||
+ }
|
||||
|
||||
/*
|
||||
* FIXME: The conversion from rank to dimm requires to parse the
|
||||
* DMI tables and call failrank2dimm().
|
||||
*/
|
||||
- if (rank0 >= 0 && rank1 >= 0)
|
||||
+ if (rank0 != -1 && rank1 != -1)
|
||||
mce_snprintf(e->mc_location, "ranks=%d and %d",
|
||||
rank0, rank1);
|
||||
- else if (rank0 >= 0)
|
||||
+ else if (rank0 != -1)
|
||||
mce_snprintf(e->mc_location, "rank=%d", rank0);
|
||||
- else
|
||||
- mce_snprintf(e->mc_location, "rank=%d", rank1);
|
||||
}
|
||||
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,261 @@
|
|||
From 3a38f8e66a2aa5c477cea152e1acc9a781834b83 Mon Sep 17 00:00:00 2001
|
||||
From: Aristeu Rozanski <aris@redhat.com>
|
||||
Date: Mon, 1 Jun 2015 17:04:00 -0300
|
||||
Subject: [PATCH 11/13] rasdaemon: add support to match the machine by system's
|
||||
product name
|
||||
|
||||
In some cases the motherboard names will change but the mapping won't
|
||||
across a line of products. This patch adds support for "Product:" to be
|
||||
specified in the label files instead of Model:.
|
||||
|
||||
An example:
|
||||
Vendor: Dell Inc.
|
||||
Product: PowerEdge R610
|
||||
DIMM_A1: 0.0.0; DIMM_A2: 0.0.1; DIMM_A3: 0.0.2;
|
||||
DIMM_A4: 0.1.0; DIMM_A5: 0.1.1; DIMM_A6: 0.1.2;
|
||||
|
||||
DIMM_B1: 1.0.0; DIMM_B2: 1.0.1; DIMM_B3: 1.0.2;
|
||||
DIMM_B4: 1.1.0; DIMM_B5: 1.1.1; DIMM_B6: 1.1.2;
|
||||
|
||||
Would match all 'PowerEdge R610' machines.
|
||||
|
||||
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
|
||||
---
|
||||
util/ras-mc-ctl.in | 127 +++++++++++++++++++++++++++++++++++++++++------------
|
||||
1 file changed, 98 insertions(+), 29 deletions(-)
|
||||
|
||||
diff --git a/util/ras-mc-ctl.in b/util/ras-mc-ctl.in
|
||||
index 7b6d798..6350f62 100755
|
||||
--- a/util/ras-mc-ctl.in
|
||||
+++ b/util/ras-mc-ctl.in
|
||||
@@ -288,8 +288,27 @@ sub parse_dimm_nodes
|
||||
}
|
||||
}
|
||||
|
||||
+sub guess_product {
|
||||
+ my $pvendor = undef;
|
||||
+ my $pname = undef;
|
||||
+
|
||||
+ if (open (VENDOR, "/sys/class/dmi/id/product_vendor")) {
|
||||
+ $pvendor = <VENDOR>;
|
||||
+ close VENDOR;
|
||||
+ chomp($pvendor);
|
||||
+ }
|
||||
+ if (open (NAME, "/sys/class/dmi/id/product_name")) {
|
||||
+ $pname = <NAME>;
|
||||
+ close NAME;
|
||||
+ chomp($pname);
|
||||
+ }
|
||||
+
|
||||
+ return ($pvendor, $pname);
|
||||
+}
|
||||
+
|
||||
sub get_mainboard_info {
|
||||
my ($vendor, $model);
|
||||
+ my ($pvendor, $pname);
|
||||
|
||||
if ($conf{opt}{mainboard} && $conf{opt}{mainboard} ne "report") {
|
||||
($vendor, $model) = split (/[: ]/, $conf{opt}{mainboard}, 2);
|
||||
@@ -301,6 +320,15 @@ sub get_mainboard_info {
|
||||
|
||||
$conf{mainboard}{vendor} = $vendor;
|
||||
$conf{mainboard}{model} = $model;
|
||||
+
|
||||
+ ($pvendor, $pname) = guess_product ();
|
||||
+ # since product vendor is rare, use mainboard's vendor
|
||||
+ if ($pvendor) {
|
||||
+ $conf{mainboard}{product_vendor} = $pvendor;
|
||||
+ } else {
|
||||
+ $conf{mainboard}{product_vendor} = $vendor;
|
||||
+ }
|
||||
+ $conf{mainboard}{product_name} = $pname if $pname;
|
||||
}
|
||||
|
||||
sub guess_vendor_model_dmidecode {
|
||||
@@ -449,10 +477,11 @@ sub guess_dimm_label {
|
||||
|
||||
sub parse_dimm_labels_file
|
||||
{
|
||||
- my ($lh, $num_layers, $file) = (@_);
|
||||
+ my ($lh, $num_layers, $lh_prod, $num_layers_prod, $file) = (@_);
|
||||
my $line = -1;
|
||||
my $vendor = "";
|
||||
my @models = ();
|
||||
+ my @products = ();
|
||||
my $num;
|
||||
|
||||
open (LABELS, "$file")
|
||||
@@ -469,12 +498,21 @@ sub parse_dimm_labels_file
|
||||
if (/vendor\s*:\s*(.*\S)\s*/i) {
|
||||
$vendor = lc $1;
|
||||
@models = ();
|
||||
+ @products = ();
|
||||
$num = 0;
|
||||
next;
|
||||
}
|
||||
if (/(model|board)\s*:\s*(.*)$/i) {
|
||||
!$vendor && die "$file: line $line: MB model without vendor\n";
|
||||
@models = grep { s/\s*(.*)\s*$/$1/ } split(/[,;]+/, $2);
|
||||
+ @products = ();
|
||||
+ $num = 0;
|
||||
+ next;
|
||||
+ }
|
||||
+ if (/(product)\s*:\s*(.*)$/i) {
|
||||
+ !$vendor && die "$file: line $line: product without vendor\n";
|
||||
+ @models = ();
|
||||
+ @products = grep { s/\s*(.*)\s*$/$1/ } split(/[,;]+/, $2);
|
||||
$num = 0;
|
||||
next;
|
||||
}
|
||||
@@ -513,10 +551,13 @@ sub parse_dimm_labels_file
|
||||
}
|
||||
map { $lh->{$vendor}{lc $_}{$mc}{$top}{$mid}{$low} = $label }
|
||||
@models;
|
||||
+ map { $lh_prod->{$vendor}{lc $_}{$mc}{$top}{$mid}{$low} = $label }
|
||||
+ @products;
|
||||
}
|
||||
if (!$num) {
|
||||
$num = $n;
|
||||
map { $num_layers->{$vendor}{lc $_} = $num } @models;
|
||||
+ map { $num_layers_prod->{$vendor}{lc $_} = $num } @products;
|
||||
} elsif ($num != $n) {
|
||||
die ("Error: Inconsistent number of layers at label db \"$file\"\n");
|
||||
}
|
||||
@@ -531,6 +572,8 @@ sub parse_dimm_labels
|
||||
{
|
||||
my %labels = ();
|
||||
my %num_layers = ();
|
||||
+ my %labels_prod = ();
|
||||
+ my %num_layers_prod = ();
|
||||
|
||||
#
|
||||
# Accrue all DIMM labels from the labels.db file, as
|
||||
@@ -538,10 +581,10 @@ sub parse_dimm_labels
|
||||
#
|
||||
for my $file ($conf{labeldb}, <$conf{labeldir}/*>) {
|
||||
next unless -r $file;
|
||||
- parse_dimm_labels_file (\%labels, \%num_layers, $file);
|
||||
+ parse_dimm_labels_file (\%labels, \%num_layers, \%labels_prod, \%num_layers_prod, $file);
|
||||
}
|
||||
|
||||
- return (\%labels, \%num_layers);
|
||||
+ return (\%labels, \%num_layers, \%labels_prod, \%num_layers_prod);
|
||||
}
|
||||
|
||||
sub read_dimm_label
|
||||
@@ -598,25 +641,9 @@ sub get_dimm_label_node
|
||||
}
|
||||
|
||||
|
||||
-sub print_dimm_labels
|
||||
+sub _print_dimm_labels
|
||||
{
|
||||
- my $fh = shift || *STDOUT;
|
||||
- my ($lref, $num_layers) = parse_dimm_labels ();
|
||||
- my $vendor = lc $conf{mainboard}{vendor};
|
||||
- my $model = lc $conf{mainboard}{model};
|
||||
- my $format = "%-35s %-20s %-20s\n";
|
||||
-
|
||||
- if (!exists $$lref{$vendor}{$model}) {
|
||||
- log_error ("No dimm labels for $conf{mainboard}{vendor} " .
|
||||
- "model $conf{mainboard}{model}\n");
|
||||
- return;
|
||||
- }
|
||||
-
|
||||
- my $sysfs_dir = "/sys/devices/system/edac/mc";
|
||||
-
|
||||
- find({wanted => \&parse_dimm_nodes, no_chdir => 1}, $sysfs_dir);
|
||||
-
|
||||
- printf $fh $format, "LOCATION", "CONFIGURED LABEL", "SYSFS CONTENTS";
|
||||
+ my ($lref, $num_layers, $vendor, $model, $fh, $format) = @_;
|
||||
|
||||
for my $mc (sort keys %{$$lref{$vendor}{$model}}) {
|
||||
for my $top (sort keys %{$$lref{$vendor}{$model}{$mc}}) {
|
||||
@@ -631,26 +658,40 @@ sub print_dimm_labels
|
||||
}
|
||||
}
|
||||
print $fh "\n";
|
||||
-
|
||||
}
|
||||
|
||||
-sub register_dimm_labels
|
||||
+sub print_dimm_labels
|
||||
{
|
||||
- my ($lref, $num_layers) = parse_dimm_labels ();
|
||||
+ my $fh = shift || *STDOUT;
|
||||
+ my ($lref, $num_layers, $lref_prod, $num_layers_prod) = parse_dimm_labels ();
|
||||
my $vendor = lc $conf{mainboard}{vendor};
|
||||
my $model = lc $conf{mainboard}{model};
|
||||
- my $sysfs = "/sys/devices/system/edac/mc";
|
||||
+ my $pvendor = lc $conf{mainboard}{product_vendor};
|
||||
+ my $pname = lc $conf{mainboard}{product_name};
|
||||
+ my $format = "%-35s %-20s %-20s\n";
|
||||
|
||||
- if (!exists $$lref{$vendor}{$model}) {
|
||||
+ if (!exists $$lref{$vendor}{$model} && !exists $$lref_prod{$pvendor}{$pname}) {
|
||||
log_error ("No dimm labels for $conf{mainboard}{vendor} " .
|
||||
- "model $conf{mainboard}{model}\n");
|
||||
- return 0;
|
||||
+ "model $conf{mainboard}{model}\n");
|
||||
+ return;
|
||||
}
|
||||
+
|
||||
my $sysfs_dir = "/sys/devices/system/edac/mc";
|
||||
|
||||
find({wanted => \&parse_dimm_nodes, no_chdir => 1}, $sysfs_dir);
|
||||
|
||||
- select (undef, undef, undef, $conf{opt}{delay});
|
||||
+ printf $fh $format, "LOCATION", "CONFIGURED LABEL", "SYSFS CONTENTS";
|
||||
+
|
||||
+ if (exists $$lref{$vendor}{$model}) {
|
||||
+ _print_dimm_labels($lref, $num_layers, $vendor, $model, $fh, $format);
|
||||
+ } elsif (exists $$lref_prod{$pvendor}{$pname}) {
|
||||
+ _print_dimm_labels($lref_prod, $num_layers_prod, $pvendor, $pname, $fh, $format);
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+sub write_dimm_labels
|
||||
+{
|
||||
+ my ($lref, $num_layers, $vendor, $model) = @_;
|
||||
|
||||
for my $mc (sort keys %{$$lref{$vendor}{$model}}) {
|
||||
for my $top (sort keys %{$$lref{$vendor}{$model}{$mc}}) {
|
||||
@@ -675,6 +716,34 @@ sub register_dimm_labels
|
||||
}
|
||||
}
|
||||
}
|
||||
+}
|
||||
+
|
||||
+sub register_dimm_labels
|
||||
+{
|
||||
+ my ($lref, $num_layers, $lref_prod, $num_layers_prod) = parse_dimm_labels ();
|
||||
+ my $vendor = lc $conf{mainboard}{vendor};
|
||||
+ my $model = lc $conf{mainboard}{model};
|
||||
+ my $pvendor = lc $conf{mainboard}{product_vendor};
|
||||
+ my $pname = lc $conf{mainboard}{product_name};
|
||||
+ my $sysfs = "/sys/devices/system/edac/mc";
|
||||
+
|
||||
+ if (!exists $$lref{$vendor}{$model} && !exists $$lref_prod{$pvendor}{$pname}) {
|
||||
+ log_error ("No dimm labels for $conf{mainboard}{vendor} " .
|
||||
+ "model $conf{mainboard}{model}\n");
|
||||
+ return 0;
|
||||
+ }
|
||||
+ my $sysfs_dir = "/sys/devices/system/edac/mc";
|
||||
+
|
||||
+ find({wanted => \&parse_dimm_nodes, no_chdir => 1}, $sysfs_dir);
|
||||
+
|
||||
+ select (undef, undef, undef, $conf{opt}{delay});
|
||||
+
|
||||
+ if (exists $$lref{$vendor}{$model}) {
|
||||
+ write_dimm_labels($lref, $num_layers, $vendor, $model);
|
||||
+ } else {
|
||||
+ write_dimm_labels($lref_prod, $num_layers_prod, $pvendor, $pname);
|
||||
+ }
|
||||
+
|
||||
return 1;
|
||||
}
|
||||
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,48 @@
|
|||
From a50a2ae341f8821d71a19d9a3c6ca345e1499e25 Mon Sep 17 00:00:00 2001
|
||||
From: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com>
|
||||
Date: Wed, 17 Jun 2015 07:56:57 -0300
|
||||
Subject: [PATCH 5/5] rasdaemon: add internal errors of IA32_MC4_STATUS for
|
||||
Haswell
|
||||
|
||||
Currently rasdaemon appears to purposely omit internal errors of
|
||||
IA32_MC4_STATUS for Haswell-family processors, which are described in
|
||||
Intel SDM vol3 Table 16-20. I think it's better to show these errors
|
||||
because mcelog does show them.
|
||||
|
||||
Signed-off-by: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
|
||||
---
|
||||
mce-intel-haswell.c | 11 +++++------
|
||||
1 file changed, 5 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/mce-intel-haswell.c b/mce-intel-haswell.c
|
||||
index 0a817bf..b70e399 100644
|
||||
--- a/mce-intel-haswell.c
|
||||
+++ b/mce-intel-haswell.c
|
||||
@@ -126,18 +126,17 @@ void hsw_decode_model(struct ras_events *ras, struct mce_event *e)
|
||||
case 4:
|
||||
switch (EXTRACT(status, 0, 15) & ~(1ull << 12)) {
|
||||
case 0x402: case 0x403:
|
||||
- /* Internal errors */
|
||||
+ mce_snprintf(e->mcastatus_msg, "PCU Internal Errors");
|
||||
break;
|
||||
case 0x406:
|
||||
- /* Intel TXT errors */
|
||||
+ mce_snprintf(e->mcastatus_msg, "Intel TXT Errors");
|
||||
break;
|
||||
case 0x407:
|
||||
- /* Other UBOX Internal errors */
|
||||
+ mce_snprintf(e->mcastatus_msg, "Other UBOX Internal Errors");
|
||||
break;
|
||||
}
|
||||
- if (EXTRACT(status, 16, 19))
|
||||
- /* PCU internal error */
|
||||
- ;
|
||||
+ if (EXTRACT(status, 16, 17) && !EXTRACT(status, 18, 19))
|
||||
+ mce_snprintf(e->error_msg, "PCU Internal error");
|
||||
decode_bitfield(e, status, pcu_mc4);
|
||||
break;
|
||||
case 5:
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
From 45b575b791dbd3d5660a0c08065a9fbcb6e21eb9 Mon Sep 17 00:00:00 2001
|
||||
From: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com>
|
||||
Date: Wed, 10 Jun 2015 07:29:03 -0300
|
||||
Subject: [PATCH 2/5] rasdaemon: remove a space from mcgstatus_msg
|
||||
|
||||
"ras-mc-ctl --errors" shows an unnecessary space character in the
|
||||
mcgstatus string of MCE event, like below:
|
||||
|
||||
2 2015-04-04 19:57:22 +0900 error: MC_HA_IMC_RW_BLOCK_ACK_TIMEOUT, mcg mcgstatus= 0, mci Corrected_error, mcgcap=0x07000c16, status=0x8000000067000e0b, walltime=0x555da140, cpu=0x00000001, cpuid=0x000306f3, apicid=0x00000002, bank=0x00000004
|
||||
|
||||
Let's remove it.
|
||||
|
||||
Signed-off-by: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
|
||||
---
|
||||
mce-intel.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/mce-intel.c b/mce-intel.c
|
||||
index 3503c6a..77b929b 100644
|
||||
--- a/mce-intel.c
|
||||
+++ b/mce-intel.c
|
||||
@@ -176,7 +176,7 @@ static void decode_mcg(struct mce_event *e)
|
||||
{
|
||||
uint64_t mcgstatus = e->mcgstatus;
|
||||
|
||||
- mce_snprintf(e->mcgstatus_msg, "mcgstatus= %lld",
|
||||
+ mce_snprintf(e->mcgstatus_msg, "mcgstatus=%lld",
|
||||
(long long)e->mcgstatus);
|
||||
|
||||
if (mcgstatus & MCG_STATUS_RIPV)
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,36 @@
|
|||
From 349da4c3d63ec6dceef66a405561984561d31582 Mon Sep 17 00:00:00 2001
|
||||
From: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com>
|
||||
Date: Wed, 10 Jun 2015 20:49:55 -0300
|
||||
Subject: [PATCH 3/5] rasdaemon: unnecessary comma for empty mc_location string
|
||||
|
||||
In /var/log/messages, rasdaemon sometimes prints an unnecessary
|
||||
comma ", " between mca= and cpu_type= like below:
|
||||
|
||||
Jun 9 02:44:39 localhost rasdaemon: <...>-4585 [1638893312] 1031.109000: mce_record: 2015-06-08 10:07:28 +0900 bank=3, status= 9c0000000000017a, mci=Corrected_error Error_enabled, mca=Generic CACHE Level-2 Eviction Error, , cpu_type= Intel Xeon v3 (Haswell) EP/EX, cpu= 1, socketid= 0, misc= 4004000000000080, addr= 204fffffff, mcgstatus= 0, mcgcap= 7000c16, apicid= 2
|
||||
|
||||
That's the comma for mc_location which is printed even if mc_location is
|
||||
empty due to a wrong if condition.
|
||||
|
||||
Signed-off-by: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com>
|
||||
Acked-by: Aristeu Rozanski <aris@redhat.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
|
||||
---
|
||||
ras-mce-handler.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/ras-mce-handler.c b/ras-mce-handler.c
|
||||
index fb6db8a..07252a0 100644
|
||||
--- a/ras-mce-handler.c
|
||||
+++ b/ras-mce-handler.c
|
||||
@@ -278,7 +278,7 @@ static void report_mce_event(struct ras_events *ras,
|
||||
if (*e->user_action)
|
||||
trace_seq_printf(s, " %s", e->user_action);
|
||||
|
||||
- if (e->mc_location)
|
||||
+ if (*e->mc_location)
|
||||
trace_seq_printf(s, ", %s", e->mc_location);
|
||||
|
||||
#if 0
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,57 @@
|
|||
From 9136d7422a6b53c50a920f3dd2539bf7fcd4fdf5 Mon Sep 17 00:00:00 2001
|
||||
From: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com>
|
||||
Date: Fri, 12 Jun 2015 06:35:37 -0300
|
||||
Subject: [PATCH 4/5] rasdaemon: use MCA error msg as error_msg
|
||||
|
||||
In the case of machine-checks which do not have a model-specific MCA
|
||||
error code but have an architectural code only, mce_event.error_msg
|
||||
becomes empty then you don't know what happened.
|
||||
|
||||
(snip)
|
||||
MCE records summary:
|
||||
1 errors
|
||||
^
|
||||
empty!
|
||||
|
||||
(snip)
|
||||
MCE events:
|
||||
1 2015-06-12 00:21:46 +0900 error: , mcg mcgstatus= 0, mci Corrected_error
|
||||
^
|
||||
empty!
|
||||
|
||||
Error_enabled, mcgcap=0x07000c16, status=0x9c0000000000017a, addr=0x204fffffff, misc=0x4004000000000080, walltime=0x557b0db2, cpu=0x00000001, cpuid=0x000306f3, apicid=0x00000002, bank=0x00000003
|
||||
|
||||
In such a case, let's use the content of mcastatus_msg as error_msg
|
||||
instead.
|
||||
|
||||
(snip)
|
||||
MCE records summary:
|
||||
1 Generic CACHE Level-2 Eviction Error errors
|
||||
(snip)
|
||||
MCE events:
|
||||
1 2015-06-12 02:39:04 +0900 error: Generic CACHE Level-2 Eviction Error, mcg mcgstatus= 0, mci Corrected_error Error_enabled, mcgcap=0x07000c16, status=0x9c0000000000017a, addr=0x204fffffff, misc=0x4004000000000080, walltime=0x557b1f22, cpu=0x00000001, cpuid=0x000306f3, apicid=0x00000002, bank=0x00000003
|
||||
|
||||
Signed-off-by: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com>
|
||||
Acked-by: Aristeu Rozanski <aris@redhat.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
|
||||
---
|
||||
ras-mce-handler.c | 3 +++
|
||||
1 file changed, 3 insertions(+)
|
||||
|
||||
diff --git a/ras-mce-handler.c b/ras-mce-handler.c
|
||||
index 07252a0..3976f90 100644
|
||||
--- a/ras-mce-handler.c
|
||||
+++ b/ras-mce-handler.c
|
||||
@@ -411,6 +411,9 @@ int ras_mce_event_handler(struct trace_seq *s,
|
||||
if (rc)
|
||||
return rc;
|
||||
|
||||
+ if (!*e.error_msg && *e.mcastatus_msg)
|
||||
+ mce_snprintf(e.error_msg, "%s", e.mcastatus_msg);
|
||||
+
|
||||
report_mce_event(ras, record, s, &e);
|
||||
|
||||
#ifdef HAVE_SQLITE3
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,50 @@
|
|||
From fa6260eb1304c6c829af177ab4aa1937db36fab1 Mon Sep 17 00:00:00 2001
|
||||
From: Ashok Raj <ashok.raj@intel.com>
|
||||
Date: Fri, 5 Jun 2015 13:32:47 -0300
|
||||
Subject: [PATCH 1/5] x86, rasdaemon: Add support to log Local Machine Check
|
||||
Exception (LMCE)
|
||||
|
||||
Local Machine Check Exception allows certain errors to be signaled to
|
||||
only the affected logical processor. This change captures them for
|
||||
rasdaemon.
|
||||
|
||||
log:Changes to rasdaemon to support new architectural changes to MCE
|
||||
|
||||
Changes to rasdaemon to support new architectural extensions in Intel
|
||||
CPUs.
|
||||
|
||||
Signed-off-by: Ashok Raj <ashok.raj@intel.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
|
||||
---
|
||||
mce-intel.c | 2 ++
|
||||
ras-mce-handler.h | 1 +
|
||||
2 files changed, 3 insertions(+)
|
||||
|
||||
diff --git a/mce-intel.c b/mce-intel.c
|
||||
index 3684602..3503c6a 100644
|
||||
--- a/mce-intel.c
|
||||
+++ b/mce-intel.c
|
||||
@@ -185,6 +185,8 @@ static void decode_mcg(struct mce_event *e)
|
||||
mce_snprintf(e->mcgstatus_msg, "EIPV");
|
||||
if (mcgstatus & MCG_STATUS_MCIP)
|
||||
mce_snprintf(e->mcgstatus_msg, "MCIP");
|
||||
+ if (mcgstatus & MCG_STATUS_LMCE)
|
||||
+ mce_snprintf(e->mcgstatus_msg, "LMCE");
|
||||
}
|
||||
|
||||
static void bank_name(struct mce_event *e)
|
||||
diff --git a/ras-mce-handler.h b/ras-mce-handler.h
|
||||
index 28aad00..13b8f52 100644
|
||||
--- a/ras-mce-handler.h
|
||||
+++ b/ras-mce-handler.h
|
||||
@@ -139,6 +139,7 @@ void tulsa_decode_model(struct mce_event *e);
|
||||
#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */
|
||||
#define MCG_STATUS_EIPV (1ULL<<1) /* eip points to correct instruction */
|
||||
#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */
|
||||
+#define MCG_STATUS_LMCE (1ULL<<3) /* local machine check signaled */
|
||||
|
||||
/* Those functions are defined on per-cpu vendor C files */
|
||||
int parse_intel_event(struct ras_events *ras, struct mce_event *e);
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
Based on mcelog code.
|
||||
|
||||
Signed-off-by: Seiichi Ikarashi <s.ikarashi@jp.fujitsu.com>
|
||||
|
||||
---
|
||||
ras-mce-handler.c | 3 ++-
|
||||
1 files changed, 2 insertions(+), 1 deletions(-)
|
||||
|
||||
diff --git a/ras-mce-handler.c b/ras-mce-handler.c
|
||||
index 3976f90..23f2488 100644
|
||||
--- a/ras-mce-handler.c
|
||||
+++ b/ras-mce-handler.c
|
||||
@@ -90,7 +90,8 @@ static enum cputype select_intel_cputype(struct ras_events *ras)
|
||||
return CPU_HASWELL;
|
||||
else if (mce->model == 0x3f)
|
||||
return CPU_HASWELL_EPEX;
|
||||
- else if (mce->model == 0x3d)
|
||||
+ else if (mce->model == 0x3d || mce->model == 0x4f ||
|
||||
+ mce->model == 0x56)
|
||||
return CPU_BROADWELL;
|
||||
else if (mce->model == 0x57)
|
||||
return CPU_KNIGHTS_LANDING;
|
|
@ -0,0 +1,43 @@
|
|||
From d9fe70fe7db45618f7b46b81ebee85e7a8801870 Mon Sep 17 00:00:00 2001
|
||||
From: Aristeu Rozanski <aris@redhat.com>
|
||||
Date: Mon, 10 Aug 2015 14:24:41 -0400
|
||||
Subject: [PATCH 1/5] rasdaemon: fix typos on ras-mc-ctl man page
|
||||
|
||||
Fixed two markers and two typos in the documentation.
|
||||
|
||||
Signed-off-by: Aristeu Rozanski <aris@redhat.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
|
||||
---
|
||||
man/ras-mc-ctl.8.in | 6 +++---
|
||||
1 file changed, 3 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/man/ras-mc-ctl.8.in b/man/ras-mc-ctl.8.in
|
||||
index 7441b3a..60997dd 100644
|
||||
--- a/man/ras-mc-ctl.8.in
|
||||
+++ b/man/ras-mc-ctl.8.in
|
||||
@@ -69,14 +69,14 @@ Display the configured labels for the current hardware, as
|
||||
well as the current labels registered with EDAC.
|
||||
.TP
|
||||
.BI "--guess-labels"
|
||||
-Print DMI labels, when bank locator is available at the DMI table.
|
||||
+Print DMI labels, when bank locator is available in the DMI table.
|
||||
It helps to fill the labels database at @sysconfdir@/ras/dimm_labels.d/.
|
||||
.TP
|
||||
.BI "--labeldb="DB
|
||||
Specify an alternate location for the labels database.
|
||||
.TP
|
||||
.BI "--delay="time
|
||||
-Specify a delay of \ftime\fR seconds before registering dimm labels.
|
||||
+Specify a delay of \fBtime\fR seconds before registering DIMM labels.
|
||||
Only meaninful if used together with --register-labels.
|
||||
.TP
|
||||
.BI "--layout
|
||||
@@ -121,4 +121,4 @@ back to parsing output of the \fBdmidecode\fR(8) utility. Use of this
|
||||
utility will most often require that \fBras-mc-ctl\fR be run as root.
|
||||
|
||||
.SH SEE ALSO
|
||||
-\f\fBrasdaemon\fR(1)
|
||||
+\fBrasdaemon\fR(1)
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,213 @@
|
|||
From 2d656c4ec9d5f68ac39b2a8461b0cd4f77dd7c21 Mon Sep 17 00:00:00 2001
|
||||
From: Marcin Koss <marcin.koss@intel.com>
|
||||
Date: Thu, 3 Dec 2015 15:19:47 +0100
|
||||
Subject: [PATCH 3/5] rasdaemon: Add support for Knights Landing processor
|
||||
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
|
||||
---
|
||||
Makefile.am | 3 +-
|
||||
mce-intel-knl.c | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
mce-intel.c | 5 +++
|
||||
ras-mce-handler.c | 1 +
|
||||
ras-mce-handler.h | 1 +
|
||||
5 files changed, 137 insertions(+), 1 deletion(-)
|
||||
create mode 100644 mce-intel-knl.c
|
||||
|
||||
diff --git a/Makefile.am b/Makefile.am
|
||||
index a6bf18f..a1cb02a 100644
|
||||
--- a/Makefile.am
|
||||
+++ b/Makefile.am
|
||||
@@ -28,7 +28,8 @@ if WITH_MCE
|
||||
rasdaemon_SOURCES += ras-mce-handler.c mce-intel.c mce-amd-k8.c \
|
||||
mce-intel-p4-p6.c mce-intel-nehalem.c \
|
||||
mce-intel-dunnington.c mce-intel-tulsa.c \
|
||||
- mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c
|
||||
+ mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c \
|
||||
+ mce-intel-knl.c
|
||||
endif
|
||||
if WITH_EXTLOG
|
||||
rasdaemon_SOURCES += ras-extlog-handler.c
|
||||
diff --git a/mce-intel-knl.c b/mce-intel-knl.c
|
||||
new file mode 100644
|
||||
index 0000000..96b0a59
|
||||
--- /dev/null
|
||||
+++ b/mce-intel-knl.c
|
||||
@@ -0,0 +1,128 @@
|
||||
+/*
|
||||
+ * This program is free software; you can redistribute it and/or modify
|
||||
+ * it under the terms of the GNU General Public License as published by
|
||||
+ * the Free Software Foundation; either version 2 of the License, or
|
||||
+ * (at your option) any later version.
|
||||
+ *
|
||||
+ * This program is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
+ * GNU General Public License for more details.
|
||||
+ *
|
||||
+ * You should have received a copy of the GNU General Public License
|
||||
+ * along with this program; if not, write to the Free Software
|
||||
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
+*/
|
||||
+
|
||||
+#include <string.h>
|
||||
+#include <stdio.h>
|
||||
+
|
||||
+#include "ras-mce-handler.h"
|
||||
+#include "bitfield.h"
|
||||
+
|
||||
+static struct field memctrl_mc7[] = {
|
||||
+ SBITFIELD(16, "CA Parity error"),
|
||||
+ SBITFIELD(17, "Internal Parity error except WDB"),
|
||||
+ SBITFIELD(18, "Internal Parity error from WDB"),
|
||||
+ SBITFIELD(19, "Correctable Patrol Scrub"),
|
||||
+ SBITFIELD(20, "Uncorrectable Patrol Scrub"),
|
||||
+ SBITFIELD(21, "Spare Correctable Error"),
|
||||
+ SBITFIELD(22, "Spare UC Error"),
|
||||
+ SBITFIELD(23, "CORR Chip fail even MC only, 4 bit burst error EDC only"),
|
||||
+ {}
|
||||
+};
|
||||
+
|
||||
+void knl_decode_model(struct ras_events *ras, struct mce_event *e)
|
||||
+{
|
||||
+ uint64_t status = e->status;
|
||||
+ uint32_t mca = status & 0xffff;
|
||||
+ unsigned rank0 = -1, rank1 = -1, chan = 0;
|
||||
+
|
||||
+ switch (e->bank) {
|
||||
+ case 5:
|
||||
+ switch (EXTRACT(status, 0, 15)) {
|
||||
+ case 0x402:
|
||||
+ mce_snprintf(e->mcastatus_msg, "PCU Internal Errors");
|
||||
+ break;
|
||||
+ case 0x403:
|
||||
+ mce_snprintf(e->mcastatus_msg, "VCU Internal Errors");
|
||||
+ break;
|
||||
+ case 0x407:
|
||||
+ mce_snprintf(e->mcastatus_msg, "Other UBOX Internal Errors");
|
||||
+ break;
|
||||
+ }
|
||||
+ break;
|
||||
+ case 7: case 8: case 9: case 10:
|
||||
+ case 11: case 12: case 13: case 14:
|
||||
+ case 15: case 16:
|
||||
+ if ((EXTRACT(status, 0, 15)) == 0x5) {
|
||||
+ mce_snprintf(e->mcastatus_msg, "Internal Parity error");
|
||||
+ } else {
|
||||
+ chan = (EXTRACT(status, 0, 3)) + 3 * (e->bank == 15);
|
||||
+ switch (EXTRACT(status, 4, 7)) {
|
||||
+ case 0x0:
|
||||
+ mce_snprintf(e->mcastatus_msg, "Undefined request on channel %d", chan);
|
||||
+ break;
|
||||
+ case 0x1:
|
||||
+ mce_snprintf(e->mcastatus_msg, "Read on channel %d", chan);
|
||||
+ break;
|
||||
+ case 0x2:
|
||||
+ mce_snprintf(e->mcastatus_msg, "Write on channel %d", chan);
|
||||
+ break;
|
||||
+ case 0x3:
|
||||
+ mce_snprintf(e->mcastatus_msg, "CA error on channel %d", chan);
|
||||
+ break;
|
||||
+ case 0x4:
|
||||
+ mce_snprintf(e->mcastatus_msg, "Scrub error on channel %d", chan);
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+ decode_bitfield(e, status, memctrl_mc7);
|
||||
+ break;
|
||||
+ default:
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ /*
|
||||
+ * Memory error specific code. Returns if the error is not a MC one
|
||||
+ */
|
||||
+
|
||||
+ /* Check if the error is at the memory controller */
|
||||
+ if ((mca >> 7) != 1)
|
||||
+ return;
|
||||
+
|
||||
+ /* Ignore unless this is an corrected extended error from an iMC bank */
|
||||
+ if (e->bank < 7 || e->bank > 16 || (status & MCI_STATUS_UC) ||
|
||||
+ !test_prefix(7, status & 0xefff))
|
||||
+ return;
|
||||
+
|
||||
+ /*
|
||||
+ * Parse the reported channel and ranks
|
||||
+ */
|
||||
+
|
||||
+ chan = EXTRACT(status, 0, 3);
|
||||
+ if (chan == 0xf)
|
||||
+ {
|
||||
+ mce_snprintf(e->mc_location, "memory_channel=unspecified");
|
||||
+ }
|
||||
+ else
|
||||
+ {
|
||||
+ chan = chan + 3 * (e->bank == 15);
|
||||
+ mce_snprintf(e->mc_location, "memory_channel=%d", chan);
|
||||
+
|
||||
+ if (EXTRACT(e->misc, 62, 62))
|
||||
+ rank0 = EXTRACT(e->misc, 46, 50);
|
||||
+ if (EXTRACT(e->misc, 63, 63))
|
||||
+ rank1 = EXTRACT(e->misc, 51, 55);
|
||||
+
|
||||
+ /*
|
||||
+ * FIXME: The conversion from rank to dimm requires to parse the
|
||||
+ * DMI tables and call failrank2dimm().
|
||||
+ */
|
||||
+ if (rank0 != -1 && rank1 != -1)
|
||||
+ mce_snprintf(e->mc_location, "ranks=%d and %d",
|
||||
+ rank0, rank1);
|
||||
+ else if (rank0 != -1)
|
||||
+ mce_snprintf(e->mc_location, "rank=%d", rank0);
|
||||
+ }
|
||||
+}
|
||||
diff --git a/mce-intel.c b/mce-intel.c
|
||||
index 77b929b..032f4e0 100644
|
||||
--- a/mce-intel.c
|
||||
+++ b/mce-intel.c
|
||||
@@ -397,6 +397,10 @@ int parse_intel_event(struct ras_events *ras, struct mce_event *e)
|
||||
break;
|
||||
case CPU_HASWELL_EPEX:
|
||||
hsw_decode_model(ras, e);
|
||||
+ break;
|
||||
+ case CPU_KNIGHTS_LANDING:
|
||||
+ knl_decode_model(ras, e);
|
||||
+ break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -460,6 +464,7 @@ int set_intel_imc_log(enum cputype cputype, unsigned ncpus)
|
||||
case CPU_SANDY_BRIDGE_EP:
|
||||
case CPU_IVY_BRIDGE_EPEX:
|
||||
case CPU_HASWELL_EPEX:
|
||||
+ case CPU_KNIGHTS_LANDING:
|
||||
msr = 0x17f; /* MSR_ERROR_CONTROL */
|
||||
bit = 0x2; /* MemError Log Enable */
|
||||
break;
|
||||
diff --git a/ras-mce-handler.c b/ras-mce-handler.c
|
||||
index 23f2488..3b0b05b 100644
|
||||
--- a/ras-mce-handler.c
|
||||
+++ b/ras-mce-handler.c
|
||||
@@ -223,6 +223,7 @@ int register_mce_handler(struct ras_events *ras, unsigned ncpus)
|
||||
case CPU_SANDY_BRIDGE_EP:
|
||||
case CPU_IVY_BRIDGE_EPEX:
|
||||
case CPU_HASWELL_EPEX:
|
||||
+ case CPU_KNIGHTS_LANDING:
|
||||
set_intel_imc_log(mce->cputype, ncpus);
|
||||
default:
|
||||
break;
|
||||
diff --git a/ras-mce-handler.h b/ras-mce-handler.h
|
||||
index 13b8f52..5466743 100644
|
||||
--- a/ras-mce-handler.h
|
||||
+++ b/ras-mce-handler.h
|
||||
@@ -119,6 +119,7 @@ void dunnington_decode_model(struct mce_event *e);
|
||||
void snb_decode_model(struct ras_events *ras, struct mce_event *e);
|
||||
void ivb_decode_model(struct ras_events *ras, struct mce_event *e);
|
||||
void hsw_decode_model(struct ras_events *ras, struct mce_event *e);
|
||||
+void knl_decode_model(struct ras_events *ras, struct mce_event *e);
|
||||
void tulsa_decode_model(struct mce_event *e);
|
||||
|
||||
/* Software defined banks */
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,106 @@
|
|||
From 17f4e17d9870fbd35572ae6bf6c227c787b07fe9 Mon Sep 17 00:00:00 2001
|
||||
From: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
|
||||
Date: Fri, 5 Feb 2016 15:15:18 -0200
|
||||
Subject: [PATCH 4/5] mce-intel-knl: Fix CodingStyle
|
||||
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
|
||||
---
|
||||
mce-intel-knl.c | 43 +++++++++++++++++++++++++++----------------
|
||||
1 file changed, 27 insertions(+), 16 deletions(-)
|
||||
|
||||
diff --git a/mce-intel-knl.c b/mce-intel-knl.c
|
||||
index 96b0a59..7062fbb 100644
|
||||
--- a/mce-intel-knl.c
|
||||
+++ b/mce-intel-knl.c
|
||||
@@ -48,32 +48,46 @@ void knl_decode_model(struct ras_events *ras, struct mce_event *e)
|
||||
mce_snprintf(e->mcastatus_msg, "VCU Internal Errors");
|
||||
break;
|
||||
case 0x407:
|
||||
- mce_snprintf(e->mcastatus_msg, "Other UBOX Internal Errors");
|
||||
+ mce_snprintf(e->mcastatus_msg,
|
||||
+ "Other UBOX Internal Errors");
|
||||
break;
|
||||
}
|
||||
break;
|
||||
- case 7: case 8: case 9: case 10:
|
||||
- case 11: case 12: case 13: case 14:
|
||||
- case 15: case 16:
|
||||
+ case 7:
|
||||
+ case 8:
|
||||
+ case 9:
|
||||
+ case 10:
|
||||
+ case 11:
|
||||
+ case 12:
|
||||
+ case 13:
|
||||
+ case 14:
|
||||
+ case 15:
|
||||
+ case 16:
|
||||
if ((EXTRACT(status, 0, 15)) == 0x5) {
|
||||
mce_snprintf(e->mcastatus_msg, "Internal Parity error");
|
||||
} else {
|
||||
chan = (EXTRACT(status, 0, 3)) + 3 * (e->bank == 15);
|
||||
switch (EXTRACT(status, 4, 7)) {
|
||||
case 0x0:
|
||||
- mce_snprintf(e->mcastatus_msg, "Undefined request on channel %d", chan);
|
||||
+ mce_snprintf(e->mcastatus_msg,
|
||||
+ "Undefined request on channel %d",
|
||||
+ chan);
|
||||
break;
|
||||
case 0x1:
|
||||
- mce_snprintf(e->mcastatus_msg, "Read on channel %d", chan);
|
||||
+ mce_snprintf(e->mcastatus_msg,
|
||||
+ "Read on channel %d", chan);
|
||||
break;
|
||||
case 0x2:
|
||||
- mce_snprintf(e->mcastatus_msg, "Write on channel %d", chan);
|
||||
+ mce_snprintf(e->mcastatus_msg,
|
||||
+ "Write on channel %d", chan);
|
||||
break;
|
||||
case 0x3:
|
||||
- mce_snprintf(e->mcastatus_msg, "CA error on channel %d", chan);
|
||||
+ mce_snprintf(e->mcastatus_msg,
|
||||
+ "CA error on channel %d", chan);
|
||||
break;
|
||||
case 0x4:
|
||||
- mce_snprintf(e->mcastatus_msg, "Scrub error on channel %d", chan);
|
||||
+ mce_snprintf(e->mcastatus_msg,
|
||||
+ "Scrub error on channel %d", chan);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -93,7 +107,7 @@ void knl_decode_model(struct ras_events *ras, struct mce_event *e)
|
||||
|
||||
/* Ignore unless this is an corrected extended error from an iMC bank */
|
||||
if (e->bank < 7 || e->bank > 16 || (status & MCI_STATUS_UC) ||
|
||||
- !test_prefix(7, status & 0xefff))
|
||||
+ !test_prefix(7, status & 0xefff))
|
||||
return;
|
||||
|
||||
/*
|
||||
@@ -101,12 +115,9 @@ void knl_decode_model(struct ras_events *ras, struct mce_event *e)
|
||||
*/
|
||||
|
||||
chan = EXTRACT(status, 0, 3);
|
||||
- if (chan == 0xf)
|
||||
- {
|
||||
+ if (chan == 0xf) {
|
||||
mce_snprintf(e->mc_location, "memory_channel=unspecified");
|
||||
- }
|
||||
- else
|
||||
- {
|
||||
+ } else {
|
||||
chan = chan + 3 * (e->bank == 15);
|
||||
mce_snprintf(e->mc_location, "memory_channel=%d", chan);
|
||||
|
||||
@@ -121,7 +132,7 @@ void knl_decode_model(struct ras_events *ras, struct mce_event *e)
|
||||
*/
|
||||
if (rank0 != -1 && rank1 != -1)
|
||||
mce_snprintf(e->mc_location, "ranks=%d and %d",
|
||||
- rank0, rank1);
|
||||
+ rank0, rank1);
|
||||
else if (rank0 != -1)
|
||||
mce_snprintf(e->mc_location, "rank=%d", rank0);
|
||||
}
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,244 @@
|
|||
From e7b88730f8a753a50fa0b8d1f7027f79baa05ca4 Mon Sep 17 00:00:00 2001
|
||||
From: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Date: Fri, 8 Apr 2016 15:07:18 -0400
|
||||
Subject: [PATCH 1/2] Add Broadwell DE MSCOD values
|
||||
|
||||
Based on mcelog commit id 32252e9c37e97ea5083d90d2cf194bb85a4a0cda.
|
||||
|
||||
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
|
||||
---
|
||||
Makefile.am | 2 +-
|
||||
mce-intel-broadwell-de.c | 146 +++++++++++++++++++++++++++++++++++++++++++++++
|
||||
mce-intel.c | 3 +
|
||||
ras-mce-handler.c | 6 +-
|
||||
ras-mce-handler.h | 2 +
|
||||
5 files changed, 156 insertions(+), 3 deletions(-)
|
||||
create mode 100644 mce-intel-broadwell-de.c
|
||||
|
||||
diff --git a/Makefile.am b/Makefile.am
|
||||
index a1cb02a..a8477d3 100644
|
||||
--- a/Makefile.am
|
||||
+++ b/Makefile.am
|
||||
@@ -29,7 +29,7 @@ if WITH_MCE
|
||||
mce-intel-p4-p6.c mce-intel-nehalem.c \
|
||||
mce-intel-dunnington.c mce-intel-tulsa.c \
|
||||
mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c \
|
||||
- mce-intel-knl.c
|
||||
+ mce-intel-knl.c mce-intel-broadwell-de.c
|
||||
endif
|
||||
if WITH_EXTLOG
|
||||
rasdaemon_SOURCES += ras-extlog-handler.c
|
||||
diff --git a/mce-intel-broadwell-de.c b/mce-intel-broadwell-de.c
|
||||
new file mode 100644
|
||||
index 0000000..d52c82e
|
||||
--- /dev/null
|
||||
+++ b/mce-intel-broadwell-de.c
|
||||
@@ -0,0 +1,146 @@
|
||||
+/*
|
||||
+ * The code below came from Tony Luck's mcelog code,
|
||||
+ * released under GNU Public General License, v.2
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or modify
|
||||
+ * it under the terms of the GNU General Public License as published by
|
||||
+ * the Free Software Foundation; either version 2 of the License, or
|
||||
+ * (at your option) any later version.
|
||||
+ *
|
||||
+ * This program is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
+ * GNU General Public License for more details.
|
||||
+ *
|
||||
+ * You should have received a copy of the GNU General Public License
|
||||
+ * along with this program; if not, write to the Free Software
|
||||
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
+*/
|
||||
+
|
||||
+#include <string.h>
|
||||
+#include <stdio.h>
|
||||
+
|
||||
+#include "ras-mce-handler.h"
|
||||
+#include "bitfield.h"
|
||||
+
|
||||
+/* See IA32 SDM Vol3B Table 16-24 */
|
||||
+
|
||||
+static char *pcu_1[] = {
|
||||
+ [0x00] = "No Error",
|
||||
+ [0x09] = "MC_MESSAGE_CHANNEL_TIMEOUT",
|
||||
+ [0x13] = "MC_DMI_TRAINING_TIMEOUT",
|
||||
+ [0x15] = "MC_DMI_CPU_RESET_ACK_TIMEOUT",
|
||||
+ [0x1E] = "MC_VR_ICC_MAX_LT_FUSED_ICC_MAX",
|
||||
+ [0x25] = "MC_SVID_COMMAN_TIMEOUT",
|
||||
+ [0x26] = "MCA_PKGC_DIRECT_WAKE_RING_TIMEOUT",
|
||||
+ [0x29] = "MC_VR_VOUT_MAC_LT_FUSED_SVID",
|
||||
+ [0x2B] = "MC_PKGC_WATCHDOG_HANG_CBZ_DOWN",
|
||||
+ [0x2C] = "MC_PKGC_WATCHDOG_HANG_CBZ_UP",
|
||||
+ [0x44] = "MC_CRITICAL_VR_FAILED",
|
||||
+ [0x46] = "MC_VID_RAMP_DOWN_FAILED",
|
||||
+ [0x49] = "MC_SVID_WRITE_REG_VOUT_MAX_FAILED",
|
||||
+ [0x4B] = "MC_BOOT_VID_TIMEOUT_DRAM_0",
|
||||
+ [0x4F] = "MC_SVID_COMMAND_ERROR",
|
||||
+ [0x52] = "MC_FIVR_CATAS_OVERVOL_FAULT",
|
||||
+ [0x53] = "MC_FIVR_CATAS_OVERCUR_FAULT",
|
||||
+ [0x57] = "MC_SVID_PKGC_REQUEST_FAILED",
|
||||
+ [0x58] = "MC_SVID_IMON_REQUEST_FAILED",
|
||||
+ [0x59] = "MC_SVID_ALERT_REQUEST_FAILED",
|
||||
+ [0x62] = "MC_INVALID_PKGS_RSP_QPI",
|
||||
+ [0x64] = "MC_INVALID_PKG_STATE_CONFIG",
|
||||
+ [0x67] = "MC_HA_IMC_RW_BLOCK_ACK_TIMEOUT",
|
||||
+ [0x6A] = "MC_MSGCH_PMREQ_CMP_TIMEOUT",
|
||||
+ [0x72] = "MC_WATCHDOG_TIMEOUT_PKGS_MASTER",
|
||||
+ [0x81] = "MC_RECOVERABLE_DIE_THERMAL_TOO_HOT"
|
||||
+};
|
||||
+
|
||||
+static struct field pcu_mc4[] = {
|
||||
+ FIELD(24, pcu_1),
|
||||
+ {}
|
||||
+};
|
||||
+
|
||||
+/* See IA32 SDM Vol3B Table 16-18 */
|
||||
+
|
||||
+static struct field memctrl_mc9[] = {
|
||||
+ SBITFIELD(16, "Address parity error"),
|
||||
+ SBITFIELD(17, "HA Wrt buffer Data parity error"),
|
||||
+ SBITFIELD(18, "HA Wrt byte enable parity error"),
|
||||
+ SBITFIELD(19, "Corrected patrol scrub error"),
|
||||
+ SBITFIELD(20, "Uncorrected patrol scrub error"),
|
||||
+ SBITFIELD(21, "Corrected spare error"),
|
||||
+ SBITFIELD(22, "Uncorrected spare error"),
|
||||
+ SBITFIELD(23, "Corrected memory read error"),
|
||||
+ SBITFIELD(24, "iMC, WDB, parity errors"),
|
||||
+ {}
|
||||
+};
|
||||
+
|
||||
+void broadwell_de_decode_model(struct ras_events *ras, struct mce_event *e)
|
||||
+{
|
||||
+ uint64_t status = e->status;
|
||||
+ uint32_t mca = status & 0xffff;
|
||||
+ unsigned rank0 = -1, rank1 = -1, chan;
|
||||
+
|
||||
+ switch (e->bank) {
|
||||
+ case 4:
|
||||
+ switch (EXTRACT(status, 0, 15) & ~(1ull << 12)) {
|
||||
+ case 0x402: case 0x403:
|
||||
+ mce_snprintf(e->mcastatus_msg, "Internal errors ");
|
||||
+ break;
|
||||
+ case 0x406:
|
||||
+ mce_snprintf(e->mcastatus_msg, "Intel TXT errors ");
|
||||
+ break;
|
||||
+ case 0x407:
|
||||
+ mce_snprintf(e->mcastatus_msg, "Other UBOX Internal errors ");
|
||||
+ break;
|
||||
+ }
|
||||
+ if (EXTRACT(status, 16, 19) & 3)
|
||||
+ mce_snprintf(e->mcastatus_msg, "PCU internal error ");
|
||||
+ if (EXTRACT(status, 20, 23) & 4)
|
||||
+ mce_snprintf(e->mcastatus_msg, "Ubox error ");
|
||||
+ decode_bitfield(e, status, pcu_mc4);
|
||||
+ break;
|
||||
+ case 9: case 10:
|
||||
+ mce_snprintf(e->mcastatus_msg, "MemCtrl: ");
|
||||
+ decode_bitfield(e, status, memctrl_mc9);
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ /*
|
||||
+ * Memory error specific code. Returns if the error is not a MC one
|
||||
+ */
|
||||
+
|
||||
+ /* Check if the error is at the memory controller */
|
||||
+ if ((mca >> 7) != 1)
|
||||
+ return;
|
||||
+
|
||||
+ /* Ignore unless this is an corrected extended error from an iMC bank */
|
||||
+ if (e->bank < 9 || e->bank > 16 || (status & MCI_STATUS_UC) ||
|
||||
+ !test_prefix(7, status & 0xefff))
|
||||
+ return;
|
||||
+
|
||||
+ /*
|
||||
+ * Parse the reported channel and ranks
|
||||
+ */
|
||||
+
|
||||
+ chan = EXTRACT(status, 0, 3);
|
||||
+ if (chan == 0xf)
|
||||
+ return;
|
||||
+
|
||||
+ mce_snprintf(e->mc_location, "memory_channel=%d", chan);
|
||||
+
|
||||
+ if (EXTRACT(e->misc, 62, 62)) {
|
||||
+ rank0 = EXTRACT(e->misc, 46, 50);
|
||||
+ if (EXTRACT(e->misc, 63, 63))
|
||||
+ rank1 = EXTRACT(e->misc, 51, 55);
|
||||
+ }
|
||||
+
|
||||
+ /*
|
||||
+ * FIXME: The conversion from rank to dimm requires to parse the
|
||||
+ * DMI tables and call failrank2dimm().
|
||||
+ */
|
||||
+ if (rank0 != -1 && rank1 != -1)
|
||||
+ mce_snprintf(e->mc_location, "ranks=%d and %d",
|
||||
+ rank0, rank1);
|
||||
+ else if (rank0 != -1)
|
||||
+ mce_snprintf(e->mc_location, "rank=%d", rank0);
|
||||
+}
|
||||
diff --git a/mce-intel.c b/mce-intel.c
|
||||
index 032f4e0..b132903 100644
|
||||
--- a/mce-intel.c
|
||||
+++ b/mce-intel.c
|
||||
@@ -401,6 +401,9 @@ int parse_intel_event(struct ras_events *ras, struct mce_event *e)
|
||||
case CPU_KNIGHTS_LANDING:
|
||||
knl_decode_model(ras, e);
|
||||
break;
|
||||
+ case CPU_BROADWELL_DE:
|
||||
+ broadwell_de_decode_model(ras, e);
|
||||
+ break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
diff --git a/ras-mce-handler.c b/ras-mce-handler.c
|
||||
index 3b0b05b..b58d6e0 100644
|
||||
--- a/ras-mce-handler.c
|
||||
+++ b/ras-mce-handler.c
|
||||
@@ -50,6 +50,7 @@ static char *cputype_name[] = {
|
||||
[CPU_HASWELL] = "Haswell",
|
||||
[CPU_HASWELL_EPEX] = "Intel Xeon v3 (Haswell) EP/EX",
|
||||
[CPU_BROADWELL] = "Broadwell",
|
||||
+ [CPU_BROADWELL_DE] = "Broadwell DE",
|
||||
[CPU_KNIGHTS_LANDING] = "Knights Landing",
|
||||
};
|
||||
|
||||
@@ -90,8 +91,9 @@ static enum cputype select_intel_cputype(struct ras_events *ras)
|
||||
return CPU_HASWELL;
|
||||
else if (mce->model == 0x3f)
|
||||
return CPU_HASWELL_EPEX;
|
||||
- else if (mce->model == 0x3d || mce->model == 0x4f ||
|
||||
- mce->model == 0x56)
|
||||
+ else if (mce->model == 0x56)
|
||||
+ return CPU_BROADWELL_DE;
|
||||
+ else if (mce->model == 0x3d || mce->model == 0x4f)
|
||||
return CPU_BROADWELL;
|
||||
else if (mce->model == 0x57)
|
||||
return CPU_KNIGHTS_LANDING;
|
||||
diff --git a/ras-mce-handler.h b/ras-mce-handler.h
|
||||
index 5466743..2648048 100644
|
||||
--- a/ras-mce-handler.h
|
||||
+++ b/ras-mce-handler.h
|
||||
@@ -45,6 +45,7 @@ enum cputype {
|
||||
CPU_HASWELL,
|
||||
CPU_HASWELL_EPEX,
|
||||
CPU_BROADWELL,
|
||||
+ CPU_BROADWELL_DE,
|
||||
CPU_KNIGHTS_LANDING,
|
||||
};
|
||||
|
||||
@@ -121,6 +122,7 @@ void ivb_decode_model(struct ras_events *ras, struct mce_event *e);
|
||||
void hsw_decode_model(struct ras_events *ras, struct mce_event *e);
|
||||
void knl_decode_model(struct ras_events *ras, struct mce_event *e);
|
||||
void tulsa_decode_model(struct mce_event *e);
|
||||
+void broadwell_de_decode_model(struct ras_events *ras, struct mce_event *e);
|
||||
|
||||
/* Software defined banks */
|
||||
#define MCE_EXTENDED_BANK 128
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,289 @@
|
|||
From 0dd44fca9d756990acf01cd2cdaa585f369168bc Mon Sep 17 00:00:00 2001
|
||||
From: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Date: Fri, 8 Apr 2016 15:07:19 -0400
|
||||
Subject: [PATCH 2/2] Add Broadwell EP/EX MSCOD values
|
||||
|
||||
Based on mcelog commit id 32252e9c37e97ea5083d90d2cf194bb85a4a0cda.
|
||||
|
||||
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
|
||||
---
|
||||
Makefile.am | 3 +-
|
||||
mce-intel-broadwell-epex.c | 191 +++++++++++++++++++++++++++++++++++++++++++++
|
||||
mce-intel.c | 3 +
|
||||
ras-mce-handler.c | 5 +-
|
||||
ras-mce-handler.h | 2 +
|
||||
5 files changed, 202 insertions(+), 2 deletions(-)
|
||||
create mode 100644 mce-intel-broadwell-epex.c
|
||||
|
||||
diff --git a/Makefile.am b/Makefile.am
|
||||
index a8477d3..c9e4481 100644
|
||||
--- a/Makefile.am
|
||||
+++ b/Makefile.am
|
||||
@@ -29,7 +29,8 @@ if WITH_MCE
|
||||
mce-intel-p4-p6.c mce-intel-nehalem.c \
|
||||
mce-intel-dunnington.c mce-intel-tulsa.c \
|
||||
mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c \
|
||||
- mce-intel-knl.c mce-intel-broadwell-de.c
|
||||
+ mce-intel-knl.c mce-intel-broadwell-de.c \
|
||||
+ mce-intel-broadwell-epex.c
|
||||
endif
|
||||
if WITH_EXTLOG
|
||||
rasdaemon_SOURCES += ras-extlog-handler.c
|
||||
diff --git a/mce-intel-broadwell-epex.c b/mce-intel-broadwell-epex.c
|
||||
new file mode 100644
|
||||
index 0000000..f7cd3b6
|
||||
--- /dev/null
|
||||
+++ b/mce-intel-broadwell-epex.c
|
||||
@@ -0,0 +1,191 @@
|
||||
+/*
|
||||
+ * The code below came from Tony Luck's mcelog code,
|
||||
+ * released under GNU Public General License, v.2
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or modify
|
||||
+ * it under the terms of the GNU General Public License as published by
|
||||
+ * the Free Software Foundation; either version 2 of the License, or
|
||||
+ * (at your option) any later version.
|
||||
+ *
|
||||
+ * This program is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
+ * GNU General Public License for more details.
|
||||
+ *
|
||||
+ * You should have received a copy of the GNU General Public License
|
||||
+ * along with this program; if not, write to the Free Software
|
||||
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
+*/
|
||||
+
|
||||
+#include <string.h>
|
||||
+#include <stdio.h>
|
||||
+
|
||||
+#include "ras-mce-handler.h"
|
||||
+#include "bitfield.h"
|
||||
+
|
||||
+/* See IA32 SDM Vol3B Table 16-20 */
|
||||
+
|
||||
+static char *pcu_1[] = {
|
||||
+ [0x00] = "No Error",
|
||||
+ [0x09] = "MC_MESSAGE_CHANNEL_TIMEOUT",
|
||||
+ [0x0D] = "MC_IMC_FORCE_SR_S3_TIMEOUT",
|
||||
+ [0x0E] = "MC_CPD_UNCPD_SD_TIMEOUT",
|
||||
+ [0x13] = "MC_DMI_TRAINING_TIMEOUT",
|
||||
+ [0x15] = "MC_DMI_CPU_RESET_ACK_TIMEOUT",
|
||||
+ [0x1E] = "MC_VR_ICC_MAX_LT_FUSED_ICC_MAX",
|
||||
+ [0x25] = "MC_SVID_COMMAN_TIMEOUT",
|
||||
+ [0x29] = "MC_VR_VOUT_MAC_LT_FUSED_SVID",
|
||||
+ [0x2B] = "MC_PKGC_WATCHDOG_HANG_CBZ_DOWN",
|
||||
+ [0x2C] = "MC_PKGC_WATCHDOG_HANG_CBZ_UP",
|
||||
+ [0x39] = "MC_PKGC_WATCHDOG_HANG_C3_UP_SF",
|
||||
+ [0x44] = "MC_CRITICAL_VR_FAILED",
|
||||
+ [0x45] = "MC_ICC_MAX_NOTSUPPORTED",
|
||||
+ [0x46] = "MC_VID_RAMP_DOWN_FAILED",
|
||||
+ [0x47] = "MC_EXCL_MODE_NO_PMREQ_CMP",
|
||||
+ [0x48] = "MC_SVID_READ_REG_ICC_MAX_FAILED",
|
||||
+ [0x49] = "MC_SVID_WRITE_REG_VOUT_MAX_FAILED",
|
||||
+ [0x4B] = "MC_BOOT_VID_TIMEOUT_DRAM_0",
|
||||
+ [0x4C] = "MC_BOOT_VID_TIMEOUT_DRAM_1",
|
||||
+ [0x4D] = "MC_BOOT_VID_TIMEOUT_DRAM_2",
|
||||
+ [0x4E] = "MC_BOOT_VID_TIMEOUT_DRAM_3",
|
||||
+ [0x4F] = "MC_SVID_COMMAND_ERROR",
|
||||
+ [0x52] = "MC_FIVR_CATAS_OVERVOL_FAULT",
|
||||
+ [0x53] = "MC_FIVR_CATAS_OVERCUR_FAULT",
|
||||
+ [0x57] = "MC_SVID_PKGC_REQUEST_FAILED",
|
||||
+ [0x58] = "MC_SVID_IMON_REQUEST_FAILED",
|
||||
+ [0x59] = "MC_SVID_ALERT_REQUEST_FAILED",
|
||||
+ [0x60] = "MC_INVALID_PKGS_REQ_PCH",
|
||||
+ [0x61] = "MC_INVALID_PKGS_REQ_QPI",
|
||||
+ [0x62] = "MC_INVALID_PKGS_RSP_QPI",
|
||||
+ [0x63] = "MC_INVALID_PKGS_RSP_PCH",
|
||||
+ [0x64] = "MC_INVALID_PKG_STATE_CONFIG",
|
||||
+ [0x67] = "MC_HA_IMC_RW_BLOCK_ACK_TIMEOUT",
|
||||
+ [0x68] = "MC_IMC_RW_SMBUS_TIMEOUT",
|
||||
+ [0x69] = "MC_HA_FAILSTS_CHANGE_DETECTED",
|
||||
+ [0x6A] = "MC_MSGCH_PMREQ_CMP_TIMEOUT",
|
||||
+ [0x70] = "MC_WATCHDOG_TIMEOUT_PKGC_SLAVE",
|
||||
+ [0x71] = "MC_WATCHDOG_TIMEOUT_PKGC_MASTER",
|
||||
+ [0x72] = "MC_WATCHDOG_TIMEOUT_PKGS_MASTER",
|
||||
+ [0x7C] = "MC_BIOS_RST_CPL_INVALID_SEQ",
|
||||
+ [0x7D] = "MC_MORE_THAN_ONE_TXT_AGENT",
|
||||
+ [0x81] = "MC_RECOVERABLE_DIE_THERMAL_TOO_HOT"
|
||||
+};
|
||||
+
|
||||
+static struct field pcu_mc4[] = {
|
||||
+ FIELD(24, pcu_1),
|
||||
+ {}
|
||||
+};
|
||||
+
|
||||
+/* See IA32 SDM Vol3B Table 16-21 */
|
||||
+
|
||||
+static char *qpi[] = {
|
||||
+ [0x02] = "Intel QPI physical layer detected drift buffer alarm",
|
||||
+ [0x03] = "Intel QPI physical layer detected latency buffer rollover",
|
||||
+ [0x10] = "Intel QPI link layer detected control error from R3QPI",
|
||||
+ [0x11] = "Rx entered LLR abort state on CRC error",
|
||||
+ [0x12] = "Unsupported or undefined packet",
|
||||
+ [0x13] = "Intel QPI link layer control error",
|
||||
+ [0x15] = "RBT used un-initialized value",
|
||||
+ [0x20] = "Intel QPI physical layer detected a QPI in-band reset but aborted initialization",
|
||||
+ [0x21] = "Link failover data self healing",
|
||||
+ [0x22] = "Phy detected in-band reset (no width change)",
|
||||
+ [0x23] = "Link failover clock failover",
|
||||
+ [0x30] = "Rx detected CRC error - successful LLR after Phy re-init",
|
||||
+ [0x31] = "Rx detected CRC error - successful LLR wihout Phy re-init",
|
||||
+};
|
||||
+
|
||||
+static struct field qpi_mc[] = {
|
||||
+ FIELD(16, qpi),
|
||||
+ {}
|
||||
+};
|
||||
+
|
||||
+/* See IA32 SDM Vol3B Table 16-26 */
|
||||
+
|
||||
+static struct field memctrl_mc9[] = {
|
||||
+ SBITFIELD(16, "DDR3 address parity error"),
|
||||
+ SBITFIELD(17, "Uncorrected HA write data error"),
|
||||
+ SBITFIELD(18, "Uncorrected HA data byte enable error"),
|
||||
+ SBITFIELD(19, "Corrected patrol scrub error"),
|
||||
+ SBITFIELD(20, "Uncorrected patrol scrub error"),
|
||||
+ SBITFIELD(21, "Corrected spare error"),
|
||||
+ SBITFIELD(22, "Uncorrected spare error"),
|
||||
+ SBITFIELD(24, "iMC write data buffer parity error"),
|
||||
+ SBITFIELD(25, "DDR4 command address parity error"),
|
||||
+ {}
|
||||
+};
|
||||
+
|
||||
+void broadwell_epex_decode_model(struct ras_events *ras, struct mce_event *e)
|
||||
+{
|
||||
+ uint64_t status = e->status;
|
||||
+ uint32_t mca = status & 0xffff;
|
||||
+ unsigned rank0 = -1, rank1 = -1, chan;
|
||||
+
|
||||
+ switch (e->bank) {
|
||||
+ case 4:
|
||||
+ switch (EXTRACT(status, 0, 15) & ~(1ull << 12)) {
|
||||
+ case 0x402: case 0x403:
|
||||
+ mce_snprintf(e->mcastatus_msg, "Internal errors ");
|
||||
+ break;
|
||||
+ case 0x406:
|
||||
+ mce_snprintf(e->mcastatus_msg, "Intel TXT errors ");
|
||||
+ break;
|
||||
+ case 0x407:
|
||||
+ mce_snprintf(e->mcastatus_msg, "Other UBOX Internal errors ");
|
||||
+ break;
|
||||
+ }
|
||||
+ if (EXTRACT(status, 16, 19))
|
||||
+ mce_snprintf(e->mcastatus_msg, "PCU internal error ");
|
||||
+ decode_bitfield(e, status, pcu_mc4);
|
||||
+ break;
|
||||
+ case 5:
|
||||
+ case 20:
|
||||
+ case 21:
|
||||
+ mce_snprintf(e->mcastatus_msg, "QPI: ");
|
||||
+ decode_bitfield(e, status, qpi_mc);
|
||||
+ break;
|
||||
+ case 9: case 10: case 11: case 12:
|
||||
+ case 13: case 14: case 15: case 16:
|
||||
+ mce_snprintf(e->mcastatus_msg, "MemCtrl: ");
|
||||
+ decode_bitfield(e, status, memctrl_mc9);
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ /*
|
||||
+ * Memory error specific code. Returns if the error is not a MC one
|
||||
+ */
|
||||
+
|
||||
+ /* Check if the error is at the memory controller */
|
||||
+ if ((mca >> 7) != 1)
|
||||
+ return;
|
||||
+
|
||||
+ /* Ignore unless this is an corrected extended error from an iMC bank */
|
||||
+ if (e->bank < 9 || e->bank > 16 || (status & MCI_STATUS_UC) ||
|
||||
+ !test_prefix(7, status & 0xefff))
|
||||
+ return;
|
||||
+
|
||||
+ /*
|
||||
+ * Parse the reported channel and ranks
|
||||
+ */
|
||||
+
|
||||
+ chan = EXTRACT(status, 0, 3);
|
||||
+ if (chan == 0xf)
|
||||
+ return;
|
||||
+
|
||||
+ mce_snprintf(e->mc_location, "memory_channel=%d", chan);
|
||||
+
|
||||
+ if (EXTRACT(e->misc, 62, 62)) {
|
||||
+ rank0 = EXTRACT(e->misc, 46, 50);
|
||||
+ if (EXTRACT(e->misc, 63, 63))
|
||||
+ rank1 = EXTRACT(e->misc, 51, 55);
|
||||
+ }
|
||||
+
|
||||
+ /*
|
||||
+ * FIXME: The conversion from rank to dimm requires to parse the
|
||||
+ * DMI tables and call failrank2dimm().
|
||||
+ */
|
||||
+ if (rank0 != -1 && rank1 != -1)
|
||||
+ mce_snprintf(e->mc_location, "ranks=%d and %d",
|
||||
+ rank0, rank1);
|
||||
+ else if (rank0 != -1)
|
||||
+ mce_snprintf(e->mc_location, "rank=%d", rank0);
|
||||
+}
|
||||
diff --git a/mce-intel.c b/mce-intel.c
|
||||
index b132903..bf68d9b 100644
|
||||
--- a/mce-intel.c
|
||||
+++ b/mce-intel.c
|
||||
@@ -404,6 +404,9 @@ int parse_intel_event(struct ras_events *ras, struct mce_event *e)
|
||||
case CPU_BROADWELL_DE:
|
||||
broadwell_de_decode_model(ras, e);
|
||||
break;
|
||||
+ case CPU_BROADWELL_EPEX:
|
||||
+ broadwell_epex_decode_model(ras, e);
|
||||
+ break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
diff --git a/ras-mce-handler.c b/ras-mce-handler.c
|
||||
index b58d6e0..b875512 100644
|
||||
--- a/ras-mce-handler.c
|
||||
+++ b/ras-mce-handler.c
|
||||
@@ -51,6 +51,7 @@ static char *cputype_name[] = {
|
||||
[CPU_HASWELL_EPEX] = "Intel Xeon v3 (Haswell) EP/EX",
|
||||
[CPU_BROADWELL] = "Broadwell",
|
||||
[CPU_BROADWELL_DE] = "Broadwell DE",
|
||||
+ [CPU_BROADWELL_EPEX] = "Broadwell EP/EX",
|
||||
[CPU_KNIGHTS_LANDING] = "Knights Landing",
|
||||
};
|
||||
|
||||
@@ -93,7 +94,9 @@ static enum cputype select_intel_cputype(struct ras_events *ras)
|
||||
return CPU_HASWELL_EPEX;
|
||||
else if (mce->model == 0x56)
|
||||
return CPU_BROADWELL_DE;
|
||||
- else if (mce->model == 0x3d || mce->model == 0x4f)
|
||||
+ else if (mce->model == 0x4f)
|
||||
+ return CPU_BROADWELL_EPEX;
|
||||
+ else if (mce->model == 0x3d)
|
||||
return CPU_BROADWELL;
|
||||
else if (mce->model == 0x57)
|
||||
return CPU_KNIGHTS_LANDING;
|
||||
diff --git a/ras-mce-handler.h b/ras-mce-handler.h
|
||||
index 2648048..c5a3717 100644
|
||||
--- a/ras-mce-handler.h
|
||||
+++ b/ras-mce-handler.h
|
||||
@@ -46,6 +46,7 @@ enum cputype {
|
||||
CPU_HASWELL_EPEX,
|
||||
CPU_BROADWELL,
|
||||
CPU_BROADWELL_DE,
|
||||
+ CPU_BROADWELL_EPEX,
|
||||
CPU_KNIGHTS_LANDING,
|
||||
};
|
||||
|
||||
@@ -123,6 +124,7 @@ void hsw_decode_model(struct ras_events *ras, struct mce_event *e);
|
||||
void knl_decode_model(struct ras_events *ras, struct mce_event *e);
|
||||
void tulsa_decode_model(struct mce_event *e);
|
||||
void broadwell_de_decode_model(struct ras_events *ras, struct mce_event *e);
|
||||
+void broadwell_epex_decode_model(struct ras_events *ras, struct mce_event *e);
|
||||
|
||||
/* Software defined banks */
|
||||
#define MCE_EXTENDED_BANK 128
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,63 @@
|
|||
---
|
||||
mce-intel.c | 3 +++
|
||||
ras-mce-handler.c | 5 +++++
|
||||
ras-mce-handler.h | 1 +
|
||||
3 files changed, 9 insertions(+)
|
||||
|
||||
--- rasdaemon-0.4.1.orig/mce-intel.c 2017-05-30 12:04:54.440167730 -0400
|
||||
+++ rasdaemon-0.4.1/mce-intel.c 2017-05-30 12:06:51.705755469 -0400
|
||||
@@ -399,6 +399,7 @@ if (test_prefix(11, (e->status & 0xffffL
|
||||
hsw_decode_model(ras, e);
|
||||
break;
|
||||
case CPU_KNIGHTS_LANDING:
|
||||
+ case CPU_KNIGHTS_MILL:
|
||||
knl_decode_model(ras, e);
|
||||
break;
|
||||
case CPU_BROADWELL_DE:
|
||||
@@ -470,6 +471,8 @@ int set_intel_imc_log(enum cputype cputy
|
||||
case CPU_SANDY_BRIDGE_EP:
|
||||
case CPU_IVY_BRIDGE_EPEX:
|
||||
case CPU_HASWELL_EPEX:
|
||||
+ case CPU_KNIGHTS_LANDING:
|
||||
+ case CPU_KNIGHTS_MILL:
|
||||
msr = 0x17f; /* MSR_ERROR_CONTROL */
|
||||
bit = 0x2; /* MemError Log Enable */
|
||||
break;
|
||||
--- rasdaemon-0.4.1.orig/ras-mce-handler.c 2017-05-30 12:04:54.440167730 -0400
|
||||
+++ rasdaemon-0.4.1/ras-mce-handler.c 2017-05-30 12:07:59.850934779 -0400
|
||||
@@ -53,6 +53,7 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series
|
||||
[CPU_BROADWELL_DE] = "Broadwell DE",
|
||||
[CPU_BROADWELL_EPEX] = "Broadwell EP/EX",
|
||||
[CPU_KNIGHTS_LANDING] = "Knights Landing",
|
||||
+ [CPU_KNIGHTS_MILL] = "Knights Mill",
|
||||
};
|
||||
|
||||
static enum cputype select_intel_cputype(struct ras_events *ras)
|
||||
@@ -100,6 +101,8 @@ else if (mce->model == 0x3d)
|
||||
return CPU_BROADWELL;
|
||||
else if (mce->model == 0x57)
|
||||
return CPU_KNIGHTS_LANDING;
|
||||
+ else if (mce->model == 0x85)
|
||||
+ return CPU_KNIGHTS_MILL;
|
||||
|
||||
if (mce->model > 0x1a) {
|
||||
log(ALL, LOG_INFO,
|
||||
@@ -228,6 +231,8 @@ int register_mce_handler(struct ras_even
|
||||
case CPU_SANDY_BRIDGE_EP:
|
||||
case CPU_IVY_BRIDGE_EPEX:
|
||||
case CPU_HASWELL_EPEX:
|
||||
+ case CPU_KNIGHTS_LANDING:
|
||||
+ case CPU_KNIGHTS_MILL:
|
||||
set_intel_imc_log(mce->cputype, ncpus);
|
||||
default:
|
||||
break;
|
||||
--- rasdaemon-0.4.1.orig/ras-mce-handler.h 2017-05-30 12:04:54.440167730 -0400
|
||||
+++ rasdaemon-0.4.1/ras-mce-handler.h 2017-05-30 12:04:58.976113103 -0400
|
||||
@@ -48,6 +48,7 @@ enum cputype {
|
||||
CPU_BROADWELL_DE,
|
||||
CPU_BROADWELL_EPEX,
|
||||
CPU_KNIGHTS_LANDING,
|
||||
+ CPU_KNIGHTS_MILL,
|
||||
};
|
||||
|
||||
struct mce_event {
|
|
@ -0,0 +1,344 @@
|
|||
commit f9a5724021d8bc9f38cee3a0a71eb4032da1ec66
|
||||
Author: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Date: Mon Sep 19 15:28:33 2016 -0400
|
||||
|
||||
rasdaemon: add support for Skylake client and server
|
||||
|
||||
Based on upstream mcelog commits
|
||||
6c07f906dadfe2c4bb7a21e5fc60dc2f34056bf0
|
||||
e4aca6312aee03066ab45632a7bee23dc892a425
|
||||
|
||||
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
|
||||
|
||||
---
|
||||
Makefile.am | 2
|
||||
mce-intel-skx.c | 257 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
mce-intel.c | 3
|
||||
ras-mce-handler.c | 6 +
|
||||
ras-mce-handler.h | 3
|
||||
5 files changed, 270 insertions(+), 1 deletion(-)
|
||||
|
||||
--- rasdaemon-0.4.1.orig/Makefile.am 2017-05-30 12:43:11.975591485 -0400
|
||||
+++ rasdaemon-0.4.1/Makefile.am 2017-05-30 12:43:16.948531592 -0400
|
||||
@@ -30,7 +30,7 @@ if WITH_MCE
|
||||
mce-intel-dunnington.c mce-intel-tulsa.c \
|
||||
mce-intel-sb.c mce-intel-ivb.c mce-intel-haswell.c \
|
||||
mce-intel-knl.c mce-intel-broadwell-de.c \
|
||||
- mce-intel-broadwell-epex.c
|
||||
+ mce-intel-broadwell-epex.c mce-intel-skx.c
|
||||
endif
|
||||
if WITH_EXTLOG
|
||||
rasdaemon_SOURCES += ras-extlog-handler.c
|
||||
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
|
||||
+++ rasdaemon-0.4.1/mce-intel-skx.c 2017-05-30 12:43:16.948531592 -0400
|
||||
@@ -0,0 +1,257 @@
|
||||
+/*
|
||||
+ * The code below came from Tony Luck mcelog code,
|
||||
+ * released under GNU General Public License, v.2
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or modify
|
||||
+ * it under the terms of the GNU General Public License as published by
|
||||
+ * the Free Software Foundation; either version 2 of the License, or
|
||||
+ * (at your option) any later version.
|
||||
+ *
|
||||
+ * This program is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
+ * GNU General Public License for more details.
|
||||
+ *
|
||||
+ * You should have received a copy of the GNU General Public License
|
||||
+ * along with this program; if not, write to the Free Software
|
||||
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
+*/
|
||||
+
|
||||
+#include <string.h>
|
||||
+#include <stdio.h>
|
||||
+
|
||||
+#include "ras-mce-handler.h"
|
||||
+#include "bitfield.h"
|
||||
+
|
||||
+
|
||||
+/* See IA32 SDM Vol3B Table 16-27 */
|
||||
+
|
||||
+static char *pcu_1[] = {
|
||||
+ [0x00] = "No Error",
|
||||
+ [0x0d] = "MCA_DMI_TRAINING_TIMEOUT",
|
||||
+ [0x0f] = "MCA_DMI_CPU_RESET_ACK_TIMEOUT",
|
||||
+ [0x10] = "MCA_MORE_THAN_ONE_LT_AGENT",
|
||||
+ [0x1e] = "MCA_BIOS_RST_CPL_INVALID_SEQ",
|
||||
+ [0x1f] = "MCA_BIOS_INVALID_PKG_STATE_CONFIG",
|
||||
+ [0x25] = "MCA_MESSAGE_CHANNEL_TIMEOUT",
|
||||
+ [0x27] = "MCA_MSGCH_PMREQ_CMP_TIMEOUT",
|
||||
+ [0x30] = "MCA_PKGC_DIRECT_WAKE_RING_TIMEOUT",
|
||||
+ [0x31] = "MCA_PKGC_INVALID_RSP_PCH",
|
||||
+ [0x33] = "MCA_PKGC_WATCHDOG_HANG_CBZ_DOWN",
|
||||
+ [0x34] = "MCA_PKGC_WATCHDOG_HANG_CBZ_UP",
|
||||
+ [0x38] = "MCA_PKGC_WATCHDOG_HANG_C3_UP_SF",
|
||||
+ [0x40] = "MCA_SVID_VCCIN_VR_ICC_MAX_FAILURE",
|
||||
+ [0x41] = "MCA_SVID_COMMAND_TIMEOUT",
|
||||
+ [0x42] = "MCA_SVID_VCCIN_VR_VOUT_FAILURE",
|
||||
+ [0x43] = "MCA_SVID_CPU_VR_CAPABILITY_ERROR",
|
||||
+ [0x44] = "MCA_SVID_CRITICAL_VR_FAILED",
|
||||
+ [0x45] = "MCA_SVID_SA_ITD_ERROR",
|
||||
+ [0x46] = "MCA_SVID_READ_REG_FAILED",
|
||||
+ [0x47] = "MCA_SVID_WRITE_REG_FAILED",
|
||||
+ [0x48] = "MCA_SVID_PKGC_INIT_FAILED",
|
||||
+ [0x49] = "MCA_SVID_PKGC_CONFIG_FAILED",
|
||||
+ [0x4a] = "MCA_SVID_PKGC_REQUEST_FAILED",
|
||||
+ [0x4b] = "MCA_SVID_IMON_REQUEST_FAILED",
|
||||
+ [0x4c] = "MCA_SVID_ALERT_REQUEST_FAILED",
|
||||
+ [0x4d] = "MCA_SVID_MCP_VR_ABSENT_OR_RAMP_ERROR",
|
||||
+ [0x4e] = "MCA_SVID_UNEXPECTED_MCP_VR_DETECTED",
|
||||
+ [0x51] = "MCA_FIVR_CATAS_OVERVOL_FAULT",
|
||||
+ [0x52] = "MCA_FIVR_CATAS_OVERCUR_FAULT",
|
||||
+ [0x58] = "MCA_WATCHDOG_TIMEOUT_PKGC_SLAVE",
|
||||
+ [0x59] = "MCA_WATCHDOG_TIMEOUT_PKGC_MASTER",
|
||||
+ [0x5a] = "MCA_WATCHDOG_TIMEOUT_PKGS_MASTER",
|
||||
+ [0x61] = "MCA_PKGS_CPD_UNCPD_TIMEOUT",
|
||||
+ [0x63] = "MCA_PKGS_INVALID_REQ_PCH",
|
||||
+ [0x64] = "MCA_PKGS_INVALID_REQ_INTERNAL",
|
||||
+ [0x65] = "MCA_PKGS_INVALID_RSP_INTERNAL",
|
||||
+ [0x6b] = "MCA_PKGS_SMBUS_VPP_PAUSE_TIMEOUT",
|
||||
+ [0x81] = "MCA_RECOVERABLE_DIE_THERMAL_TOO_HOT",
|
||||
+};
|
||||
+
|
||||
+static struct field pcu_mc4[] = {
|
||||
+ FIELD(24, pcu_1),
|
||||
+ {}
|
||||
+};
|
||||
+
|
||||
+/* See IA32 SDM Vol3B Table 16-28 */
|
||||
+
|
||||
+static char *qpi[] = {
|
||||
+ [0x00] = "UC Phy Initialization Failure",
|
||||
+ [0x01] = "UC Phy detected drift buffer alarm",
|
||||
+ [0x02] = "UC Phy detected latency buffer rollover",
|
||||
+ [0x10] = "UC LL Rx detected CRC error: unsuccessful LLR: entered abort state",
|
||||
+ [0x11] = "UC LL Rx unsupported or undefined packet",
|
||||
+ [0x12] = "UC LL or Phy control error",
|
||||
+ [0x13] = "UC LL Rx parameter exchange exception",
|
||||
+ [0x1F] = "UC LL detected control error from the link-mesh interface",
|
||||
+ [0x20] = "COR Phy initialization abort",
|
||||
+ [0x21] = "COR Phy reset",
|
||||
+ [0x22] = "COR Phy lane failure, recovery in x8 width",
|
||||
+ [0x23] = "COR Phy L0c error corrected without Phy reset",
|
||||
+ [0x24] = "COR Phy L0c error triggering Phy Reset",
|
||||
+ [0x25] = "COR Phy L0p exit error corrected with Phy reset",
|
||||
+ [0x30] = "COR LL Rx detected CRC error - successful LLR without Phy Reinit",
|
||||
+ [0x31] = "COR LL Rx detected CRC error - successful LLR with Phy Reinit",
|
||||
+};
|
||||
+
|
||||
+static struct field qpi_mc[] = {
|
||||
+ FIELD(16, qpi),
|
||||
+ {}
|
||||
+};
|
||||
+
|
||||
+/* These apply to MSCOD 0x12 "UC LL or Phy control error" */
|
||||
+static struct field qpi_0x12[] = {
|
||||
+ SBITFIELD(22, "Phy Control Error"),
|
||||
+ SBITFIELD(23, "Unexpected Retry.Ack flit"),
|
||||
+ SBITFIELD(24, "Unexpected Retry.Req flit"),
|
||||
+ SBITFIELD(25, "RF parity error"),
|
||||
+ SBITFIELD(26, "Routeback Table error"),
|
||||
+ SBITFIELD(27, "unexpected Tx Protocol flit (EOP, Header or Data)"),
|
||||
+ SBITFIELD(28, "Rx Header-or-Credit BGF credit overflow/underflow"),
|
||||
+ SBITFIELD(29, "Link Layer Reset still in progress when Phy enters L0"),
|
||||
+ SBITFIELD(30, "Link Layer reset initiated while protocol traffic not idle"),
|
||||
+ SBITFIELD(31, "Link Layer Tx Parity Error"),
|
||||
+ {}
|
||||
+};
|
||||
+
|
||||
+/* See IA32 SDM Vol3B Table 16-29 */
|
||||
+
|
||||
+static struct field mc_bits[] = {
|
||||
+ SBITFIELD(16, "Address parity error"),
|
||||
+ SBITFIELD(17, "HA write data parity error"),
|
||||
+ SBITFIELD(18, "HA write byte enable parity error"),
|
||||
+ SBITFIELD(19, "Corrected patrol scrub error"),
|
||||
+ SBITFIELD(20, "Uncorrected patrol scrub error"),
|
||||
+ SBITFIELD(21, "Corrected spare error"),
|
||||
+ SBITFIELD(22, "Uncorrected spare error"),
|
||||
+ SBITFIELD(23, "Any HA read error"),
|
||||
+ SBITFIELD(24, "WDB read parity error"),
|
||||
+ SBITFIELD(25, "DDR4 command address parity error"),
|
||||
+ SBITFIELD(26, "Uncorrected address parity error"),
|
||||
+ {}
|
||||
+};
|
||||
+
|
||||
+static char *mc_0x8xx[] = {
|
||||
+ [0x0] = "Unrecognized request type",
|
||||
+ [0x1] = "Read response to an invalid scoreboard entry",
|
||||
+ [0x2] = "Unexpected read response",
|
||||
+ [0x3] = "DDR4 completion to an invalid scoreboard entry",
|
||||
+ [0x4] = "Completion to an invalid scoreboard entry",
|
||||
+ [0x5] = "Completion FIFO overflow",
|
||||
+ [0x6] = "Correctable parity error",
|
||||
+ [0x7] = "Uncorrectable error",
|
||||
+ [0x8] = "Interrupt received while outstanding interrupt was not ACKed",
|
||||
+ [0x9] = "ERID FIFO overflow",
|
||||
+ [0xa] = "Error on Write credits",
|
||||
+ [0xb] = "Error on Read credits",
|
||||
+ [0xc] = "Scheduler error",
|
||||
+ [0xd] = "Error event",
|
||||
+};
|
||||
+
|
||||
+static struct field memctrl_mc13[] = {
|
||||
+ FIELD(16, mc_0x8xx),
|
||||
+ {}
|
||||
+};
|
||||
+
|
||||
+/* See IA32 SDM Vol3B Table 16-30 */
|
||||
+
|
||||
+static struct field m2m[] = {
|
||||
+ SBITFIELD(16, "MscodDataRdErr"),
|
||||
+ SBITFIELD(17, "Reserved"),
|
||||
+ SBITFIELD(18, "MscodPtlWrErr"),
|
||||
+ SBITFIELD(19, "MscodFullWrErr"),
|
||||
+ SBITFIELD(20, "MscodBgfErr"),
|
||||
+ SBITFIELD(21, "MscodTimeout"),
|
||||
+ SBITFIELD(22, "MscodParErr"),
|
||||
+ SBITFIELD(23, "MscodBucket1Err"),
|
||||
+ {}
|
||||
+};
|
||||
+
|
||||
+void skylake_xeon_decode_model(struct ras_events *ras, struct mce_event *e)
|
||||
+{
|
||||
+ uint64_t status = e->status;
|
||||
+ uint32_t mca = status & 0xffff;
|
||||
+ unsigned rank0 = -1, rank1 = -1, chan;
|
||||
+
|
||||
+ switch (e->bank) {
|
||||
+ case 4:
|
||||
+ switch (EXTRACT(status, 0, 15) & ~(1ull << 12)) {
|
||||
+ case 0x402: case 0x403:
|
||||
+ mce_snprintf(e->mcastatus_msg, "Internal errors ");
|
||||
+ break;
|
||||
+ case 0x406:
|
||||
+ mce_snprintf(e->mcastatus_msg, "Intel TXT errors ");
|
||||
+ break;
|
||||
+ case 0x407:
|
||||
+ mce_snprintf(e->mcastatus_msg, "Other UBOX Internal errors ");
|
||||
+ break;
|
||||
+ }
|
||||
+ if (EXTRACT(status, 16, 19))
|
||||
+ mce_snprintf(e->mcastatus_msg, "PCU internal error ");
|
||||
+ decode_bitfield(e, status, pcu_mc4);
|
||||
+ break;
|
||||
+ case 5:
|
||||
+ case 12:
|
||||
+ case 19:
|
||||
+ mce_snprintf(e->mcastatus_msg, "QPI: ");
|
||||
+ decode_bitfield(e, status, qpi_mc);
|
||||
+ if ((EXTRACT(status, 16, 21) == 0x12))
|
||||
+ decode_bitfield(e, status, qpi_0x12);
|
||||
+ break;
|
||||
+ case 7:
|
||||
+ case 8:
|
||||
+ mce_snprintf(e->mcastatus_msg, "M2M: ");
|
||||
+ decode_bitfield(e, status, m2m);
|
||||
+ break;
|
||||
+ case 13:
|
||||
+ case 14:
|
||||
+ case 15:
|
||||
+ case 16:
|
||||
+ mce_snprintf(e->mcastatus_msg, "MemCtrl: ");
|
||||
+ if (EXTRACT(status, 27, 27))
|
||||
+ decode_bitfield(e, status, memctrl_mc13);
|
||||
+ else
|
||||
+ decode_bitfield(e, status, mc_bits);
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ /*
|
||||
+ * Memory error specific code. Returns if the error is not a MC one
|
||||
+ */
|
||||
+
|
||||
+ /* Check if the error is at the memory controller */
|
||||
+ if ((mca >> 7) != 1)
|
||||
+ return;
|
||||
+
|
||||
+ /* Ignore unless this is an corrected extended error from an iMC bank */
|
||||
+ if (e->bank < 9 || e->bank > 16 || (status & MCI_STATUS_UC) ||
|
||||
+ !test_prefix(7, status & 0xefff))
|
||||
+ return;
|
||||
+
|
||||
+ /*
|
||||
+ * Parse the reported channel and ranks
|
||||
+ */
|
||||
+
|
||||
+ chan = EXTRACT(status, 0, 3);
|
||||
+ if (chan == 0xf)
|
||||
+ return;
|
||||
+
|
||||
+ mce_snprintf(e->mc_location, "memory_channel=%d", chan);
|
||||
+
|
||||
+ if (EXTRACT(e->misc, 62, 62)) {
|
||||
+ rank0 = EXTRACT(e->misc, 46, 50);
|
||||
+ if (EXTRACT(e->misc, 63, 63))
|
||||
+ rank1 = EXTRACT(e->misc, 51, 55);
|
||||
+ }
|
||||
+
|
||||
+ /*
|
||||
+ * FIXME: The conversion from rank to dimm requires to parse the
|
||||
+ * DMI tables and call failrank2dimm().
|
||||
+ */
|
||||
+ if (rank0 != -1 && rank1 != -1)
|
||||
+ mce_snprintf(e->mc_location, "ranks=%d and %d",
|
||||
+ rank0, rank1);
|
||||
+ else if (rank0 != -1)
|
||||
+ mce_snprintf(e->mc_location, "rank=%d", rank0);
|
||||
+}
|
||||
+
|
||||
--- rasdaemon-0.4.1.orig/mce-intel.c 2017-05-30 12:43:11.975591485 -0400
|
||||
+++ rasdaemon-0.4.1/mce-intel.c 2017-05-30 12:43:16.948531592 -0400
|
||||
@@ -408,6 +408,9 @@ if (test_prefix(11, (e->status & 0xffffL
|
||||
case CPU_BROADWELL_EPEX:
|
||||
broadwell_epex_decode_model(ras, e);
|
||||
break;
|
||||
+ case CPU_SKYLAKE_XEON:
|
||||
+ skylake_xeon_decode_model(ras, e);
|
||||
+ break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
--- rasdaemon-0.4.1.orig/ras-mce-handler.c 2017-05-30 12:43:16.948531592 -0400
|
||||
+++ rasdaemon-0.4.1/ras-mce-handler.c 2017-05-30 12:44:00.295009527 -0400
|
||||
@@ -54,6 +54,8 @@ [CPU_XEON75XX] = "Intel Xeon 7500 series
|
||||
[CPU_BROADWELL_EPEX] = "Broadwell EP/EX",
|
||||
[CPU_KNIGHTS_LANDING] = "Knights Landing",
|
||||
[CPU_KNIGHTS_MILL] = "Knights Mill",
|
||||
+ [CPU_SKYLAKE] = "Skylake",
|
||||
+ [CPU_SKYLAKE_XEON] = "Skylake Xeon",
|
||||
};
|
||||
|
||||
static enum cputype select_intel_cputype(struct ras_events *ras)
|
||||
@@ -103,6 +105,10 @@ else if (mce->model == 0x57)
|
||||
return CPU_KNIGHTS_LANDING;
|
||||
else if (mce->model == 0x85)
|
||||
return CPU_KNIGHTS_MILL;
|
||||
+ else if (mce->model == 0x4e || mce->model == 0x5e)
|
||||
+ return CPU_SKYLAKE;
|
||||
+ else if (mce->model == 0x55)
|
||||
+ return CPU_SKYLAKE_XEON;
|
||||
|
||||
if (mce->model > 0x1a) {
|
||||
log(ALL, LOG_INFO,
|
||||
--- rasdaemon-0.4.1.orig/ras-mce-handler.h 2017-05-30 12:43:11.976591473 -0400
|
||||
+++ rasdaemon-0.4.1/ras-mce-handler.h 2017-05-30 12:44:25.745703000 -0400
|
||||
@@ -49,6 +49,8 @@ enum cputype {
|
||||
CPU_BROADWELL_EPEX,
|
||||
CPU_KNIGHTS_LANDING,
|
||||
CPU_KNIGHTS_MILL,
|
||||
+ CPU_SKYLAKE,
|
||||
+ CPU_SKYLAKE_XEON,
|
||||
};
|
||||
|
||||
struct mce_event {
|
||||
@@ -126,6 +128,7 @@ void knl_decode_model(struct ras_events
|
||||
void tulsa_decode_model(struct mce_event *e);
|
||||
void broadwell_de_decode_model(struct ras_events *ras, struct mce_event *e);
|
||||
void broadwell_epex_decode_model(struct ras_events *ras, struct mce_event *e);
|
||||
+void skylake_xeon_decode_model(struct ras_events *ras, struct mce_event *e);
|
||||
|
||||
/* Software defined banks */
|
||||
#define MCE_EXTENDED_BANK 128
|
|
@ -0,0 +1,142 @@
|
|||
---
|
||||
labels/dell | 96 +++++++++++++++++++++++++++++++++++-------------------------
|
||||
1 file changed, 56 insertions(+), 40 deletions(-)
|
||||
|
||||
--- rasdaemon-0.4.1.orig/labels/dell 2017-08-23 16:14:36.086652150 -0400
|
||||
+++ rasdaemon-0.4.1/labels/dell 2017-08-23 16:16:59.091057241 -0400
|
||||
@@ -4,23 +4,35 @@
|
||||
# labels are found from the silk screen on the motherboard.
|
||||
#
|
||||
#Vendor: <vendor-name>
|
||||
+# Product: <product-name>
|
||||
# Model: <model-name>
|
||||
# <label>: <mc>.<top>.<mid>.<low>
|
||||
#
|
||||
|
||||
Vendor: Dell Inc.
|
||||
-#### 11G ####
|
||||
+# 1-socket
|
||||
+ Product: PowerEdge R220, PowerEdge R330, PowerEdge T330, PowerEdge R230, PowerEdge T130, PowerEdge T30
|
||||
+ DIMM_A1: 0.0.0; DIMM_A2: 0.0.1;
|
||||
+ DIMM_A3: 0.1.0; DIMM_A4: 0.1.1;
|
||||
+
|
||||
+ Product: PowerEdge T110 II, PowerEdge T20
|
||||
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0;
|
||||
+
|
||||
+ DIMM_B1: 0.0.1; DIMM_B2: 0.1.1;
|
||||
+
|
||||
+ Product: PowerEdge R320, PowerEdge T320
|
||||
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0;
|
||||
+ DIMM_A4: 0.0.1; DIMM_A5: 0.1.1; DIMM_A6: 0.2.1;
|
||||
+
|
||||
# 2-socket
|
||||
-# PowerEdge R610
|
||||
- Model: 0K399H, 0F0XJ6
|
||||
+ Product: PowerEdge R610
|
||||
DIMM_A1: 0.0.0; DIMM_A2: 0.0.1; DIMM_A3: 0.0.2;
|
||||
DIMM_A4: 0.1.0; DIMM_A5: 0.1.1; DIMM_A6: 0.1.2;
|
||||
|
||||
DIMM_B1: 1.0.0; DIMM_B2: 1.0.1; DIMM_B3: 1.0.2;
|
||||
DIMM_B4: 1.1.0; DIMM_B5: 1.1.1; DIMM_B6: 1.1.2;
|
||||
|
||||
-# PowerEdge T710 R710
|
||||
- Model: 01CTXG, 0N0H4P, 0MD99X, 0N047H, 0PV9DG
|
||||
+ Product: PowerEdge T710, PowerEdge R710
|
||||
DIMM_A3: 0.0.0; DIMM_A2: 0.1.0; DIMM_A1: 0.2.0;
|
||||
DIMM_A6: 0.0.1; DIMM_A5: 0.1.1; DIMM_A4: 0.2.1;
|
||||
DIMM_A9: 0.0.2; DIMM_A8: 0.1.2; DIMM_A7: 0.2.2;
|
||||
@@ -29,27 +41,7 @@ DIMM_B3: 1.0.0; DIMM_B2: 1.1.0; DIMM_B1
|
||||
DIMM_B6: 1.0.1; DIMM_B5: 1.1.1; DIMM_B4: 1.2.1;
|
||||
DIMM_B9: 1.0.2; DIMM_B8: 1.1.2; DIMM_B7: 1.2.2;
|
||||
|
||||
-#### 12/13G ####
|
||||
-# 1-socket
|
||||
-# PowerEdge R220
|
||||
- Model: 081N4V
|
||||
- DIMM_A1: 0.0.0; DIMM_A2: 0.0.1;
|
||||
- DIMM_A3: 0.1.0; DIMM_A4: 0.1.1;
|
||||
-
|
||||
-#PowerEdge T110 II, T20
|
||||
- Model: 0PC2WT, 0PM2CW, 015TH9, 0MDHN4, 0VD5HY
|
||||
- DIMM_A1: 0.0.0; DIMM_A2: 0.1.0;
|
||||
-
|
||||
- DIMM_B1: 0.0.1; DIMM_B2: 0.1.1;
|
||||
-
|
||||
-#PowerEdge R320 T320
|
||||
- Model: 0YCV59, 0Y97HY, 07DKYR, 0VJ84C, 07MYHN, 04DMNN, 0W7H8C, 0K20G5, 0V719V, 0FDT3J
|
||||
- DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0;
|
||||
- DIMM_A4: 0.0.1; DIMM_A5: 0.1.1; DIMM_A6: 0.2.1;
|
||||
-
|
||||
-# 2-socket
|
||||
-# PowerEdge R620/T620 R720/xd R730/xd T630 R730 R630 T620 M620, FC620
|
||||
- Model: 0VWT90, 07NDJ2, 0F5XM3, 0PXXHP, 0X3D66, 061P35, 0H5J4J, 00W9X3, 0599V5, 0W9WXC, 0599V5, 0H21J3, 0CNCJW, 02CD1V, 0T5TFW, 0F5XM3, 0G1CNH, 05YV77, 0PDCCX, 093MW8, 0NJVT7
|
||||
+ Product: PowerEdge R620, PowerEdge T620, PowerEdge R720xd, PowerEdge R730xd, PowerEdge T630, PowerEdge R730, PowerEdge R630, PowerEdge T620, PowerEdge M620, PowerEdge FC620, PowerEdge M630, PowerEdge FC630
|
||||
DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0;
|
||||
DIMM_A5: 0.0.1; DIMM_A6: 0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1;
|
||||
DIMM_A9: 0.0.2; DIMM_A10: 0.1.2; DIMM_A11: 0.2.2; DIMM_A12: 0.3.2;
|
||||
@@ -58,23 +50,38 @@ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_
|
||||
DIMM_B5: 1.0.1; DIMM_B6: 1.1.1; DIMM_B7: 1.2.1; DIMM_B8: 1.3.1;
|
||||
DIMM_B9: 1.0.2; DIMM_B10: 1.1.2; DIMM_B11: 1.2.2; DIMM_B12: 1.3.2;
|
||||
|
||||
-# PowerEdge M520 R420 T420
|
||||
- Model: 0NRG83, 0DW6GX, 03WPHJ, 06HTRX, 0H1Y24, 02T9N6, 0TT5P2, 0CPKXG, 03015M, 061VPC, 0PC9H0, 0K3G34, 0PC0V5, 08NVYK
|
||||
- DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0;
|
||||
- DIMM_A4: 0.0.1; DIMM_A5: 0.1.1; DIMM_A6: 0.2.1;
|
||||
-
|
||||
- DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0;
|
||||
- DIMM_B4: 1.0.1; DIMM_B5: 1.1.1; DIMM_B6: 1.2.1;
|
||||
-
|
||||
-#PowerEdge FC420, M420
|
||||
- Model: 0DPJGD, 068CTP, 0MN3VC, 0417VP
|
||||
- DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0;
|
||||
+ Product: PowerEdge M520, PowerEdge R420, PowerEdge T420
|
||||
+ DIMM_A1: 0.1.0; DIMM_A2: 0.2.0; DIMM_A3: 0.3.0;
|
||||
+ DIMM_A4: 0.1.1; DIMM_A5: 0.2.1; DIMM_A6: 0.3.1;
|
||||
+
|
||||
+ DIMM_B1: 1.1.0; DIMM_B2: 1.2.0; DIMM_B3: 1.3.0;
|
||||
+ DIMM_B4: 1.1.1; DIMM_B5: 1.2.1; DIMM_B6: 1.3.1;
|
||||
|
||||
- DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0;
|
||||
+ Product: PowerEdge FC420, PowerEdge M420
|
||||
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0;
|
||||
+
|
||||
+ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0;
|
||||
+
|
||||
+ Product: PowerEdge C6320, PowerEdge C4130
|
||||
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0;
|
||||
+ DIMM_A5: 0.0.1; DIMM_A6: 0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1;
|
||||
+
|
||||
+ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0;
|
||||
+ DIMM_B5: 1.0.1; DIMM_B6: 1.1.1; DIMM_B7: 1.2.1; DIMM_B8: 1.3.1;
|
||||
+
|
||||
+ Product: PowerEdge R430, PowerEdge T430, PowerEdge R530
|
||||
+ DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0;
|
||||
+ DIMM_A5: 0.0.1; DIMM_A6: 0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1;
|
||||
+
|
||||
+ DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0;
|
||||
+
|
||||
+ Product: PowerEdge FC430
|
||||
+ DIMM_A1: 0.1.0; DIMM_A2: 0.0.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0;
|
||||
+
|
||||
+ DIMM_B1: 1.1.0; DIMM_B2: 1.0.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0;
|
||||
|
||||
# 4-socket
|
||||
-# # PowerEdge M820
|
||||
- Model: 0RN9TC, 0YWR73, 066N7P, 0PFG1N, 0JC2W3
|
||||
+ Product: PowerEdge M820, PowerEdge R830, PowerEdge M830, PowerEdge R930, PowerEdge FC830
|
||||
DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0;
|
||||
DIMM_A5: 0.0.1; DIMM_A6: 0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1;
|
||||
DIMM_A9: 0.0.2; DIMM_A10: 0.1.2; DIMM_A11: 0.2.2; DIMM_A12: 0.3.2;
|
||||
@@ -90,3 +97,12 @@ DIMM_C9: 2.0.2; DIMM_C10: 2.1.2;
|
||||
DIMM_D1: 3.0.0; DIMM_D2: 3.1.0; DIMM_D3: 3.2.0; DIMM_D4: 3.3.0;
|
||||
DIMM_D5: 3.0.1; DIMM_D6: 3.1.1; DIMM_D7: 3.2.1; DIMM_D8: 3.3.1;
|
||||
DIMM_D9: 3.0.2; DIMM_D10: 3.1.2; DIMM_D11: 3.2.2; DIMM_D12: 3.3.2;
|
||||
+
|
||||
+ Product: PowerEdge FM120x4
|
||||
+ DIMM_A_A1: 0.1.0; DIMM_A_A2: 0.2.0;
|
||||
+
|
||||
+ DIMM_B_A1: 1.1.0; DIMM_B_A2: 1.2.0;
|
||||
+
|
||||
+ DIMM_C_A1: 2.1.0; DIMM_C_A2: 2.2.0;
|
||||
+
|
||||
+ DIMM_D_A1: 3.1.0; DIMM_D_A2: 3.2.0;
|
|
@ -0,0 +1,69 @@
|
|||
From 993b8c40bd0c09a177d52c4f41b09ef2c969fa8d Mon Sep 17 00:00:00 2001
|
||||
From: "Charles.Rose@dell.com" <Charles.Rose@dell.com>
|
||||
Date: Fri, 11 Aug 2017 20:09:10 +0000
|
||||
Subject: [PATCH] rasdaemon: Update DIMM labels for Intel Skylake servers
|
||||
|
||||
Update labels for Intel Skylake based Dell PowerEdge servers.
|
||||
|
||||
Signed-off-by: Charles Rose <charles_rose@dell.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
|
||||
---
|
||||
labels/dell | 31 +++++++++++++++++++++++++++++++
|
||||
1 file changed, 31 insertions(+)
|
||||
|
||||
diff --git a/labels/dell b/labels/dell
|
||||
index 5abcd90..58455df 100644
|
||||
--- a/labels/dell
|
||||
+++ b/labels/dell
|
||||
@@ -50,6 +50,13 @@ Vendor: Dell Inc.
|
||||
DIMM_B5: 1.0.1; DIMM_B6: 1.1.1; DIMM_B7: 1.2.1; DIMM_B8: 1.3.1;
|
||||
DIMM_B9: 1.0.2; DIMM_B10: 1.1.2; DIMM_B11: 1.2.2; DIMM_B12: 1.3.2;
|
||||
|
||||
+ Product: PowerEdge R640, PowerEdge R740, PowerEdge R740xd
|
||||
+ A1: 0.0.0; A2: 0.1.0; A3: 0.2.0; A4: 1.0.0; A5: 1.1.0; A6: 1.2.0;
|
||||
+ A7: 0.0.1; A8: 0.1.1; A9: 0.2.1; A10: 1.0.1; A11: 1.1.1; A12: 1.2.1;
|
||||
+
|
||||
+ B1: 2.0.0; B2: 2.1.0; B3: 2.2.0; B4: 3.0.0; B5: 3.1.0; B6: 3.2.0;
|
||||
+ B7: 2.0.1; B8: 2.1.1; B9: 2.2.1; B10: 3.0.1; B11: 3.1.1; B12: 3.2.1;
|
||||
+
|
||||
Product: PowerEdge M520, PowerEdge R420, PowerEdge T420
|
||||
DIMM_A1: 0.1.0; DIMM_A2: 0.2.0; DIMM_A3: 0.3.0;
|
||||
DIMM_A4: 0.1.1; DIMM_A5: 0.2.1; DIMM_A6: 0.3.1;
|
||||
@@ -69,6 +76,17 @@ Vendor: Dell Inc.
|
||||
DIMM_B1: 1.0.0; DIMM_B2: 1.1.0; DIMM_B3: 1.2.0; DIMM_B4: 1.3.0;
|
||||
DIMM_B5: 1.0.1; DIMM_B6: 1.1.1; DIMM_B7: 1.2.1; DIMM_B8: 1.3.1;
|
||||
|
||||
+ Product: PowerEdge C6320p
|
||||
+ A1: 0.0.0; B1: 0.1.0; C1: 0.2.0;
|
||||
+ D1: 1.0.0; E1: 1.1.0; F1: 1.2.0;
|
||||
+
|
||||
+ Product: PowerEdge C6420
|
||||
+ A1: 0.0.0; A2: 0.1.0; A3: 0.2.0; A4: 1.0.0; A5: 1.1.0; A6: 1.2.0;
|
||||
+ A7: 0.0.1; A8: 1.0.1;
|
||||
+
|
||||
+ B1: 2.0.0; B2: 2.1.0; B3: 2.2.0; B4: 3.0.0; B5: 3.1.0; B6: 3.2.0;
|
||||
+ B7: 2.0.1; B8: 3.0.1;
|
||||
+
|
||||
Product: PowerEdge R430, PowerEdge T430, PowerEdge R530
|
||||
DIMM_A1: 0.0.0; DIMM_A2: 0.1.0; DIMM_A3: 0.2.0; DIMM_A4: 0.3.0;
|
||||
DIMM_A5: 0.0.1; DIMM_A6: 0.1.1; DIMM_A7: 0.2.1; DIMM_A8: 0.3.1;
|
||||
@@ -106,3 +124,16 @@ Vendor: Dell Inc.
|
||||
DIMM_C_A1: 2.1.0; DIMM_C_A2: 2.2.0;
|
||||
|
||||
DIMM_D_A1: 3.1.0; DIMM_D_A2: 3.2.0;
|
||||
+
|
||||
+ Product: PowerEdge R940
|
||||
+ A1: 0.0.0; A2: 0.1.0; A3: 0.2.0; A4: 1.0.0; A5: 1.1.0; A6: 1.2.0;
|
||||
+ A7: 0.0.1; A8: 0.1.1; A9: 0.2.1; A10: 1.0.1; A11: 1.1.1; A12: 1.2.1;
|
||||
+
|
||||
+ B1: 2.0.0; B2: 2.1.0; B3: 2.2.0; B4: 3.0.0; B5: 3.1.0; B6: 3.2.0;
|
||||
+ B7: 2.0.1; B8: 2.1.1; B9: 2.2.1; B10: 3.0.1; B11: 3.1.1; B12: 3.2.1;
|
||||
+
|
||||
+ C1: 4.0.0; C2: 4.1.0; C3: 4.2.0; C4: 5.0.0; C5: 5.1.0; C6: 5.2.0;
|
||||
+ C7: 4.0.1; C8: 4.1.1; C9: 4.2.1; C10: 5.0.1; C11: 5.1.1; C12: 5.2.1;
|
||||
+
|
||||
+ D1: 6.0.0; D2: 6.1.0; D3: 6.2.0; D4: 7.0.0; D5: 7.1.0; D6: 7.2.0;
|
||||
+ D7: 6.0.1; D8: 6.1.1; D9: 6.2.1; D10: 7.0.1; D11: 7.1.1; D12: 7.2.1;
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,601 @@
|
|||
From 624d8a1d99a2f3bd06cbc537aff3cc30201ba7c2 Mon Sep 17 00:00:00 2001
|
||||
From: Tyler Baicar <tbaicar@codeaurora.org>
|
||||
Date: Mon, 12 Jun 2017 16:16:04 -0600
|
||||
Subject: [PATCH 1/2] rasdaemon: add support for non standard CPER section
|
||||
events
|
||||
|
||||
Add support to handle the non standard CPER section kernel trace
|
||||
events which cover RAS errors whose section type is unknown.
|
||||
|
||||
Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
|
||||
---
|
||||
Makefile.am | 3 +
|
||||
configure.ac | 9 +++
|
||||
ras-events.c | 15 +++++
|
||||
ras-events.h | 8 +++
|
||||
ras-non-standard-handler.c | 147 +++++++++++++++++++++++++++++++++++++++++++++
|
||||
ras-non-standard-handler.h | 26 ++++++++
|
||||
ras-record.c | 59 ++++++++++++++++++
|
||||
ras-record.h | 15 +++++
|
||||
ras-report.c | 80 ++++++++++++++++++++++++
|
||||
ras-report.h | 18 +++++-
|
||||
10 files changed, 379 insertions(+), 1 deletion(-)
|
||||
create mode 100644 ras-non-standard-handler.c
|
||||
create mode 100644 ras-non-standard-handler.h
|
||||
|
||||
diff --git a/Makefile.am b/Makefile.am
|
||||
index a10e4b3..c5811e8 100644
|
||||
--- a/Makefile.am
|
||||
+++ b/Makefile.am
|
||||
@@ -24,6 +24,9 @@ endif
|
||||
if WITH_AER
|
||||
rasdaemon_SOURCES += ras-aer-handler.c
|
||||
endif
|
||||
+if WITH_NON_STANDARD
|
||||
+ rasdaemon_SOURCES += ras-non-standard-handler.c
|
||||
+endif
|
||||
if WITH_MCE
|
||||
rasdaemon_SOURCES += ras-mce-handler.c mce-intel.c mce-amd-k8.c \
|
||||
mce-intel-p4-p6.c mce-intel-nehalem.c \
|
||||
diff --git a/configure.ac b/configure.ac
|
||||
index 5af5227..31bf6bd 100644
|
||||
--- a/configure.ac
|
||||
+++ b/configure.ac
|
||||
@@ -44,6 +44,15 @@ AS_IF([test "x$enable_aer" = "xyes"], [
|
||||
])
|
||||
AM_CONDITIONAL([WITH_AER], [test x$enable_aer = xyes])
|
||||
|
||||
+AC_ARG_ENABLE([non_standard],
|
||||
+ AS_HELP_STRING([--enable-non-standard], [enable NON_STANDARD events (currently experimental)]))
|
||||
+
|
||||
+AS_IF([test "x$enable_non_standard" = "xyes"], [
|
||||
+ AC_DEFINE(HAVE_NON_STANDARD,1,"have UNKNOWN_SEC events collect")
|
||||
+ AC_SUBST([WITH_NON_STANDARD])
|
||||
+])
|
||||
+AM_CONDITIONAL([WITH_NON_STANDARD], [test x$enable_non_standard = xyes])
|
||||
+
|
||||
AC_ARG_ENABLE([mce],
|
||||
AS_HELP_STRING([--enable-mce], [enable MCE events (currently experimental)]))
|
||||
|
||||
diff --git a/ras-events.c b/ras-events.c
|
||||
index 0be7c3f..96aa6f1 100644
|
||||
--- a/ras-events.c
|
||||
+++ b/ras-events.c
|
||||
@@ -29,6 +29,7 @@
|
||||
#include "libtrace/event-parse.h"
|
||||
#include "ras-mc-handler.h"
|
||||
#include "ras-aer-handler.h"
|
||||
+#include "ras-non-standard-handler.h"
|
||||
#include "ras-mce-handler.h"
|
||||
#include "ras-extlog-handler.h"
|
||||
#include "ras-record.h"
|
||||
@@ -208,6 +209,10 @@ int toggle_ras_mc_event(int enable)
|
||||
rc |= __toggle_ras_mc_event(ras, "ras", "extlog_mem_event", enable);
|
||||
#endif
|
||||
|
||||
+#ifdef HAVE_NON_STANDARD
|
||||
+ rc |= __toggle_ras_mc_event(ras, "ras", "non_standard_event", enable);
|
||||
+#endif
|
||||
+
|
||||
free_ras:
|
||||
free(ras);
|
||||
return rc;
|
||||
@@ -676,6 +681,16 @@ int handle_ras_events(int record_events)
|
||||
"ras", "aer_event");
|
||||
#endif
|
||||
|
||||
+#ifdef HAVE_NON_STANDARD
|
||||
+ rc = add_event_handler(ras, pevent, page_size, "ras", "non_standard_event",
|
||||
+ ras_non_standard_event_handler);
|
||||
+ if (!rc)
|
||||
+ num_events++;
|
||||
+ else
|
||||
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
|
||||
+ "ras", "non_standard_event");
|
||||
+#endif
|
||||
+
|
||||
cpus = get_num_cpus(ras);
|
||||
|
||||
#ifdef HAVE_MCE
|
||||
diff --git a/ras-events.h b/ras-events.h
|
||||
index 64e045a..3e1008f 100644
|
||||
--- a/ras-events.h
|
||||
+++ b/ras-events.h
|
||||
@@ -68,6 +68,14 @@ enum hw_event_mc_err_type {
|
||||
HW_EVENT_ERR_INFO,
|
||||
};
|
||||
|
||||
+/* Should match the code at Kernel's include/acpi/ghes.h */
|
||||
+enum ghes_severity {
|
||||
+ GHES_SEV_NO,
|
||||
+ GHES_SEV_CORRECTED,
|
||||
+ GHES_SEV_RECOVERABLE,
|
||||
+ GHES_SEV_PANIC,
|
||||
+};
|
||||
+
|
||||
/* Function prototypes */
|
||||
int toggle_ras_mc_event(int enable);
|
||||
int handle_ras_events(int record_events);
|
||||
diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c
|
||||
new file mode 100644
|
||||
index 0000000..4c154e5
|
||||
--- /dev/null
|
||||
+++ b/ras-non-standard-handler.c
|
||||
@@ -0,0 +1,147 @@
|
||||
+/*
|
||||
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or modify
|
||||
+ * it under the terms of the GNU General Public License version 2 and
|
||||
+ * only version 2 as published by the Free Software Foundation.
|
||||
+
|
||||
+ * This program is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
+ * GNU General Public License for more details.
|
||||
+ */
|
||||
+
|
||||
+#include <stdio.h>
|
||||
+#include <stdlib.h>
|
||||
+#include <string.h>
|
||||
+#include <unistd.h>
|
||||
+#include "libtrace/kbuffer.h"
|
||||
+#include "ras-non-standard-handler.h"
|
||||
+#include "ras-record.h"
|
||||
+#include "ras-logger.h"
|
||||
+#include "ras-report.h"
|
||||
+
|
||||
+void print_le_hex(struct trace_seq *s, const uint8_t *buf, int index) {
|
||||
+ trace_seq_printf(s, "%02x%02x%02x%02x", buf[index+3], buf[index+2], buf[index+1], buf[index]);
|
||||
+}
|
||||
+
|
||||
+static char *uuid_le(const char *uu)
|
||||
+{
|
||||
+ static char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")];
|
||||
+ char *p = uuid;
|
||||
+ int i;
|
||||
+ static const unsigned char le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15};
|
||||
+
|
||||
+ for (i = 0; i < 16; i++) {
|
||||
+ p += sprintf(p, "%.2x", uu[le[i]]);
|
||||
+ switch (i) {
|
||||
+ case 3:
|
||||
+ case 5:
|
||||
+ case 7:
|
||||
+ case 9:
|
||||
+ *p++ = '-';
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ *p = 0;
|
||||
+
|
||||
+ return uuid;
|
||||
+}
|
||||
+
|
||||
+int ras_non_standard_event_handler(struct trace_seq *s,
|
||||
+ struct pevent_record *record,
|
||||
+ struct event_format *event, void *context)
|
||||
+{
|
||||
+ int len, i, line_count;
|
||||
+ unsigned long long val;
|
||||
+ struct ras_events *ras = context;
|
||||
+ time_t now;
|
||||
+ struct tm *tm;
|
||||
+ struct ras_non_standard_event ev;
|
||||
+
|
||||
+ /*
|
||||
+ * Newer kernels (3.10-rc1 or upper) provide an uptime clock.
|
||||
+ * On previous kernels, the way to properly generate an event would
|
||||
+ * be to inject a fake one, measure its timestamp and diff it against
|
||||
+ * gettimeofday. We won't do it here. Instead, let's use uptime,
|
||||
+ * falling-back to the event report's time, if "uptime" clock is
|
||||
+ * not available (legacy kernels).
|
||||
+ */
|
||||
+
|
||||
+ if (ras->use_uptime)
|
||||
+ now = record->ts/user_hz + ras->uptime_diff;
|
||||
+ else
|
||||
+ now = time(NULL);
|
||||
+
|
||||
+ tm = localtime(&now);
|
||||
+ if (tm)
|
||||
+ strftime(ev.timestamp, sizeof(ev.timestamp),
|
||||
+ "%Y-%m-%d %H:%M:%S %z", tm);
|
||||
+ trace_seq_printf(s, "%s ", ev.timestamp);
|
||||
+
|
||||
+ if (pevent_get_field_val(s, event, "sev", record, &val, 1) < 0)
|
||||
+ return -1;
|
||||
+ switch (val) {
|
||||
+ case GHES_SEV_NO:
|
||||
+ ev.severity = "Informational";
|
||||
+ break;
|
||||
+ case GHES_SEV_CORRECTED:
|
||||
+ ev.severity = "Corrected";
|
||||
+ break;
|
||||
+ case GHES_SEV_RECOVERABLE:
|
||||
+ ev.severity = "Recoverable";
|
||||
+ break;
|
||||
+ default:
|
||||
+ case GHES_SEV_PANIC:
|
||||
+ ev.severity = "Fatal";
|
||||
+ }
|
||||
+ trace_seq_printf(s, "\n %s", ev.severity);
|
||||
+
|
||||
+ ev.sec_type = pevent_get_field_raw(s, event, "sec_type", record, &len, 1);
|
||||
+ if(!ev.sec_type)
|
||||
+ return -1;
|
||||
+ trace_seq_printf(s, "\n section type: %s", uuid_le(ev.sec_type));
|
||||
+ ev.fru_text = pevent_get_field_raw(s, event, "fru_text",
|
||||
+ record, &len, 1);
|
||||
+ ev.fru_id = pevent_get_field_raw(s, event, "fru_id",
|
||||
+ record, &len, 1);
|
||||
+ trace_seq_printf(s, " fru text: %s fru id: %s ",
|
||||
+ ev.fru_text,
|
||||
+ uuid_le(ev.fru_id));
|
||||
+
|
||||
+ if (pevent_get_field_val(s, event, "len", record, &val, 1) < 0)
|
||||
+ return -1;
|
||||
+ ev.length = val;
|
||||
+ trace_seq_printf(s, "\n length: %d\n", ev.length);
|
||||
+
|
||||
+ ev.error = pevent_get_field_raw(s, event, "buf", record, &len, 1);
|
||||
+ if(!ev.error)
|
||||
+ return -1;
|
||||
+ len = ev.length;
|
||||
+ i = 0;
|
||||
+ line_count = 0;
|
||||
+ trace_seq_printf(s, " error:\n %08x: ", i);
|
||||
+ while(len >= 4) {
|
||||
+ print_le_hex(s, ev.error, i);
|
||||
+ i+=4;
|
||||
+ len-=4;
|
||||
+ if(++line_count == 4) {
|
||||
+ trace_seq_printf(s, "\n %08x: ", i);
|
||||
+ line_count = 0;
|
||||
+ } else
|
||||
+ trace_seq_printf(s, " ");
|
||||
+ }
|
||||
+
|
||||
+ /* Insert data into the SGBD */
|
||||
+#ifdef HAVE_SQLITE3
|
||||
+ ras_store_non_standard_record(ras, &ev);
|
||||
+#endif
|
||||
+
|
||||
+#ifdef HAVE_ABRT_REPORT
|
||||
+ /* Report event to ABRT */
|
||||
+ ras_report_non_standard_event(ras, &ev);
|
||||
+#endif
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
diff --git a/ras-non-standard-handler.h b/ras-non-standard-handler.h
|
||||
new file mode 100644
|
||||
index 0000000..2b5ac35
|
||||
--- /dev/null
|
||||
+++ b/ras-non-standard-handler.h
|
||||
@@ -0,0 +1,26 @@
|
||||
+/*
|
||||
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or modify
|
||||
+ * it under the terms of the GNU General Public License version 2 and
|
||||
+ * only version 2 as published by the Free Software Foundation.
|
||||
+
|
||||
+ * This program is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
+ * GNU General Public License for more details.
|
||||
+ */
|
||||
+
|
||||
+#ifndef __RAS_NON_STANDARD_HANDLER_H
|
||||
+#define __RAS_NON_STANDARD_HANDLER_H
|
||||
+
|
||||
+#include "ras-events.h"
|
||||
+#include "libtrace/event-parse.h"
|
||||
+
|
||||
+int ras_non_standard_event_handler(struct trace_seq *s,
|
||||
+ struct pevent_record *record,
|
||||
+ struct event_format *event, void *context);
|
||||
+
|
||||
+void print_le_hex(struct trace_seq *s, const uint8_t *buf, int index);
|
||||
+
|
||||
+#endif
|
||||
diff --git a/ras-record.c b/ras-record.c
|
||||
index 3dc4493..357ab61 100644
|
||||
--- a/ras-record.c
|
||||
+++ b/ras-record.c
|
||||
@@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (C) 2013 Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -157,6 +158,57 @@ int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev)
|
||||
}
|
||||
#endif
|
||||
|
||||
+/*
|
||||
+ * Table and functions to handle ras:non standard
|
||||
+ */
|
||||
+
|
||||
+#ifdef HAVE_NON_STANDARD
|
||||
+static const struct db_fields non_standard_event_fields[] = {
|
||||
+ { .name="id", .type="INTEGER PRIMARY KEY" },
|
||||
+ { .name="timestamp", .type="TEXT" },
|
||||
+ { .name="sec_type", .type="BLOB" },
|
||||
+ { .name="fru_id", .type="BLOB" },
|
||||
+ { .name="fru_text", .type="TEXT" },
|
||||
+ { .name="severity", .type="TEXT" },
|
||||
+ { .name="error", .type="BLOB" },
|
||||
+};
|
||||
+
|
||||
+static const struct db_table_descriptor non_standard_event_tab = {
|
||||
+ .name = "non_standard_event",
|
||||
+ .fields = non_standard_event_fields,
|
||||
+ .num_fields = ARRAY_SIZE(non_standard_event_fields),
|
||||
+};
|
||||
+
|
||||
+int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standard_event *ev)
|
||||
+{ /* Insert one non-standard event row; returns 0 when no statement is prepared, otherwise the sqlite3_reset() result. */
|
||||
+ int rc;
|
||||
+ struct sqlite3_priv *priv = ras->db_priv;
|
||||
+
|
||||
+ if (!priv || !priv->stmt_non_standard_record) /* database not opened or statement not prepared: nothing to store */
|
||||
+ return 0;
|
||||
+ log(TERM, LOG_INFO, "non_standard_event store: %p\n", priv->stmt_non_standard_record);
|
||||
+
|
||||
+ sqlite3_bind_text (priv->stmt_non_standard_record, 1, ev->timestamp, -1, NULL);
|
||||
+ sqlite3_bind_blob (priv->stmt_non_standard_record, 2, ev->sec_type, 16, NULL); /* 16-byte UUID like fru_id; a negative length is undefined for blobs */
|
||||
+ sqlite3_bind_blob (priv->stmt_non_standard_record, 3, ev->fru_id, 16, NULL);
|
||||
+ sqlite3_bind_text (priv->stmt_non_standard_record, 4, ev->fru_text, -1, NULL);
|
||||
+ sqlite3_bind_text (priv->stmt_non_standard_record, 5, ev->severity, -1, NULL);
|
||||
+ sqlite3_bind_blob (priv->stmt_non_standard_record, 6, ev->error, ev->length, NULL);
|
||||
+
|
||||
+ rc = sqlite3_step(priv->stmt_non_standard_record);
|
||||
+ if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "Failed to do non_standard_event step on sqlite: error = %d\n", rc);
|
||||
+ rc = sqlite3_reset(priv->stmt_non_standard_record); /* make the prepared statement reusable for the next event */
|
||||
+ if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "Failed reset non_standard_event on sqlite: error = %d\n", rc);
|
||||
+ log(TERM, LOG_INFO, "register inserted at db\n");
|
||||
+
|
||||
+ return rc;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
#ifdef HAVE_EXTLOG
|
||||
static const struct db_fields extlog_event_fields[] = {
|
||||
{ .name="id", .type="INTEGER PRIMARY KEY" },
|
||||
@@ -450,6 +502,13 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras)
|
||||
&mce_record_tab);
|
||||
#endif
|
||||
|
||||
+#ifdef HAVE_NON_STANDARD
|
||||
+ rc = ras_mc_create_table(priv, &non_standard_event_tab);
|
||||
+ if (rc == SQLITE_OK)
|
||||
+ rc = ras_mc_prepare_stmt(priv, &priv->stmt_non_standard_record,
|
||||
+ &non_standard_event_tab);
|
||||
+#endif
|
||||
+
|
||||
ras->db_priv = priv;
|
||||
return 0;
|
||||
}
|
||||
diff --git a/ras-record.h b/ras-record.h
|
||||
index 5d84297..473ae40 100644
|
||||
--- a/ras-record.h
|
||||
+++ b/ras-record.h
|
||||
@@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (C) 2013 Mauro Carvalho Chehab <mchehab@redhat.com>
|
||||
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -56,9 +57,18 @@ struct ras_extlog_event {
|
||||
unsigned short cper_data_length;
|
||||
};
|
||||
|
||||
+struct ras_non_standard_event {
|
||||
+ char timestamp[64];
|
||||
+ const char *sec_type, *fru_id, *fru_text;
|
||||
+ const char *severity;
|
||||
+ const uint8_t *error;
|
||||
+ uint32_t length;
|
||||
+};
|
||||
+
|
||||
struct ras_mc_event;
|
||||
struct ras_aer_event;
|
||||
struct ras_extlog_event;
|
||||
+struct ras_non_standard_event;
|
||||
struct mce_event;
|
||||
|
||||
#ifdef HAVE_SQLITE3
|
||||
@@ -77,6 +87,9 @@ struct sqlite3_priv {
|
||||
#ifdef HAVE_EXTLOG
|
||||
sqlite3_stmt *stmt_extlog_record;
|
||||
#endif
|
||||
+#ifdef HAVE_NON_STANDARD
|
||||
+ sqlite3_stmt *stmt_non_standard_record;
|
||||
+#endif
|
||||
};
|
||||
|
||||
int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras);
|
||||
@@ -84,6 +97,7 @@ int ras_store_mc_event(struct ras_events *ras, struct ras_mc_event *ev);
|
||||
int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev);
|
||||
int ras_store_mce_record(struct ras_events *ras, struct mce_event *ev);
|
||||
int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev);
|
||||
+int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standard_event *ev);
|
||||
|
||||
#else
|
||||
static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; };
|
||||
@@ -91,6 +105,7 @@ static inline int ras_store_mc_event(struct ras_events *ras, struct ras_mc_event
|
||||
static inline int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev) { return 0; };
|
||||
static inline int ras_store_mce_record(struct ras_events *ras, struct mce_event *ev) { return 0; };
|
||||
static inline int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev) { return 0; };
|
||||
+static inline int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standard_event *ev) { return 0; };
|
||||
|
||||
#endif
|
||||
|
||||
diff --git a/ras-report.c b/ras-report.c
|
||||
index 0a05732..1eb9f79 100644
|
||||
--- a/ras-report.c
|
||||
+++ b/ras-report.c
|
||||
@@ -1,3 +1,16 @@
|
||||
+/*
|
||||
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or modify
|
||||
+ * it under the terms of the GNU General Public License version 2 and
|
||||
+ * only version 2 as published by the Free Software Foundation.
|
||||
+
|
||||
+ * This program is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
+ * GNU General Public License for more details.
|
||||
+ */
|
||||
+
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
@@ -196,6 +209,25 @@ static int set_aer_event_backtrace(char *buf, struct ras_aer_event *ev){
|
||||
return 0;
|
||||
}
|
||||
|
||||
+static int set_non_standard_event_backtrace(char *buf, struct ras_non_standard_event *ev){ /* Append the BACKTRACE item for a non-standard event to buf; -1 on NULL arguments, 0 otherwise. */
|
||||
+ char bt_buf[MAX_BACKTRACE_SIZE];
|
||||
+
|
||||
+ if(!buf || !ev)
|
||||
+ return -1;
|
||||
+
|
||||
+ snprintf(bt_buf, sizeof(bt_buf), "BACKTRACE=" \
|
||||
+ "timestamp=%s\n" \
|
||||
+ "severity=%s\n" \
|
||||
+ "length=%u\n", \
|
||||
+ ev->timestamp, \
|
||||
+ ev->severity, \
|
||||
+ (unsigned int)ev->length); /* length is uint32_t: print it unsigned so large values do not show up negative */
|
||||
+
|
||||
+ strcat(buf, bt_buf); /* caller supplies buf and is responsible for its capacity */
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
static int commit_report_backtrace(int sockfd, int type, void *ev){
|
||||
char buf[MAX_BACKTRACE_SIZE];
|
||||
char *pbuf = buf;
|
||||
@@ -218,6 +250,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){
|
||||
case MCE_EVENT:
|
||||
rc = set_mce_event_backtrace(buf, (struct mce_event *)ev);
|
||||
break;
|
||||
+ case NON_STANDARD_EVENT:
|
||||
+ rc = set_non_standard_event_backtrace(buf, (struct ras_non_standard_event *)ev);
|
||||
+ break;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
@@ -345,6 +380,51 @@ aer_fail:
|
||||
}
|
||||
}
|
||||
|
||||
+int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev){ /* Send a non-standard event report over the socket from setup_report_socket(); 0 on success. */
|
||||
+ char buf[MAX_MESSAGE_SIZE];
|
||||
+ int sockfd = 0;
|
||||
+ int rc = -1;
|
||||
+
|
||||
+ memset(buf, 0, sizeof(buf));
|
||||
+
|
||||
+ sockfd = setup_report_socket();
|
||||
+ if(sockfd < 0){
|
||||
+ return rc;
|
||||
+ }
|
||||
+
|
||||
+ rc = commit_report_basic(sockfd);
|
||||
+ if(rc < 0){
|
||||
+ goto non_standard_fail;
|
||||
+ }
|
||||
+
|
||||
+ rc = commit_report_backtrace(sockfd, NON_STANDARD_EVENT, ev);
|
||||
+ if(rc < 0){
|
||||
+ goto non_standard_fail;
|
||||
+ }
|
||||
+
|
||||
+ sprintf(buf, "ANALYZER=%s", "rasdaemon-non-standard");
|
||||
+ rc = write(sockfd, buf, strlen(buf) + 1); /* each item is sent including its terminating NUL */
|
||||
+ if(rc < 0 || (size_t)rc < strlen(buf) + 1){ /* test -1 first: compared against size_t it would become a huge unsigned value */
|
||||
+ goto non_standard_fail;
|
||||
+ }
|
||||
+
|
||||
+ sprintf(buf, "REASON=%s", "Unknown CPER section problem");
|
||||
+ rc = write(sockfd, buf, strlen(buf) + 1);
|
||||
+ if(rc < 0 || (size_t)rc < strlen(buf) + 1){ /* same error/short-write check as above */
|
||||
+ goto non_standard_fail;
|
||||
+ }
|
||||
+
|
||||
+ rc = 0;
|
||||
+
|
||||
+non_standard_fail:
|
||||
+
|
||||
+ if(sockfd >= 0){ /* only negative values are treated as failure above, so 0 is a descriptor that must be closed too */
|
||||
+ close(sockfd);
|
||||
+ }
|
||||
+
|
||||
+ return rc;
|
||||
+}
|
||||
+
|
||||
int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev){
|
||||
char buf[MAX_MESSAGE_SIZE];
|
||||
int sockfd = 0;
|
||||
diff --git a/ras-report.h b/ras-report.h
|
||||
index 7920cdf..c2fcf42 100644
|
||||
--- a/ras-report.h
|
||||
+++ b/ras-report.h
|
||||
@@ -1,3 +1,16 @@
|
||||
+/*
|
||||
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or modify
|
||||
+ * it under the terms of the GNU General Public License version 2 and
|
||||
+ * only version 2 as published by the Free Software Foundation.
|
||||
+
|
||||
+ * This program is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
+ * GNU General Public License for more details.
|
||||
+ */
|
||||
+
|
||||
#ifndef __RAS_REPORT_H
|
||||
#define __RAS_REPORT_H
|
||||
|
||||
@@ -19,7 +32,8 @@
|
||||
enum {
|
||||
MC_EVENT,
|
||||
MCE_EVENT,
|
||||
- AER_EVENT
|
||||
+ AER_EVENT,
|
||||
+ NON_STANDARD_EVENT
|
||||
};
|
||||
|
||||
#ifdef HAVE_ABRT_REPORT
|
||||
@@ -27,12 +41,14 @@ enum {
|
||||
int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev);
|
||||
int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev);
|
||||
int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev);
|
||||
+int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev);
|
||||
|
||||
#else
|
||||
|
||||
static inline int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev) { return 0; };
|
||||
static inline int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev) { return 0; };
|
||||
static inline int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev) { return 0; };
|
||||
+static inline int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev) { return 0; };
|
||||
|
||||
#endif
|
||||
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,150 @@
|
|||
From 873e88d6ba1ce5ec97f5cc0f4f0b45dfd2026b9f Mon Sep 17 00:00:00 2001
|
||||
From: "shiju.jose@huawei.com" <shiju.jose@huawei.com>
|
||||
Date: Wed, 4 Oct 2017 10:11:08 +0100
|
||||
Subject: [PATCH] rasdaemon:add support for non-standard error decoder
|
||||
|
||||
This patch adds support for decoding non-standard
|
||||
error information.
|
||||
|
||||
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
|
||||
---
|
||||
ras-non-standard-handler.c | 62 +++++++++++++++++++++++++++++++++++++++++++++-
|
||||
ras-non-standard-handler.h | 10 ++++++++
|
||||
2 files changed, 71 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/ras-non-standard-handler.c b/ras-non-standard-handler.c
|
||||
index 4c154e5..21e6a76 100644
|
||||
--- a/ras-non-standard-handler.c
|
||||
+++ b/ras-non-standard-handler.c
|
||||
@@ -13,6 +13,7 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
+#include <stdbool.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include "libtrace/kbuffer.h"
|
||||
@@ -21,6 +22,31 @@
|
||||
#include "ras-logger.h"
|
||||
#include "ras-report.h"
|
||||
|
||||
+static p_ns_dec_tab * ns_dec_tab;
|
||||
+static size_t dec_tab_count;
|
||||
+
|
||||
+int register_ns_dec_tab(const p_ns_dec_tab tab) /* Append a decoder table to the registry; 0 on success, -1 if the registry cannot grow. */
|
||||
+{
|
||||
+ p_ns_dec_tab *grown = realloc(ns_dec_tab, /* keep the old pointer until the resize is known to have worked */
|
||||
+ (dec_tab_count + 1) * sizeof(*grown));
|
||||
+ if (grown == NULL) { /* ns_dec_tab and dec_tab_count are untouched, so earlier registrations stay usable */
|
||||
+ printf("%s p_ns_dec_tab malloc failed", __func__);
|
||||
+ return -1;
|
||||
+ }
|
||||
+ ns_dec_tab = grown;
|
||||
+ ns_dec_tab[dec_tab_count++] = tab;
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+void unregister_ns_dec_tab(void)
|
||||
+{
|
||||
+ if (ns_dec_tab) {
|
||||
+ free(ns_dec_tab);
|
||||
+ ns_dec_tab = NULL;
|
||||
+ dec_tab_count = 0;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
void print_le_hex(struct trace_seq *s, const uint8_t *buf, int index) {
|
||||
trace_seq_printf(s, "%02x%02x%02x%02x", buf[index+3], buf[index+2], buf[index+1], buf[index]);
|
||||
}
|
||||
@@ -49,16 +75,32 @@ static char *uuid_le(const char *uu)
|
||||
return uuid;
|
||||
}
|
||||
|
||||
+static int uuid_le_cmp(const char *sec_type, const char *uuid2) /* Compare a 16-byte UUID (bytes reordered via le[]) with a 32-digit hex string; 0 when equal. */
|
||||
+{
|
||||
+ static char uuid1[33]; /* 32 hex digits plus the terminating NUL */
|
||||
+ char *p = uuid1;
|
||||
+ int i;
|
||||
+ static const unsigned char le[16] = { /* byte order in which the raw UUID is rendered as text */
|
||||
+ 3, 2, 1, 0, 5, 4, 7, 6, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||
+
|
||||
+ for (i = 0; i < 16; i++)
|
||||
+ p += sprintf(p, "%.2x", (unsigned char)sec_type[le[i]]); /* cast keeps bytes >= 0x80 at two digits instead of a sign-extended eight */
|
||||
+ *p = 0;
|
||||
+ return strncmp(uuid1, uuid2, 32);
|
||||
+}
|
||||
+
|
||||
int ras_non_standard_event_handler(struct trace_seq *s,
|
||||
struct pevent_record *record,
|
||||
struct event_format *event, void *context)
|
||||
{
|
||||
- int len, i, line_count;
|
||||
+ int len, i, line_count, count;
|
||||
unsigned long long val;
|
||||
struct ras_events *ras = context;
|
||||
time_t now;
|
||||
struct tm *tm;
|
||||
struct ras_non_standard_event ev;
|
||||
+ p_ns_dec_tab dec_tab;
|
||||
+ bool dec_done = false;
|
||||
|
||||
/*
|
||||
* Newer kernels (3.10-rc1 or upper) provide an uptime clock.
|
||||
@@ -133,6 +175,18 @@ int ras_non_standard_event_handler(struct trace_seq *s,
|
||||
trace_seq_printf(s, " ");
|
||||
}
|
||||
|
||||
+ for (count = 0; count < dec_tab_count && !dec_done; count++) {
|
||||
+ dec_tab = ns_dec_tab[count];
|
||||
+ for (i = 0; i < dec_tab[0].len; i++) {
|
||||
+ if (uuid_le_cmp(ev.sec_type,
|
||||
+ dec_tab[i].sec_type) == 0) {
|
||||
+ dec_tab[i].decode(s, ev.error);
|
||||
+ dec_done = true;
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
/* Insert data into the SGBD */
|
||||
#ifdef HAVE_SQLITE3
|
||||
ras_store_non_standard_record(ras, &ev);
|
||||
@@ -145,3 +199,9 @@ int ras_non_standard_event_handler(struct trace_seq *s,
|
||||
|
||||
return 0;
|
||||
}
|
||||
+
|
||||
+__attribute__((destructor))
|
||||
+static void ns_exit(void)
|
||||
+{
|
||||
+ unregister_ns_dec_tab();
|
||||
+}
|
||||
diff --git a/ras-non-standard-handler.h b/ras-non-standard-handler.h
|
||||
index 2b5ac35..a183d1a 100644
|
||||
--- a/ras-non-standard-handler.h
|
||||
+++ b/ras-non-standard-handler.h
|
||||
@@ -17,10 +17,20 @@
|
||||
#include "ras-events.h"
|
||||
#include "libtrace/event-parse.h"
|
||||
|
||||
+typedef struct ras_ns_dec_tab {
|
||||
+ const char *sec_type;
|
||||
+ int (*decode)(struct trace_seq *s, const void *err);
|
||||
+ size_t len;
|
||||
+} *p_ns_dec_tab;
|
||||
+
|
||||
int ras_non_standard_event_handler(struct trace_seq *s,
|
||||
struct pevent_record *record,
|
||||
struct event_format *event, void *context);
|
||||
|
||||
void print_le_hex(struct trace_seq *s, const uint8_t *buf, int index);
|
||||
|
||||
+int register_ns_dec_tab(const p_ns_dec_tab tab);
|
||||
+
|
||||
+void unregister_ns_dec_tab(void);
|
||||
+
|
||||
#endif
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,489 @@
|
|||
From 5662e5376adcc45da43d7818c8ac1882883c18ac Mon Sep 17 00:00:00 2001
|
||||
From: Tyler Baicar <tbaicar@codeaurora.org>
|
||||
Date: Tue, 12 Sep 2017 14:58:25 -0600
|
||||
Subject: [PATCH 1/2] rasdaemon: add support for ARM events
|
||||
|
||||
Add support to handle the ARM kernel trace events
|
||||
which cover RAS ARM processor errors.
|
||||
|
||||
[V4]: fix arm_event_tab usage
|
||||
|
||||
Change-Id: Ife99c97042498d5fad4d9b8e873ecfba6a47947d
|
||||
Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
|
||||
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
|
||||
---
|
||||
Makefile.am | 3 ++
|
||||
configure.ac | 9 ++++++
|
||||
ras-arm-handler.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
ras-arm-handler.h | 24 +++++++++++++++
|
||||
ras-events.c | 15 ++++++++++
|
||||
ras-record.c | 59 ++++++++++++++++++++++++++++++++++++
|
||||
ras-record.h | 16 ++++++++++
|
||||
ras-report.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++
|
||||
ras-report.h | 5 +++-
|
||||
9 files changed, 295 insertions(+), 1 deletion(-)
|
||||
create mode 100644 ras-arm-handler.c
|
||||
create mode 100644 ras-arm-handler.h
|
||||
|
||||
diff --git a/Makefile.am b/Makefile.am
|
||||
index 2500772..4aa5543 100644
|
||||
--- a/Makefile.am
|
||||
+++ b/Makefile.am
|
||||
@@ -27,6 +27,9 @@ endif
|
||||
if WITH_NON_STANDARD
|
||||
rasdaemon_SOURCES += ras-non-standard-handler.c
|
||||
endif
|
||||
+if WITH_ARM
|
||||
+ rasdaemon_SOURCES += ras-arm-handler.c
|
||||
+endif
|
||||
if WITH_MCE
|
||||
rasdaemon_SOURCES += ras-mce-handler.c mce-intel.c mce-amd-k8.c \
|
||||
mce-intel-p4-p6.c mce-intel-nehalem.c \
|
||||
diff --git a/configure.ac b/configure.ac
|
||||
index ecd4b2f..14fc2f2 100644
|
||||
--- a/configure.ac
|
||||
+++ b/configure.ac
|
||||
@@ -53,6 +53,15 @@ AS_IF([test "x$enable_non_standard" = "xyes"], [
|
||||
])
|
||||
AM_CONDITIONAL([WITH_NON_STANDARD], [test x$enable_non_standard = xyes])
|
||||
|
||||
+AC_ARG_ENABLE([arm],
|
||||
+ AS_HELP_STRING([--enable-arm], [enable ARM events (currently experimental)]))
|
||||
+
|
||||
+AS_IF([test "x$enable_arm" = "xyes"], [
|
||||
+ AC_DEFINE(HAVE_ARM,1,"have ARM events collect")
|
||||
+ AC_SUBST([WITH_ARM])
|
||||
+])
|
||||
+AM_CONDITIONAL([WITH_ARM], [test x$enable_arm = xyes])
|
||||
+
|
||||
AC_ARG_ENABLE([mce],
|
||||
AS_HELP_STRING([--enable-mce], [enable MCE events (currently experimental)]))
|
||||
|
||||
diff --git a/ras-arm-handler.c b/ras-arm-handler.c
|
||||
new file mode 100644
|
||||
index 0000000..a76470d
|
||||
--- /dev/null
|
||||
+++ b/ras-arm-handler.c
|
||||
@@ -0,0 +1,90 @@
|
||||
+/*
|
||||
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or modify
|
||||
+ * it under the terms of the GNU General Public License version 2 and
|
||||
+ * only version 2 as published by the Free Software Foundation.
|
||||
+
|
||||
+ * This program is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
+ * GNU General Public License for more details.
|
||||
+ */
|
||||
+
|
||||
+#include <stdio.h>
|
||||
+#include <stdlib.h>
|
||||
+#include <string.h>
|
||||
+#include <unistd.h>
|
||||
+#include "libtrace/kbuffer.h"
|
||||
+#include "ras-arm-handler.h"
|
||||
+#include "ras-record.h"
|
||||
+#include "ras-logger.h"
|
||||
+#include "ras-report.h"
|
||||
+
|
||||
+int ras_arm_event_handler(struct trace_seq *s,
|
||||
+ struct pevent_record *record,
|
||||
+ struct event_format *event, void *context)
|
||||
+{ /* Decode one ARM trace record into s, then store and report it; -1 if a field cannot be read, 0 otherwise. */
|
||||
+ unsigned long long val;
|
||||
+ struct ras_events *ras = context;
|
||||
+ time_t now;
|
||||
+ struct tm *tm;
|
||||
+ struct ras_arm_event ev = { 0 }; /* error_count is never filled in below and timestamp may stay unset: start from zeros */
|
||||
+
|
||||
+ /*
|
||||
+ * Newer kernels (3.10-rc1 or upper) provide an uptime clock.
|
||||
+ * On previous kernels, the way to properly generate an event would
|
||||
+ * be to inject a fake one, measure its timestamp and diff it against
|
||||
+ * gettimeofday. We won't do it here. Instead, let's use uptime,
|
||||
+ * falling-back to the event report's time, if "uptime" clock is
|
||||
+ * not available (legacy kernels).
|
||||
+ */
|
||||
+
|
||||
+ if (ras->use_uptime)
|
||||
+ now = record->ts/user_hz + ras->uptime_diff;
|
||||
+ else
|
||||
+ now = time(NULL);
|
||||
+
|
||||
+ tm = localtime(&now);
|
||||
+ if (tm)
|
||||
+ strftime(ev.timestamp, sizeof(ev.timestamp),
|
||||
+ "%Y-%m-%d %H:%M:%S %z", tm);
|
||||
+ trace_seq_printf(s, "%s\n", ev.timestamp); /* prints an empty string if localtime() failed */
|
||||
+
|
||||
+ if (pevent_get_field_val(s, event, "affinity", record, &val, 1) < 0)
|
||||
+ return -1;
|
||||
+ ev.affinity = val;
|
||||
+ trace_seq_printf(s, " affinity: %d", ev.affinity);
|
||||
+
|
||||
+ if (pevent_get_field_val(s, event, "mpidr", record, &val, 1) < 0)
|
||||
+ return -1;
|
||||
+ ev.mpidr = val;
|
||||
+ trace_seq_printf(s, "\n MPIDR: 0x%llx", (unsigned long long)ev.mpidr);
|
||||
+
|
||||
+ if (pevent_get_field_val(s, event, "midr", record, &val, 1) < 0)
|
||||
+ return -1;
|
||||
+ ev.midr = val;
|
||||
+ trace_seq_printf(s, "\n MIDR: 0x%llx", (unsigned long long)ev.midr);
|
||||
+
|
||||
+ if (pevent_get_field_val(s, event, "running_state", record, &val, 1) < 0)
|
||||
+ return -1;
|
||||
+ ev.running_state = val;
|
||||
+ trace_seq_printf(s, "\n running_state: %d", ev.running_state);
|
||||
+
|
||||
+ if (pevent_get_field_val(s, event, "psci_state", record, &val, 1) < 0)
|
||||
+ return -1;
|
||||
+ ev.psci_state = val;
|
||||
+ trace_seq_printf(s, "\n psci_state: %d", ev.psci_state);
|
||||
+
|
||||
+ /* Insert data into the database */
|
||||
+#ifdef HAVE_SQLITE3
|
||||
+ ras_store_arm_record(ras, &ev);
|
||||
+#endif
|
||||
+
|
||||
+#ifdef HAVE_ABRT_REPORT
|
||||
+ /* Report event to ABRT */
|
||||
+ ras_report_arm_event(ras, &ev);
|
||||
+#endif
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
diff --git a/ras-arm-handler.h b/ras-arm-handler.h
|
||||
new file mode 100644
|
||||
index 0000000..eae10ec
|
||||
--- /dev/null
|
||||
+++ b/ras-arm-handler.h
|
||||
@@ -0,0 +1,24 @@
|
||||
+/*
|
||||
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
|
||||
+ *
|
||||
+ * This program is free software; you can redistribute it and/or modify
|
||||
+ * it under the terms of the GNU General Public License version 2 and
|
||||
+ * only version 2 as published by the Free Software Foundation.
|
||||
+
|
||||
+ * This program is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
+ * GNU General Public License for more details.
|
||||
+ */
|
||||
+
|
||||
+#ifndef __RAS_ARM_HANDLER_H
|
||||
+#define __RAS_ARM_HANDLER_H
|
||||
+
|
||||
+#include "ras-events.h"
|
||||
+#include "libtrace/event-parse.h"
|
||||
+
|
||||
+int ras_arm_event_handler(struct trace_seq *s,
|
||||
+ struct pevent_record *record,
|
||||
+ struct event_format *event, void *context);
|
||||
+
|
||||
+#endif
|
||||
diff --git a/ras-events.c b/ras-events.c
|
||||
index 96aa6f1..812d712 100644
|
||||
--- a/ras-events.c
|
||||
+++ b/ras-events.c
|
||||
@@ -30,6 +30,7 @@
|
||||
#include "ras-mc-handler.h"
|
||||
#include "ras-aer-handler.h"
|
||||
#include "ras-non-standard-handler.h"
|
||||
+#include "ras-arm-handler.h"
|
||||
#include "ras-mce-handler.h"
|
||||
#include "ras-extlog-handler.h"
|
||||
#include "ras-record.h"
|
||||
@@ -213,6 +214,10 @@ int toggle_ras_mc_event(int enable)
|
||||
rc |= __toggle_ras_mc_event(ras, "ras", "non_standard_event", enable);
|
||||
#endif
|
||||
|
||||
+#ifdef HAVE_ARM
|
||||
+ rc |= __toggle_ras_mc_event(ras, "ras", "arm_event", enable);
|
||||
+#endif
|
||||
+
|
||||
free_ras:
|
||||
free(ras);
|
||||
return rc;
|
||||
@@ -691,6 +696,16 @@ int handle_ras_events(int record_events)
|
||||
"ras", "non_standard_event");
|
||||
#endif
|
||||
|
||||
+#ifdef HAVE_ARM
|
||||
+ rc = add_event_handler(ras, pevent, page_size, "ras", "arm_event",
|
||||
+ ras_arm_event_handler);
|
||||
+ if (!rc)
|
||||
+ num_events++;
|
||||
+ else
|
||||
+ log(ALL, LOG_ERR, "Can't get traces from %s:%s\n",
|
||||
+ "ras", "arm_event");
|
||||
+#endif
|
||||
+
|
||||
cpus = get_num_cpus(ras);
|
||||
|
||||
#ifdef HAVE_MCE
|
||||
diff --git a/ras-record.c b/ras-record.c
|
||||
index 357ab61..c3644cb 100644
|
||||
--- a/ras-record.c
|
||||
+++ b/ras-record.c
|
||||
@@ -209,6 +209,58 @@ int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standar
|
||||
}
|
||||
#endif
|
||||
|
||||
+/*
|
||||
+ * Table and functions to handle ras:arm
|
||||
+ */
|
||||
+
|
||||
+#ifdef HAVE_ARM
|
||||
+static const struct db_fields arm_event_fields[] = {
|
||||
+ { .name="id", .type="INTEGER PRIMARY KEY" },
|
||||
+ { .name="timestamp", .type="TEXT" },
|
||||
+ { .name="error_count", .type="INTEGER" },
|
||||
+ { .name="affinity", .type="INTEGER" },
|
||||
+ { .name="mpidr", .type="INTEGER" },
|
||||
+ { .name="running_state", .type="INTEGER" },
|
||||
+ { .name="psci_state", .type="INTEGER" },
|
||||
+};
|
||||
+
|
||||
+static const struct db_table_descriptor arm_event_tab = {
|
||||
+ .name = "arm_event",
|
||||
+ .fields = arm_event_fields,
|
||||
+ .num_fields = ARRAY_SIZE(arm_event_fields),
|
||||
+};
|
||||
+
|
||||
+int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev)
|
||||
+{ /* Insert one ARM event row; returns 0 when no statement is prepared, otherwise the sqlite3_reset() result. */
|
||||
+ int rc;
|
||||
+ struct sqlite3_priv *priv = ras->db_priv;
|
||||
+
|
||||
+ if (!priv || !priv->stmt_arm_record) /* database not opened or statement not prepared: nothing to store */
|
||||
+ return 0;
|
||||
+ log(TERM, LOG_INFO, "arm_event store: %p\n", priv->stmt_arm_record);
|
||||
+
|
||||
+ sqlite3_bind_text (priv->stmt_arm_record, 1, ev->timestamp, -1, NULL);
|
||||
+ sqlite3_bind_int (priv->stmt_arm_record, 2, ev->error_count);
|
||||
+ sqlite3_bind_int (priv->stmt_arm_record, 3, ev->affinity);
|
||||
+ sqlite3_bind_int64 (priv->stmt_arm_record, 4, ev->mpidr); /* mpidr is int64_t; sqlite3_bind_int would keep only the low 32 bits */
|
||||
+ sqlite3_bind_int (priv->stmt_arm_record, 5, ev->running_state);
|
||||
+ sqlite3_bind_int (priv->stmt_arm_record, 6, ev->psci_state);
|
||||
+
|
||||
+ rc = sqlite3_step(priv->stmt_arm_record);
|
||||
+ if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "Failed to do arm_event step on sqlite: error = %d\n", rc);
|
||||
+ rc = sqlite3_reset(priv->stmt_arm_record); /* make the prepared statement reusable for the next event */
|
||||
+ if (rc != SQLITE_OK && rc != SQLITE_DONE)
|
||||
+ log(TERM, LOG_ERR,
|
||||
+ "Failed reset arm_event on sqlite: error = %d\n",
|
||||
+ rc);
|
||||
+ log(TERM, LOG_INFO, "register inserted at db\n");
|
||||
+
|
||||
+ return rc;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
#ifdef HAVE_EXTLOG
|
||||
static const struct db_fields extlog_event_fields[] = {
|
||||
{ .name="id", .type="INTEGER PRIMARY KEY" },
|
||||
@@ -509,6 +561,13 @@ int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras)
|
||||
&non_standard_event_tab);
|
||||
#endif
|
||||
|
||||
+#ifdef HAVE_ARM
|
||||
+ rc = ras_mc_create_table(priv, &arm_event_tab);
|
||||
+ if (rc == SQLITE_OK)
|
||||
+ rc = ras_mc_prepare_stmt(priv, &priv->stmt_arm_record,
|
||||
+ &arm_event_tab);
|
||||
+#endif
|
||||
+
|
||||
ras->db_priv = priv;
|
||||
return 0;
|
||||
}
|
||||
diff --git a/ras-record.h b/ras-record.h
|
||||
index 473ae40..12c2218 100644
|
||||
--- a/ras-record.h
|
||||
+++ b/ras-record.h
|
||||
@@ -65,10 +65,21 @@ struct ras_non_standard_event {
|
||||
uint32_t length;
|
||||
};
|
||||
|
||||
+struct ras_arm_event {
|
||||
+ char timestamp[64];
|
||||
+ int32_t error_count;
|
||||
+ int8_t affinity;
|
||||
+ int64_t mpidr;
|
||||
+ int64_t midr;
|
||||
+ int32_t running_state;
|
||||
+ int32_t psci_state;
|
||||
+};
|
||||
+
|
||||
struct ras_mc_event;
|
||||
struct ras_aer_event;
|
||||
struct ras_extlog_event;
|
||||
struct ras_non_standard_event;
|
||||
+struct ras_arm_event;
|
||||
struct mce_event;
|
||||
|
||||
#ifdef HAVE_SQLITE3
|
||||
@@ -90,6 +101,9 @@ struct sqlite3_priv {
|
||||
#ifdef HAVE_NON_STANDARD
|
||||
sqlite3_stmt *stmt_non_standard_record;
|
||||
#endif
|
||||
+#ifdef HAVE_ARM
|
||||
+ sqlite3_stmt *stmt_arm_record;
|
||||
+#endif
|
||||
};
|
||||
|
||||
int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras);
|
||||
@@ -98,6 +112,7 @@ int ras_store_aer_event(struct ras_events *ras, struct ras_aer_event *ev);
|
||||
int ras_store_mce_record(struct ras_events *ras, struct mce_event *ev);
|
||||
int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev);
|
||||
int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standard_event *ev);
|
||||
+int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev);
|
||||
|
||||
#else
|
||||
static inline int ras_mc_event_opendb(unsigned cpu, struct ras_events *ras) { return 0; };
|
||||
@@ -106,6 +121,7 @@ static inline int ras_store_aer_event(struct ras_events *ras, struct ras_aer_eve
|
||||
static inline int ras_store_mce_record(struct ras_events *ras, struct mce_event *ev) { return 0; };
|
||||
static inline int ras_store_extlog_mem_record(struct ras_events *ras, struct ras_extlog_event *ev) { return 0; };
|
||||
static inline int ras_store_non_standard_record(struct ras_events *ras, struct ras_non_standard_event *ev) { return 0; };
|
||||
+static inline int ras_store_arm_record(struct ras_events *ras, struct ras_arm_event *ev) { return 0; };
|
||||
|
||||
#endif
|
||||
|
||||
diff --git a/ras-report.c b/ras-report.c
|
||||
index 1eb9f79..d4beee0 100644
|
||||
--- a/ras-report.c
|
||||
+++ b/ras-report.c
|
||||
@@ -228,6 +228,33 @@ static int set_non_standard_event_backtrace(char *buf, struct ras_non_standard_e
|
||||
return 0;
|
||||
}
|
||||
|
||||
+static int set_arm_event_backtrace(char *buf, struct ras_arm_event *ev){ /* Append the BACKTRACE item for an ARM event to buf; -1 on NULL arguments, 0 otherwise. */
|
||||
+ char bt_buf[MAX_BACKTRACE_SIZE];
|
||||
+
|
||||
+ if(!buf || !ev)
|
||||
+ return -1;
|
||||
+
|
||||
+ snprintf(bt_buf, sizeof(bt_buf), "BACKTRACE=" \
|
||||
+ "timestamp=%s\n" \
|
||||
+ "error_count=%d\n" \
|
||||
+ "affinity=%d\n" \
|
||||
+ "mpidr=0x%llx\n" \
|
||||
+ "midr=0x%llx\n" \
|
||||
+ "running_state=%d\n" \
|
||||
+ "psci_state=%d\n", \
|
||||
+ ev->timestamp, \
|
||||
+ ev->error_count, \
|
||||
+ ev->affinity, \
|
||||
+ (unsigned long long)ev->mpidr, \
|
||||
+ (unsigned long long)ev->midr, \
|
||||
+ ev->running_state, \
|
||||
+ ev->psci_state); /* the two 64-bit IDs are cast to match %llx, since long may be only 32 bits wide */
|
||||
+
|
||||
+ strcat(buf, bt_buf); /* caller supplies buf and is responsible for its capacity */
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
static int commit_report_backtrace(int sockfd, int type, void *ev){
|
||||
char buf[MAX_BACKTRACE_SIZE];
|
||||
char *pbuf = buf;
|
||||
@@ -253,6 +280,9 @@ static int commit_report_backtrace(int sockfd, int type, void *ev){
|
||||
case NON_STANDARD_EVENT:
|
||||
rc = set_non_standard_event_backtrace(buf, (struct ras_non_standard_event *)ev);
|
||||
break;
|
||||
+ case ARM_EVENT:
|
||||
+ rc = set_arm_event_backtrace(buf, (struct ras_arm_event *)ev);
|
||||
+ break;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
@@ -425,6 +455,51 @@ non_standard_fail:
|
||||
return rc;
|
||||
}
|
||||
|
||||
+int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev){
|
||||
+ char buf[MAX_MESSAGE_SIZE];
|
||||
+ int sockfd = 0;
|
||||
+ int rc = -1;
|
||||
+
|
||||
+ memset(buf, 0, sizeof(buf));
|
||||
+
|
||||
+ sockfd = setup_report_socket();
|
||||
+ if(sockfd < 0){
|
||||
+ return rc;
|
||||
+ }
|
||||
+
|
||||
+ rc = commit_report_basic(sockfd);
|
||||
+ if(rc < 0){
|
||||
+ goto arm_fail;
|
||||
+ }
|
||||
+
|
||||
+ rc = commit_report_backtrace(sockfd, ARM_EVENT, ev);
|
||||
+ if(rc < 0){
|
||||
+ goto arm_fail;
|
||||
+ }
|
||||
+
|
||||
+ sprintf(buf, "ANALYZER=%s", "rasdaemon-arm");
|
||||
+ rc = write(sockfd, buf, strlen(buf) + 1);
|
||||
+ if(rc < strlen(buf) + 1){
|
||||
+ goto arm_fail;
|
||||
+ }
|
||||
+
|
||||
+ sprintf(buf, "REASON=%s", "ARM CPU report problem");
|
||||
+ rc = write(sockfd, buf, strlen(buf) + 1);
|
||||
+ if(rc < strlen(buf) + 1){
|
||||
+ goto arm_fail;
|
||||
+ }
|
||||
+
|
||||
+ rc = 0;
|
||||
+
|
||||
+arm_fail:
|
||||
+
|
||||
+ if(sockfd > 0){
|
||||
+ close(sockfd);
|
||||
+ }
|
||||
+
|
||||
+ return rc;
|
||||
+}
|
||||
+
|
||||
int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev){
|
||||
char buf[MAX_MESSAGE_SIZE];
|
||||
int sockfd = 0;
|
||||
diff --git a/ras-report.h b/ras-report.h
|
||||
index c2fcf42..6c466f5 100644
|
||||
--- a/ras-report.h
|
||||
+++ b/ras-report.h
|
||||
@@ -33,7 +33,8 @@ enum {
|
||||
MC_EVENT,
|
||||
MCE_EVENT,
|
||||
AER_EVENT,
|
||||
- NON_STANDARD_EVENT
|
||||
+ NON_STANDARD_EVENT,
|
||||
+ ARM_EVENT
|
||||
};
|
||||
|
||||
#ifdef HAVE_ABRT_REPORT
|
||||
@@ -42,6 +43,7 @@ int ras_report_mc_event(struct ras_events *ras, struct ras_mc_event *ev);
|
||||
int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev);
|
||||
int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev);
|
||||
int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev);
|
||||
+int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev);
|
||||
|
||||
#else
|
||||
|
||||
@@ -49,6 +51,7 @@ static inline int ras_report_mc_event(struct ras_events *ras, struct ras_mc_even
|
||||
static inline int ras_report_aer_event(struct ras_events *ras, struct ras_aer_event *ev) { return 0; };
|
||||
static inline int ras_report_mce_event(struct ras_events *ras, struct mce_event *ev) { return 0; };
|
||||
static inline int ras_report_non_standard_event(struct ras_events *ras, struct ras_non_standard_event *ev) { return 0; };
|
||||
+static inline int ras_report_arm_event(struct ras_events *ras, struct ras_arm_event *ev) { return 0; };
|
||||
|
||||
#endif
|
||||
|
||||
--
|
||||
1.8.3.1
|
||||
|
|
@ -0,0 +1,27 @@
|
|||
commit 1b23bf7d97bacd1d00adb4404dfc5004df394358
|
||||
Author: Aristeu Rozanski <arozansk@redhat.com>
|
||||
Date: Fri Feb 2 10:15:25 2018 -0500
|
||||
|
||||
ARM: initialize with 0 unused ras_arm_event members
|
||||
|
||||
Issue found by covscan:
|
||||
|
||||
1. rasdaemon-0.4.1/ras-arm-handler.c:32: var_decl: Declaring variable "ev" without initializer.
|
||||
16. rasdaemon-0.4.1/ras-arm-handler.c:81: uninit_use_in_call: Using uninitialized value "ev.error_count" when calling "ras_store_arm_record".
|
||||
23. rasdaemon-0.4.1/ras-record.c:243:2: read_parm_fld: Reading a parameter field.
|
||||
|
||||
Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
|
||||
|
||||
diff --git a/ras-arm-handler.c b/ras-arm-handler.c
|
||||
index a76470d..2f170e2 100644
|
||||
--- a/ras-arm-handler.c
|
||||
+++ b/ras-arm-handler.c
|
||||
@@ -31,6 +31,8 @@ int ras_arm_event_handler(struct trace_seq *s,
|
||||
struct tm *tm;
|
||||
struct ras_arm_event ev;
|
||||
|
||||
+ memset(&ev, 0, sizeof(ev));
|
||||
+
|
||||
/*
|
||||
* Newer kernels (3.10-rc1 or upper) provide an uptime clock.
|
||||
* On previous kernels, the way to properly generate an event would
|
|
@ -0,0 +1,24 @@
|
|||
diff --git a/mce-intel.c b/mce-intel.c
|
||||
index bf68d9b..80e4b6f 100644
|
||||
--- a/mce-intel.c
|
||||
+++ b/mce-intel.c
|
||||
@@ -470,7 +470,6 @@ int set_intel_imc_log(enum cputype cputype, unsigned ncpus)
|
||||
case CPU_SANDY_BRIDGE_EP:
|
||||
case CPU_IVY_BRIDGE_EPEX:
|
||||
case CPU_HASWELL_EPEX:
|
||||
- case CPU_KNIGHTS_LANDING:
|
||||
msr = 0x17f; /* MSR_ERROR_CONTROL */
|
||||
bit = 0x2; /* MemError Log Enable */
|
||||
break;
|
||||
diff --git a/ras-mce-handler.c b/ras-mce-handler.c
|
||||
index b875512..f930fd1 100644
|
||||
--- a/ras-mce-handler.c
|
||||
+++ b/ras-mce-handler.c
|
||||
@@ -228,7 +228,6 @@ int register_mce_handler(struct ras_events *ras, unsigned ncpus)
|
||||
case CPU_SANDY_BRIDGE_EP:
|
||||
case CPU_IVY_BRIDGE_EPEX:
|
||||
case CPU_HASWELL_EPEX:
|
||||
- case CPU_KNIGHTS_LANDING:
|
||||
set_intel_imc_log(mce->cputype, ncpus);
|
||||
default:
|
||||
break;
|
|
@ -0,0 +1,302 @@
|
|||
%define _hardened_build 1
|
||||
|
||||
Name: rasdaemon
|
||||
Version: 0.4.1
|
||||
Release: 32%{?dist}
|
||||
Summary: Utility to receive RAS error tracings
|
||||
Group: Applications/System
|
||||
License: GPLv2
|
||||
URL: https://pagure.io/rasdaemon
|
||||
Source0: http://mchehab.fedorapeople.org/%{name}-%{version}.tar.bz2
|
||||
|
||||
ExclusiveArch: %{ix86} x86_64 aarch64 %{power64}
|
||||
BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n)
|
||||
BuildRequires: autoconf, automake, gettext-devel, libtool, sqlite-devel
|
||||
Requires: hwdata, perl-DBD-SQLite
|
||||
%ifnarch %{arm}
|
||||
%ifnarch %{power64}
|
||||
Requires: dmidecode
|
||||
%endif
|
||||
%endif
|
||||
|
||||
Requires(post): systemd-units
|
||||
Requires(preun): systemd-units
|
||||
Requires(postun): systemd-units
|
||||
|
||||
Patch1: 0001-ras-mc-ctl-Improve-error-summary-to-show-label-and-m.patch
|
||||
Patch2: 0002-ras-record-make-the-code-more-generic.patch
|
||||
Patch3: 0003-ras-record-rename-stmt-to-stmt_mc_event.patch
|
||||
Patch4: 0004-ras-record-reorder-functions.patch
|
||||
Patch5: 0005-ras-record-Make-the-code-easier-to-add-support-for-o.patch
|
||||
Patch6: 0006-Add-support-to-record-AER-events.patch
|
||||
Patch7: 0007-Add-support-to-store-MCE-events-at-the-database.patch
|
||||
Patch8: 0008-ras-mc-ctl-add-summary-for-MCE-and-PCIe-AER-errors.patch
|
||||
Patch9: 0009-ras-mc-ctl-report-errors-also-for-PCIe-AER-and-MCE.patch
|
||||
Patch10: 0010-ras-mc-ctl-Fix-the-name-of-the-error-table-data.patch
|
||||
Patch11: 0013-ras-mc-ctl-Improve-parser.patch
|
||||
Patch12: 0014-ras-mc-ctl-Fix-label-register-with-2-layers.patch
|
||||
Patch13: 0015-Add-an-example-of-labels-file.patch
|
||||
Patch14: 0017-ras-mc-ctl-Fix-the-DIMM-layout-display.patch
|
||||
Patch15: 0019-ras-mc-ctl-remove-completely-use-of-modprobe.patch
|
||||
Patch16: 0022-mce-amd-k8.c-fix-a-warning.patch
|
||||
Patch17: 0023-add-abrt-suppport-for-rasdaemon.patch
|
||||
Patch18: 0026-rasdaemon-Add-record-option-to-rasdaemon-man-page.patch
|
||||
Patch19: 0027-ras-mc-ctl-Print-useful-message-when-run-without-ras.patch
|
||||
Patch20: 0028-Make-paths-in-the-systemd-services-configurable.patch
|
||||
Patch21: 0031-Correct-ABRT-report-data.patch
|
||||
Patch22: 0032-rasdaemon-handle-failures-of-snprintf.patch
|
||||
Patch23: 0033-rasdaemon-correct-range-while-parsing-top-middle-and.patch
|
||||
Patch24: 0034-rasdaemon-enable-recording-by-default.patch
|
||||
Patch25: 0035-eMCA-support.patch
|
||||
Patch26: 0036-rasdaemon-fix-some-errors-in-sqlite.patch
|
||||
Patch27: 0037-rasdaemon-sqlite-truncates-some-MCE-fields-to-32-bit.patch
|
||||
Patch28: 0038-rasdaemon-fix-mce-numfield-decoded-error.patch
|
||||
Patch29: 0039-rasdaemon-do-not-assume-dimmX-directories-will-be-pr.patch
|
||||
Patch30: 0040-rasdaemon-add-more-dell-labels.patch
|
||||
Patch31: 0041-rasdaemon-add-support-for-Haswell.patch
|
||||
Patch32: 0042-rasdaemon-decode-new-simple-error-code-number-6.patch
|
||||
Patch33: 0043-rasdaemon-Add-missing-entry-to-Ivy-Bridge-memory-con.patch
|
||||
Patch34: 0044-rasdaemon-Identify-Ivy-Bridge-properly.patch
|
||||
Patch35: 0045-rasdaemon-add-support-for-Broadwell.patch
|
||||
Patch36: 0046-rasdaemon-add-support-for-Knights-Landing.patch
|
||||
Patch37: 0047-rasdaemon-properly-pring-message-strings-in-decode_b.patch
|
||||
Patch38: 0048-rasdaemon-add-missing-semicolon-in-hsw_decode_model.patch
|
||||
Patch39: 0049-rasdaemon-enable-IMC-status-usage-for-Haswell-E.patch
|
||||
Patch40: 0050-rasdaemon-make-sure-the-error-is-valid-before-handli.patch
|
||||
Patch41: 0051-rasdaemon-add-support-to-match-the-machine-by-system.patch
|
||||
Patch42: 0052-rasdaemon-add-internal-errors-of-IA32_MC4_STATUS-for.patch
|
||||
Patch43: 0053-rasdaemon-remove-a-space-from-mcgstatus_msg.patch
|
||||
Patch44: 0054-rasdaemon-unnecessary-comma-for-empty-mc_location-st.patch
|
||||
Patch45: 0055-rasdaemon-use-MCA-error-msg-as-error_msg.patch
|
||||
Patch46: 0056-x86-rasdaemon-Add-support-to-log-Local-Machine-Check.patch
|
||||
Patch47: 0057-rasdaemon-add-support-for-haswell-ex.patch
|
||||
Patch48: 0058-rasdaemon-fix-typos-on-ras-mc-ctl-man-page.patch
|
||||
Patch49: 0059-rasdaemon-Add-support-for-Knights-Landing-processor.patch
|
||||
Patch50: 0060-mce-intel-knl-Fix-CodingStyle.patch
|
||||
Patch51: 0061-Add-Broadwell-DE-MSCOD-values.patch
|
||||
Patch52: 0062-Add-Broadwell-EP-EX-MSCOD-values.patch
|
||||
# Patch53 was submitted upstream but not merged yet
|
||||
Patch53: rasdaemon-dont_use_memerror_log_enable_on_knl.patch
|
||||
Patch54: 0063-add_support_for_knights_mill.patch
|
||||
Patch55: 0064-add_support_for_skylake.patch
|
||||
Patch56: 0065-rasdaemon-Update-DIMM-labels-for-Dell-Servers.patch
|
||||
Patch57: 0066-rasdaemon-Update-DIMM-labels-for-Intel-Skylake-serve.patch
|
||||
Patch58: 0067-rasdaemon-add-support-for-non-standard-CPER-section-.patch
|
||||
Patch59: 0068-rasdaemon-add-support-for-non-standard-error-decoder.patch
|
||||
Patch60: 0069-rasdaemon-add-support-for-ARM-events.patch
|
||||
Patch61: 0070-rasdaemon-ARM-fully-initialize-ras_arm_event.patch
|
||||
|
||||
%description
|
||||
%{name} is a RAS (Reliability, Availability and Serviceability) logging tool.
|
||||
It currently records memory errors, using the EDAC tracing events.
|
||||
EDAC is a set of drivers in the Linux kernel that handle detection of ECC errors
|
||||
from memory controllers for most chipsets on i386 and x86_64 architectures.
|
||||
EDAC drivers for other architectures like arm also exist.
|
||||
This userspace component consists of an init script which makes sure
|
||||
EDAC drivers and DIMM labels are loaded at system startup, as well as
|
||||
a utility for reporting current error counts from the EDAC sysfs files.
|
||||
|
||||
%prep
|
||||
%setup -q
|
||||
%patch1 -p1
|
||||
%patch2 -p1
|
||||
%patch3 -p1
|
||||
%patch4 -p1
|
||||
%patch5 -p1
|
||||
%patch6 -p1
|
||||
%patch7 -p1
|
||||
%patch8 -p1
|
||||
%patch9 -p1
|
||||
%patch10 -p1
|
||||
%patch11 -p1
|
||||
%patch12 -p1
|
||||
%patch13 -p1
|
||||
%patch14 -p1
|
||||
%patch15 -p1
|
||||
%patch16 -p1
|
||||
%patch17 -p1
|
||||
%patch18 -p1
|
||||
%patch19 -p1
|
||||
%patch20 -p1
|
||||
%patch21 -p1
|
||||
%patch22 -p1
|
||||
%patch23 -p1
|
||||
%patch24 -p1
|
||||
%patch25 -p1
|
||||
%patch26 -p1
|
||||
%patch27 -p1
|
||||
%patch28 -p1
|
||||
%patch29 -p1
|
||||
%patch30 -p1
|
||||
%patch31 -p1
|
||||
%patch32 -p1
|
||||
%patch33 -p1
|
||||
%patch34 -p1
|
||||
%patch35 -p1
|
||||
%patch36 -p1
|
||||
%patch37 -p1
|
||||
%patch38 -p1
|
||||
%patch39 -p1
|
||||
%patch40 -p1
|
||||
%patch41 -p1
|
||||
%patch42 -p1
|
||||
%patch43 -p1
|
||||
%patch44 -p1
|
||||
%patch45 -p1
|
||||
%patch46 -p1
|
||||
%patch47 -p1
|
||||
%patch48 -p1
|
||||
%patch49 -p1
|
||||
%patch50 -p1
|
||||
%patch51 -p1
|
||||
%patch52 -p1
|
||||
%patch53 -p1
|
||||
%patch54 -p1
|
||||
%patch55 -p1
|
||||
%patch56 -p1
|
||||
%patch57 -p1
|
||||
%patch58 -p1
|
||||
%patch59 -p1
|
||||
%patch60 -p1
|
||||
%patch61 -p1
|
||||
|
||||
%build
|
||||
autoreconf -vfi
|
||||
%configure --enable-mce --enable-aer --enable-sqlite3 --enable-extlog --enable-arm
|
||||
make %{?_smp_mflags}
|
||||
|
||||
%install
|
||||
make install DESTDIR=%{buildroot}
|
||||
install -D -p -m 0644 misc/rasdaemon.service %{buildroot}/%{_unitdir}/rasdaemon.service
|
||||
install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl.service
|
||||
install -D -p -m 0655 labels/* %{buildroot}%{_sysconfdir}/ras/dimm_labels.d
|
||||
rm INSTALL %{buildroot}/usr/include/*.h
|
||||
|
||||
%clean
|
||||
rm -rf %{buildroot}
|
||||
|
||||
%files
|
||||
%defattr(-,root,root)
|
||||
%doc AUTHORS ChangeLog COPYING README TODO
|
||||
%{_sbindir}/rasdaemon
|
||||
%{_sbindir}/ras-mc-ctl
|
||||
%{_mandir}/*/*
|
||||
%{_unitdir}/*.service
|
||||
%{_sharedstatedir}/rasdaemon
|
||||
%{_sysconfdir}/ras/dimm_labels.d
|
||||
|
||||
%changelog
|
||||
* Fri Feb 02 2018 Aristeu Rozanski <aris@redhat.com> 0.4.1-32.el7
|
||||
- Fixed covscan error [1520602]
|
||||
|
||||
* Wed Jan 24 2018 Aristeu Rozanski <aris@redhat.com> 0.4.1-31.el7
|
||||
- Added ARM support [1520602]
|
||||
|
||||
* Thu Oct 19 2017 Aristeu Rozanski <aris@redhat.com> 0.4.1-30.el7
|
||||
- Updated project url [1502400]
|
||||
|
||||
* Wed Aug 23 2017 Aristeu Rozanski <aris@redhat.com> 0.4.1-29.el7
|
||||
- Updating Dell labels [1458938]
|
||||
|
||||
* Tue May 30 2017 Aristeu Rozanski <aris@redhat.com> 0.4.1-28.el7
|
||||
- Bump release [1448113]
|
||||
|
||||
* Tue May 30 2017 Aristeu Rozanski <aris@redhat.com> 0.4.1-28.el7
|
||||
- Identify Knights Mill systems as such [1448113]
|
||||
|
||||
* Mon May 8 2017 Aristeu Rozanski <aris@redhat.com> 0.4.1-27.el7
|
||||
- Fixed error found by covscan in the last patch [1377467]
|
||||
|
||||
* Tue Apr 11 2017 Aristeu Rozanski <aris@redhat.com> 0.4.1-26.el7
|
||||
- add support for Skylake client and server [1377467]
|
||||
|
||||
* Wed Mar 22 2017 Aristeu Rozanski <aris@redhat.com> 0.4.1-25.el7
|
||||
- add support for Knights Mill [1433862]
|
||||
|
||||
* Wed Aug 24 2016 Aristeu Rozanski <aris@redhat.com> 0.4.1-24.el7
|
||||
- don't use MemError Log Enable on Knights Landing [1273326]
|
||||
|
||||
* Fri Apr 15 2016 Aristeu Rozanski <aris@redhat.com> 0.4.1-23.el7
|
||||
- add Broadwell DE/EP/EX MSCOD values [1299512]
|
||||
|
||||
* Mon Feb 08 2016 Aristeu Rozanski <aris@redhat.com> 0.4.1-22.el7
|
||||
- add missing upstream patches for Knights Landing [1273326]
|
||||
- fix documentation typos [1247562]
|
||||
|
||||
* Thu Dec 03 2015 Aristeu Rozanski <aris@redhat.com> 0.4.1-21.el7
|
||||
- add support for Knights Landing [1273326]
|
||||
|
||||
* Wed Sep 30 2015 Aristeu Rozanski <aris@redhat.com> 0.4.1-20.el7
|
||||
- add support for Haswell EP/EX [1267137]
|
||||
|
||||
* Mon Jul 27 2015 Aristeu Rozanski <aris@redhat.com> 0.4.1-19.el7
|
||||
- pull latest fixes by Seiichi Ikarashi from upstream [1243941]
|
||||
|
||||
* Mon Jul 27 2015 Aristeu Rozanski <aris@redhat.com> 0.4.1-18.el7
|
||||
- don't depend on dmidecode on ppc64, fix typo [1244593]
|
||||
|
||||
* Wed Jul 22 2015 Aristeu Rozanski <aris@redhat.com> 0.4.1-17.el7
|
||||
- don't depend on dmidecode on ppc64 [1244593]
|
||||
|
||||
* Wed Jul 08 2015 Aristeu Rozanski <aris@redhat.com> 0.4.1-16.el7
|
||||
- allow label files to specify by system product name [1168340]
|
||||
|
||||
* Wed Jun 03 2015 Aristeu Rozanski <aris@redhat.com> 0.4.1-15.el7
|
||||
- add support to Haswell and newer processors [1221912]
|
||||
|
||||
* Tue Dec 16 2014 Aristeu Rozanski <aris@redhat.com> 0.4.1-14.el7
|
||||
- properly install the labels so it can be packaged [1073090]
|
||||
|
||||
* Tue Dec 02 2014 Aristeu Rozanski <aris@redhat.com> 0.4.1-13.el7
|
||||
- updated labels patch to the new version submitted upstream [1073090]
|
||||
|
||||
* Tue Nov 25 2014 Aristeu Rozanski <aris@redhat.com> 0.4.1-12.el7
|
||||
- fix on how sysfs tree is parsed for DIMMs [1073090]
|
||||
- include new Dell labels [1073090]
|
||||
|
||||
* Fri Oct 10 2014 Aristeu Rozanski <aris@redhat.com> 0.4.1-11.el7
|
||||
- don't require dmidecode for ppc64le [1151385]
|
||||
|
||||
* Fri Aug 22 2014 Aristeu Rozanski <aris@redhat.com> 0.4.1-10.el7
|
||||
- use power64 macro instead, we have a driver enabled on ppc64 [1125663]
|
||||
|
||||
* Mon Aug 18 2014 Aristeu Rozanski <aris@redhat.com> 0.4.1-9.el7
|
||||
- eMCA support [1085519]
|
||||
- enable ppc64le [1125663]
|
||||
|
||||
* Mon Jun 09 2014 Aristeu Rozanski <aris@redhat.com> 0.4.1-8.el7
|
||||
- Revert patch in 0.4.1-7.el7, replaced by a better patch
|
||||
- Fix sizeof() usage on pointer [1035742]
|
||||
- Added macro to build the package with stack protector [1092558]
|
||||
- Handle failures of snprintf() [1035741]
|
||||
- Fix range checking on signed char variables [1035746]
|
||||
- Added aarch64 as architecture [1070973]
|
||||
- Start recording by default [1117366] [1117367]
|
||||
|
||||
* Fri Jan 17 2014 Aristeu Rozanski <aris@redhat.com> 0.4.1-7.el7
|
||||
- Fixed rasdaemon.service executable path [1043478]
|
||||
|
||||
* Fri Dec 27 2013 Daniel Mach <dmach@redhat.com> - 0.4.1-6
|
||||
- Mass rebuild 2013-12-27
|
||||
|
||||
* Tue Aug 20 2013 Aristeu Rozanski <aris@redhat.com> 0.4.1-5.el7
|
||||
- Applied Jarod Wilson fixes required to pass rpmlint tests
|
||||
|
||||
* Thu Aug 15 2013 Aristeu Rozanski <aris@redhat.com> 0.4.1-4.el7
|
||||
- Rebuild
|
||||
|
||||
* Sun Jun 2 2013 Peter Robinson <pbrobinson@fedoraproject.org> 0.4.1-3
|
||||
- ARM has EDAC drivers (currently supported in Calxeda highbank)
|
||||
|
||||
* Tue May 28 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.4.1-2
|
||||
- Fix the name of perl-DBD-SQLite package
|
||||
|
||||
* Tue May 28 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.4.1-1
|
||||
- Updated to version 0.4.1 which contains some bug fixes
|
||||
|
||||
* Mon May 27 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.4.0-1
|
||||
- Updated to version 0.4.0 and added support for mce, aer and sqlite3 storage
|
||||
|
||||
* Mon May 20 2013 Mauro Carvalho Chehab <mchehab@redhat.com> 0.3.0-1
|
||||
- Package created
|
||||
|
Loading…
Reference in New Issue