aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorEran Ben Elisha <eranbe@mellanox.com>2019-01-17 23:59:12 +0200
committerDavid S. Miller <davem@davemloft.net>2019-01-18 14:51:22 -0800
commitc7af343b4e33578b7de91786a3f639c8cfa0d97b (patch)
treee60cab858369020cc94ce5f569e82d53a62e57f0 /net
parentdevlink: Add health reporter create/destroy functionality (diff)
downloadlinux-dev-c7af343b4e33578b7de91786a3f639c8cfa0d97b.tar.xz
linux-dev-c7af343b4e33578b7de91786a3f639c8cfa0d97b.zip
devlink: Add health report functionality
Upon error discovery, every driver can report it to the devlink health mechanism via the devlink_health_report function, using the appropriate reporter registered to it. The driver can pass error-specific context which will be delivered to it as part of the dump / recovery callbacks. Once an error is reported, devlink health will perform the following actions: * A log is sent to the kernel trace events buffer * Health status and statistics are updated for the reporter instance * An object dump is taken and stored at the reporter instance (as long as there is no other dump which is already stored) * An auto recovery attempt is made, depending on: - Auto Recovery configuration - Grace period vs. time since last recovery Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com> Reviewed-by: Moshe Shemesh <moshe@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r--net/core/devlink.c93
1 files changed, 93 insertions, 0 deletions
diff --git a/net/core/devlink.c b/net/core/devlink.c
index fec169a28dba..943d3e7dea6a 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -4110,6 +4110,16 @@ struct devlink_health_reporter {
u64 graceful_period;
bool auto_recover;
u8 health_state;
+ u8 dump_avail;
+ u64 dump_ts;
+ u64 error_count;
+ u64 recovery_count;
+ u64 last_recovery_ts;
+};
+
+enum devlink_health_reporter_state { /* values stored in devlink_health_reporter.health_state */
+ DEVLINK_HEALTH_REPORTER_STATE_HEALTHY, /* set after the recover callback succeeds */
+ DEVLINK_HEALTH_REPORTER_STATE_ERROR, /* set by devlink_health_report() when a reported error is not aborted */
};
void *
@@ -4224,6 +4234,89 @@ devlink_health_reporter_destroy(struct devlink_health_reporter *reporter)
}
EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy);
+/*
+ * Run the reporter's recover callback, if it provides one.
+ *
+ * Returns -EOPNOTSUPP when ops->recover is not set, otherwise whatever
+ * the callback returned. Only when the callback returns 0 is the
+ * reporter's bookkeeping updated: the recovery counter is incremented,
+ * the state goes back to healthy and the recovery timestamp is set to
+ * the current jiffies value.
+ */
+static int
+devlink_health_reporter_recover(struct devlink_health_reporter *reporter,
+				void *priv_ctx)
+{
+	int ret = -EOPNOTSUPP;
+
+	if (reporter->ops->recover)
+		ret = reporter->ops->recover(reporter, priv_ctx);
+
+	if (ret == 0) {
+		reporter->recovery_count++;
+		reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY;
+		reporter->last_recovery_ts = jiffies;
+	}
+
+	return ret;
+}
+
+/*
+ * Take a dump via the reporter's dump callback and keep it in
+ * reporter->dump_buffers_array.
+ *
+ * Nothing is done, and 0 is returned, when the reporter has no dump
+ * callback or when a dump is already stored (dump_avail set), so an
+ * existing dump is never overwritten here. Otherwise the buffers are
+ * reset, the callback is invoked, and on success the dump is flagged as
+ * available and stamped with the current jiffies value.
+ *
+ * Returns 0 on success or no-op, or the error returned by ops->dump.
+ */
+static int devlink_health_do_dump(struct devlink_health_reporter *reporter,
+				  void *priv_ctx)
+{
+	int ret;
+
+	if (!reporter->ops->dump || reporter->dump_avail)
+		return 0;
+
+	devlink_health_buffers_reset(reporter->dump_buffers_array,
+				     DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size));
+
+	ret = reporter->ops->dump(reporter, reporter->dump_buffers_array,
+				  DEVLINK_HEALTH_BUFFER_SIZE,
+				  DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size),
+				  priv_ctx);
+	if (ret)
+		return ret;
+
+	reporter->dump_avail = true;
+	reporter->dump_ts = jiffies;
+
+	return 0;
+}
+
+/**
+ * devlink_health_report - report an error on a health reporter
+ * @reporter: reporter instance the error is reported on
+ * @msg: message logged to the devlink health report trace event
+ * @priv_ctx: driver context passed on to the dump and recover callbacks
+ *
+ * Logs the error to the trace buffer, increments the reporter's error
+ * counter, stores a dump if none is stored yet and, when auto recovery
+ * is enabled, attempts a recovery.
+ *
+ * With auto recovery enabled, the report is aborted when the reporter is
+ * not in healthy state, or when the last recovery happened less than
+ * graceful_period milliseconds ago. The grace period is only enforced
+ * once at least one recovery has taken place: before that
+ * last_recovery_ts holds no real timestamp, and comparing against it
+ * could wrongly abort the very first recovery.
+ *
+ * Return: 0 on success or when auto recovery is disabled, -ECANCELED
+ * when the report is aborted, otherwise the error returned by the
+ * recovery attempt. A failure to take the dump is not reported.
+ */
+int devlink_health_report(struct devlink_health_reporter *reporter,
+			  const char *msg, void *priv_ctx)
+{
+	struct devlink *devlink = reporter->devlink;
+	int err = 0;
+
+	/* write a log message of the current error */
+	WARN_ON(!msg);
+	trace_devlink_health_report(devlink, reporter->ops->name, msg);
+	reporter->error_count++;
+
+	/* abort if the previous error wasn't recovered, or if a previous
+	 * recovery is still within its grace period; the grace period does
+	 * not apply to the first recovery
+	 */
+	if (reporter->auto_recover &&
+	    (reporter->health_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY ||
+	     (reporter->recovery_count &&
+	      jiffies - reporter->last_recovery_ts <
+	      msecs_to_jiffies(reporter->graceful_period)))) {
+		trace_devlink_health_recover_aborted(devlink,
+						     reporter->ops->name,
+						     reporter->health_state,
+						     jiffies -
+						     reporter->last_recovery_ts);
+		return -ECANCELED;
+	}
+
+	reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR;
+
+	mutex_lock(&reporter->dump_lock);
+	/* store current dump of current error, for later analysis;
+	 * the dump is best effort, so its result is deliberately ignored
+	 */
+	devlink_health_do_dump(reporter, priv_ctx);
+	mutex_unlock(&reporter->dump_lock);
+
+	if (reporter->auto_recover)
+		err = devlink_health_reporter_recover(reporter, priv_ctx);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(devlink_health_report);
+
static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },