aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ceph/mdsmap.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ceph/mdsmap.c')
-rw-r--r--fs/ceph/mdsmap.c91
1 files changed, 52 insertions, 39 deletions
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 471bac335fae..889627817e52 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -13,30 +13,25 @@
#include "super.h"
+#define CEPH_MDS_IS_READY(i, ignore_laggy) \
+ (m->m_info[i].state > 0 && ignore_laggy ? true : !m->m_info[i].laggy)
-/*
- * choose a random mds that is "up" (i.e. has a state > 0), or -1.
- */
-int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
+static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy)
{
int n = 0;
int i, j;
- /* special case for one mds */
- if (1 == m->m_num_mds && m->m_info[0].state > 0)
- return 0;
-
/* count */
- for (i = 0; i < m->m_num_mds; i++)
- if (m->m_info[i].state > 0)
+ for (i = 0; i < m->possible_max_rank; i++)
+ if (CEPH_MDS_IS_READY(i, ignore_laggy))
n++;
if (n == 0)
return -1;
/* pick */
n = prandom_u32() % n;
- for (j = 0, i = 0; i < m->m_num_mds; i++) {
- if (m->m_info[i].state > 0)
+ for (j = 0, i = 0; i < m->possible_max_rank; i++) {
+ if (CEPH_MDS_IS_READY(i, ignore_laggy))
j++;
if (j > n)
break;
@@ -45,6 +40,20 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
return i;
}
+/*
+ * choose a random mds that is "up" (i.e. has a state > 0), or -1.
+ */
+int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
+{
+ int mds;
+
+ mds = __mdsmap_get_random_mds(m, false);
+ if (mds == m->possible_max_rank || mds == -1)
+ mds = __mdsmap_get_random_mds(m, true);
+
+ return mds == m->possible_max_rank ? -1 : mds;
+}
+
#define __decode_and_drop_type(p, end, type, bad) \
do { \
if (*p + sizeof(type) > end) \
@@ -138,14 +147,29 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
m->m_session_autoclose = ceph_decode_32(p);
m->m_max_file_size = ceph_decode_64(p);
m->m_max_mds = ceph_decode_32(p);
- m->m_num_mds = m->m_max_mds;
- m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS);
+ /*
+ * pick out the active nodes as the m_num_active_mds, the
+ * m_num_active_mds maybe larger than m_max_mds when decreasing
+ * the max_mds in cluster side, in other case it should less
+ * than or equal to m_max_mds.
+ */
+ m->m_num_active_mds = n = ceph_decode_32(p);
+
+ /*
+ * the possible max rank, it maybe larger than the m_num_active_mds,
+ * for example if the mds_max == 2 in the cluster, when the MDS(0)
+ * was laggy and being replaced by a new MDS, we will temporarily
+ * receive a new mds map with n_num_mds == 1 and the active MDS(1),
+ * and the mds rank >= m_num_active_mds.
+ */
+ m->possible_max_rank = max(m->m_num_active_mds, m->m_max_mds);
+
+ m->m_info = kcalloc(m->possible_max_rank, sizeof(*m->m_info), GFP_NOFS);
if (!m->m_info)
goto nomem;
/* pick out active nodes from mds_info (state > 0) */
- n = ceph_decode_32(p);
for (i = 0; i < n; i++) {
u64 global_id;
u32 namelen;
@@ -215,18 +239,15 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
ceph_mds_state_name(state),
laggy ? "(laggy)" : "");
- if (mds < 0 || state <= 0)
+ if (mds < 0 || mds >= m->possible_max_rank) {
+ pr_warn("mdsmap_decode got incorrect mds(%d)\n", mds);
continue;
+ }
- if (mds >= m->m_num_mds) {
- int new_num = max(mds + 1, m->m_num_mds * 2);
- void *new_m_info = krealloc(m->m_info,
- new_num * sizeof(*m->m_info),
- GFP_NOFS | __GFP_ZERO);
- if (!new_m_info)
- goto nomem;
- m->m_info = new_m_info;
- m->m_num_mds = new_num;
+ if (state <= 0) {
+ pr_warn("mdsmap_decode got incorrect state(%s)\n",
+ ceph_mds_state_name(state));
+ continue;
}
info = &m->m_info[mds];
@@ -247,14 +268,6 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
info->export_targets = NULL;
}
}
- if (m->m_num_mds > m->m_max_mds) {
- /* find max up mds */
- for (i = m->m_num_mds; i >= m->m_max_mds; i--) {
- if (i == 0 || m->m_info[i-1].state > 0)
- break;
- }
- m->m_num_mds = i;
- }
/* pg_pools */
ceph_decode_32_safe(p, end, n, bad);
@@ -296,14 +309,14 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
for (i = 0; i < n; i++) {
s32 mds = ceph_decode_32(p);
- if (mds >= 0 && mds < m->m_num_mds) {
+ if (mds >= 0 && mds < m->possible_max_rank) {
if (m->m_info[mds].laggy)
num_laggy++;
}
}
m->m_num_laggy = num_laggy;
- if (n > m->m_num_mds) {
+ if (n > m->possible_max_rank) {
void *new_m_info = krealloc(m->m_info,
n * sizeof(*m->m_info),
GFP_NOFS | __GFP_ZERO);
@@ -311,7 +324,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
goto nomem;
m->m_info = new_m_info;
}
- m->m_num_mds = n;
+ m->possible_max_rank = n;
}
/* inc */
@@ -382,7 +395,7 @@ void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
{
int i;
- for (i = 0; i < m->m_num_mds; i++)
+ for (i = 0; i < m->possible_max_rank; i++)
kfree(m->m_info[i].export_targets);
kfree(m->m_info);
kfree(m->m_data_pg_pools);
@@ -396,9 +409,9 @@ bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m)
return false;
if (m->m_damaged)
return false;
- if (m->m_num_laggy > 0)
+ if (m->m_num_laggy == m->m_num_active_mds)
return false;
- for (i = 0; i < m->m_num_mds; i++) {
+ for (i = 0; i < m->possible_max_rank; i++) {
if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE)
nr_active++;
}