aboutsummaryrefslogtreecommitdiffstats
path: root/include/linux/ceph
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux/ceph')
-rw-r--r--include/linux/ceph/auth.h72
-rw-r--r--include/linux/ceph/ceph_features.h21
-rw-r--r--include/linux/ceph/ceph_fs.h79
-rw-r--r--include/linux/ceph/debugfs.h14
-rw-r--r--include/linux/ceph/decode.h8
-rw-r--r--include/linux/ceph/libceph.h37
-rw-r--r--include/linux/ceph/mdsmap.h3
-rw-r--r--include/linux/ceph/messenger.h301
-rw-r--r--include/linux/ceph/mon_client.h4
-rw-r--r--include/linux/ceph/msgr.h66
-rw-r--r--include/linux/ceph/osd_client.h54
-rw-r--r--include/linux/ceph/osdmap.h41
-rw-r--r--include/linux/ceph/rados.h22
13 files changed, 565 insertions, 157 deletions
diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h
index 6728c2ee0205..6b138fa97db8 100644
--- a/include/linux/ceph/auth.h
+++ b/include/linux/ceph/auth.h
@@ -32,8 +32,6 @@ struct ceph_auth_handshake {
};
struct ceph_auth_client_ops {
- const char *name;
-
/*
* true if we are authenticated and can connect to
* services.
@@ -52,8 +50,10 @@ struct ceph_auth_client_ops {
* another request.
*/
int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
- int (*handle_reply)(struct ceph_auth_client *ac, int result,
- void *buf, void *end);
+ int (*handle_reply)(struct ceph_auth_client *ac, u64 global_id,
+ void *buf, void *end, u8 *session_key,
+ int *session_key_len, u8 *con_secret,
+ int *con_secret_len);
/*
* Create authorizer for connecting to a service, and verify
@@ -69,7 +69,10 @@ struct ceph_auth_client_ops {
void *challenge_buf,
int challenge_buf_len);
int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
- struct ceph_authorizer *a);
+ struct ceph_authorizer *a,
+ void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len);
void (*invalidate_authorizer)(struct ceph_auth_client *ac,
int peer_type);
@@ -95,11 +98,17 @@ struct ceph_auth_client {
const struct ceph_crypto_key *key; /* our secret key */
unsigned want_keys; /* which services we want */
+ int preferred_mode; /* CEPH_CON_MODE_* */
+ int fallback_mode; /* ditto */
+
struct mutex mutex;
};
-extern struct ceph_auth_client *ceph_auth_init(const char *name,
- const struct ceph_crypto_key *key);
+void ceph_auth_set_global_id(struct ceph_auth_client *ac, u64 global_id);
+
+struct ceph_auth_client *ceph_auth_init(const char *name,
+ const struct ceph_crypto_key *key,
+ const int *con_modes);
extern void ceph_auth_destroy(struct ceph_auth_client *ac);
extern void ceph_auth_reset(struct ceph_auth_client *ac);
@@ -113,21 +122,22 @@ int ceph_auth_entity_name_encode(const char *name, void **p, void *end);
extern int ceph_build_auth(struct ceph_auth_client *ac,
void *msg_buf, size_t msg_len);
-
extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
-extern int ceph_auth_create_authorizer(struct ceph_auth_client *ac,
- int peer_type,
- struct ceph_auth_handshake *auth);
+
+int __ceph_auth_get_authorizer(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ int peer_type, bool force_new,
+ int *proto, int *pref_mode, int *fallb_mode);
void ceph_auth_destroy_authorizer(struct ceph_authorizer *a);
-extern int ceph_auth_update_authorizer(struct ceph_auth_client *ac,
- int peer_type,
- struct ceph_auth_handshake *a);
int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac,
struct ceph_authorizer *a,
void *challenge_buf,
int challenge_buf_len);
-extern int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac,
- struct ceph_authorizer *a);
+int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a,
+ void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len);
extern void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac,
int peer_type);
@@ -147,4 +157,34 @@ int ceph_auth_check_message_signature(struct ceph_auth_handshake *auth,
return auth->check_message_signature(auth, msg);
return 0;
}
+
+int ceph_auth_get_request(struct ceph_auth_client *ac, void *buf, int buf_len);
+int ceph_auth_handle_reply_more(struct ceph_auth_client *ac, void *reply,
+ int reply_len, void *buf, int buf_len);
+int ceph_auth_handle_reply_done(struct ceph_auth_client *ac,
+ u64 global_id, void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len);
+bool ceph_auth_handle_bad_method(struct ceph_auth_client *ac,
+ int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt);
+
+int ceph_auth_get_authorizer(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ int peer_type, void *buf, int *buf_len);
+int ceph_auth_handle_svc_reply_more(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ void *reply, int reply_len,
+ void *buf, int *buf_len);
+int ceph_auth_handle_svc_reply_done(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len);
+bool ceph_auth_handle_bad_authorizer(struct ceph_auth_client *ac,
+ int peer_type, int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt);
+
#endif
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index 39e6f4c57580..3a47acd9cc14 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -8,17 +8,18 @@
* feature. Base case is 1 (first use).
*/
#define CEPH_FEATURE_INCARNATION_1 (0ull)
-#define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // CEPH_FEATURE_SERVER_JEWEL
+#define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // SERVER_JEWEL
+#define CEPH_FEATURE_INCARNATION_3 ((1ull<<57)|(1ull<<28)) // SERVER_MIMIC
#define DEFINE_CEPH_FEATURE(bit, incarnation, name) \
- static const uint64_t CEPH_FEATURE_##name = (1ULL<<bit); \
- static const uint64_t CEPH_FEATUREMASK_##name = \
+ static const uint64_t __maybe_unused CEPH_FEATURE_##name = (1ULL<<bit); \
+ static const uint64_t __maybe_unused CEPH_FEATUREMASK_##name = \
(1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation);
/* this bit is ignored but still advertised by release *when* */
#define DEFINE_CEPH_FEATURE_DEPRECATED(bit, incarnation, name, when) \
- static const uint64_t DEPRECATED_CEPH_FEATURE_##name = (1ULL<<bit); \
- static const uint64_t DEPRECATED_CEPH_FEATUREMASK_##name = \
+ static const uint64_t __maybe_unused DEPRECATED_CEPH_FEATURE_##name = (1ULL<<bit); \
+ static const uint64_t __maybe_unused DEPRECATED_CEPH_FEATUREMASK_##name = \
(1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation);
/*
@@ -58,7 +59,7 @@
* because 10.2.z (jewel) did not care if its peers advertised this
* feature bit.
*
- * - In the second phase we stop advertising the the bit and call it
+ * - In the second phase we stop advertising the bit and call it
* RETIRED. This can normally be done in the *next* major release
* following the one in which we marked the feature DEPRECATED. In
* the above example, for 12.0.z (luminous) we can say:
@@ -75,7 +76,7 @@
DEFINE_CEPH_FEATURE( 0, 1, UID)
DEFINE_CEPH_FEATURE( 1, 1, NOSRCADDR)
DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS)
-
+DEFINE_CEPH_FEATURE( 2, 3, SERVER_NAUTILUS)
DEFINE_CEPH_FEATURE( 3, 1, FLOCK)
DEFINE_CEPH_FEATURE( 4, 1, SUBSCRIBE2)
DEFINE_CEPH_FEATURE( 5, 1, MONNAMES)
@@ -114,7 +115,7 @@ DEFINE_CEPH_FEATURE(25, 1, CRUSH_TUNABLES2)
DEFINE_CEPH_FEATURE(26, 1, CREATEPOOLID)
DEFINE_CEPH_FEATURE(27, 1, REPLY_CREATE_INODE)
DEFINE_CEPH_FEATURE_RETIRED(28, 1, OSD_HBMSGS, HAMMER, JEWEL)
-DEFINE_CEPH_FEATURE(28, 2, SERVER_M)
+DEFINE_CEPH_FEATURE(28, 2, SERVER_MIMIC)
DEFINE_CEPH_FEATURE(29, 1, MDSENC)
DEFINE_CEPH_FEATURE(30, 1, OSDHASHPSPOOL)
DEFINE_CEPH_FEATURE(31, 1, MON_SINGLE_PAXOS) // deprecate me
@@ -177,13 +178,16 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin
*/
#define CEPH_FEATURES_SUPPORTED_DEFAULT \
(CEPH_FEATURE_NOSRCADDR | \
+ CEPH_FEATURE_SERVER_NAUTILUS | \
CEPH_FEATURE_FLOCK | \
CEPH_FEATURE_SUBSCRIBE2 | \
+ CEPH_FEATURE_MONNAMES | \
CEPH_FEATURE_RECONNECT_SEQ | \
CEPH_FEATURE_DIRLAYOUTHASH | \
CEPH_FEATURE_PGID64 | \
CEPH_FEATURE_PGPOOL3 | \
CEPH_FEATURE_OSDENC | \
+ CEPH_FEATURE_MONENC | \
CEPH_FEATURE_CRUSH_TUNABLES | \
CEPH_FEATURE_SERVER_LUMINOUS | \
CEPH_FEATURE_RESEND_ON_SPLIT | \
@@ -193,6 +197,7 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin
CEPH_FEATURE_MSG_AUTH | \
CEPH_FEATURE_CRUSH_TUNABLES2 | \
CEPH_FEATURE_REPLY_CREATE_INODE | \
+ CEPH_FEATURE_SERVER_MIMIC | \
CEPH_FEATURE_MDSENC | \
CEPH_FEATURE_OSDHASHPSPOOL | \
CEPH_FEATURE_OSD_CACHEPOOL | \
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index cb21c5cf12c3..49586ff26152 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -28,8 +28,8 @@
#define CEPH_INO_ROOT 1
-#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
-#define CEPH_INO_DOTDOT 3 /* used by ceph fuse for parent (..) */
+#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
+#define CEPH_INO_GLOBAL_SNAPREALM 3 /* global dummy snaprealm */
/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
#define CEPH_MAX_MON 31
@@ -93,8 +93,19 @@ struct ceph_dir_layout {
#define CEPH_AUTH_NONE 0x1
#define CEPH_AUTH_CEPHX 0x2
+#define CEPH_AUTH_MODE_NONE 0
+#define CEPH_AUTH_MODE_AUTHORIZER 1
+#define CEPH_AUTH_MODE_MON 10
+
+/* msgr2 protocol modes */
+#define CEPH_CON_MODE_UNKNOWN 0x0
+#define CEPH_CON_MODE_CRC 0x1
+#define CEPH_CON_MODE_SECURE 0x2
+
#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
+const char *ceph_auth_proto_name(int proto);
+const char *ceph_con_mode_name(int mode);
/*********************************************
* message layer
@@ -130,6 +141,7 @@ struct ceph_dir_layout {
#define CEPH_MSG_CLIENT_REQUEST 24
#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
#define CEPH_MSG_CLIENT_REPLY 26
+#define CEPH_MSG_CLIENT_METRICS 29
#define CEPH_MSG_CLIENT_CAPS 0x310
#define CEPH_MSG_CLIENT_LEASE 0x311
#define CEPH_MSG_CLIENT_SNAP 0x312
@@ -287,8 +299,11 @@ enum {
CEPH_SESSION_FLUSHMSG_ACK,
CEPH_SESSION_FORCE_RO,
CEPH_SESSION_REJECT,
+ CEPH_SESSION_REQUEST_FLUSH_MDLOG,
};
+#define CEPH_SESSION_BLOCKLISTED (1 << 0) /* session blocklisted */
+
extern const char *ceph_session_op_name(int op);
struct ceph_mds_session_head {
@@ -313,6 +328,7 @@ enum {
CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
CEPH_MDS_OP_LOOKUPINO = 0x00104,
CEPH_MDS_OP_LOOKUPNAME = 0x00105,
+ CEPH_MDS_OP_GETVXATTR = 0x00106,
CEPH_MDS_OP_SETXATTR = 0x01105,
CEPH_MDS_OP_RMXATTR = 0x01106,
@@ -417,12 +433,13 @@ union ceph_mds_request_args {
__le32 stripe_unit; /* layout for newly created file */
__le32 stripe_count; /* ... */
__le32 object_size;
- __le32 file_replication;
- __le32 mask; /* CEPH_CAP_* */
- __le32 old_size;
+ __le32 pool;
+ __le32 mask; /* CEPH_CAP_* */
+ __le64 old_size;
} __attribute__ ((packed)) open;
struct {
__le32 flags;
+ __le32 osdmap_epoch; /* used for setting file/dir layouts */
} __attribute__ ((packed)) setxattr;
struct {
struct ceph_file_layout_legacy layout;
@@ -444,10 +461,25 @@ union ceph_mds_request_args {
} __attribute__ ((packed)) lookupino;
} __attribute__ ((packed));
-#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
-#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
+union ceph_mds_request_args_ext {
+ union ceph_mds_request_args old;
+ struct {
+ __le32 mode;
+ __le32 uid;
+ __le32 gid;
+ struct ceph_timespec mtime;
+ struct ceph_timespec atime;
+ __le64 size, old_size; /* old_size needed by truncate */
+ __le32 mask; /* CEPH_SETATTR_* */
+ struct ceph_timespec btime;
+ } __attribute__ ((packed)) setattr_ext;
+};
-struct ceph_mds_request_head {
+#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
+#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
+#define CEPH_MDS_FLAG_ASYNC 4 /* request is asynchronous */
+
+struct ceph_mds_request_head_old {
__le64 oldest_client_tid;
__le32 mdsmap_epoch; /* on client */
__le32 flags; /* CEPH_MDS_FLAG_* */
@@ -460,6 +492,22 @@ struct ceph_mds_request_head {
union ceph_mds_request_args args;
} __attribute__ ((packed));
+#define CEPH_MDS_REQUEST_HEAD_VERSION 1
+
+struct ceph_mds_request_head {
+ __le16 version; /* struct version */
+ __le64 oldest_client_tid;
+ __le32 mdsmap_epoch; /* on client */
+ __le32 flags; /* CEPH_MDS_FLAG_* */
+ __u8 num_retry, num_fwd; /* count retry, fwd attempts */
+ __le16 num_releases; /* # include cap/lease release records */
+ __le32 op; /* mds op code */
+ __le32 caller_uid, caller_gid;
+ __le64 ino; /* use this ino for openc, mkdir, mknod,
+ etc. (if replaying) */
+ union ceph_mds_request_args_ext args;
+} __attribute__ ((packed));
+
/* cap/lease release record */
struct ceph_mds_request_release {
__le64 ino, cap_id; /* ino and unique cap id */
@@ -530,6 +578,9 @@ struct ceph_mds_reply_lease {
__le32 seq;
} __attribute__ ((packed));
+#define CEPH_LEASE_VALID (1 | 2) /* old and new bit values */
+#define CEPH_LEASE_PRIMARY_LINK 4 /* primary linkage */
+
struct ceph_mds_reply_dirfrag {
__le32 frag; /* fragment */
__le32 auth; /* auth mds, if this is a delegation point */
@@ -564,6 +615,7 @@ struct ceph_filelock {
#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
#define CEPH_FILE_MODE_BITS 4
+#define CEPH_FILE_MODE_MASK ((1 << CEPH_FILE_MODE_BITS) - 1)
int ceph_flags_to_mode(int flags);
@@ -655,10 +707,19 @@ int ceph_flags_to_mode(int flags);
#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
CEPH_CAP_PIN)
+#define CEPH_CAP_ALL_FILE (CEPH_CAP_PIN | CEPH_CAP_ANY_SHARED | \
+ CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL | \
+ CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)
#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
CEPH_LOCK_IXATTR)
+/* cap masks async dir operations */
+#define CEPH_CAP_DIR_CREATE CEPH_CAP_FILE_CACHE
+#define CEPH_CAP_DIR_UNLINK CEPH_CAP_FILE_RD
+#define CEPH_CAP_ANY_DIR_OPS (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD | \
+ CEPH_CAP_FILE_WREXTEND | CEPH_CAP_FILE_LAZYIO)
+
int ceph_caps_for_mode(int mode);
enum {
@@ -707,7 +768,7 @@ struct ceph_mds_caps {
__le32 xattr_len;
__le64 xattr_version;
- /* filelock */
+ /* a union of non-export and export bodies. */
__le64 size, max_size, truncate_size;
__le32 truncate_seq;
struct ceph_timespec mtime, atime, ctime;
diff --git a/include/linux/ceph/debugfs.h b/include/linux/ceph/debugfs.h
index cf5e840eec71..8b3a1a7a953a 100644
--- a/include/linux/ceph/debugfs.h
+++ b/include/linux/ceph/debugfs.h
@@ -2,22 +2,8 @@
#ifndef _FS_CEPH_DEBUGFS_H
#define _FS_CEPH_DEBUGFS_H
-#include <linux/ceph/ceph_debug.h>
#include <linux/ceph/types.h>
-#define CEPH_DEFINE_SHOW_FUNC(name) \
-static int name##_open(struct inode *inode, struct file *file) \
-{ \
- return single_open(file, name, inode->i_private); \
-} \
- \
-static const struct file_operations name##_fops = { \
- .open = name##_open, \
- .read = seq_read, \
- .llseek = seq_lseek, \
- .release = single_release, \
-};
-
/* debugfs.c */
extern void ceph_debugfs_init(void);
extern void ceph_debugfs_cleanup(void);
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index 450384fe487c..04f3ace5787b 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -220,6 +220,8 @@ static inline void ceph_encode_timespec64(struct ceph_timespec *tv,
*/
#define CEPH_ENTITY_ADDR_TYPE_NONE 0
#define CEPH_ENTITY_ADDR_TYPE_LEGACY __cpu_to_le32(1)
+#define CEPH_ENTITY_ADDR_TYPE_MSGR2 __cpu_to_le32(2)
+#define CEPH_ENTITY_ADDR_TYPE_ANY __cpu_to_le32(3)
static inline void ceph_encode_banner_addr(struct ceph_entity_addr *a)
{
@@ -239,6 +241,12 @@ static inline void ceph_decode_banner_addr(struct ceph_entity_addr *a)
extern int ceph_decode_entity_addr(void **p, void *end,
struct ceph_entity_addr *addr);
+int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2,
+ struct ceph_entity_addr *addr);
+
+int ceph_entity_addr_encoding_len(const struct ceph_entity_addr *addr);
+void ceph_encode_entity_addr(void **p, const struct ceph_entity_addr *addr);
+
/*
* encoders
*/
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index ec73ebc4827d..00af2c98da75 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -31,11 +31,11 @@
#define CEPH_OPT_FSID (1<<0)
#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
-#define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */
-#define CEPH_OPT_NOMSGAUTH (1<<4) /* don't require msg signing feat */
-#define CEPH_OPT_TCP_NODELAY (1<<5) /* TCP_NODELAY on TCP sockets */
-#define CEPH_OPT_NOMSGSIGN (1<<6) /* don't sign msgs */
-#define CEPH_OPT_ABORT_ON_FULL (1<<7) /* abort w/ ENOSPC when full */
+#define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes (msgr1) */
+#define CEPH_OPT_TCP_NODELAY (1<<4) /* TCP_NODELAY on TCP sockets */
+#define CEPH_OPT_NOMSGSIGN (1<<5) /* don't sign msgs (msgr1) */
+#define CEPH_OPT_ABORT_ON_FULL (1<<6) /* abort w/ ENOSPC when full */
+#define CEPH_OPT_RXBOUNCE (1<<7) /* double-buffer read data */
#define CEPH_OPT_DEFAULT (CEPH_OPT_TCP_NODELAY)
@@ -52,6 +52,8 @@ struct ceph_options {
unsigned long osd_idle_ttl; /* jiffies */
unsigned long osd_keepalive_timeout; /* jiffies */
unsigned long osd_request_timeout; /* jiffies */
+ u32 read_from_replica; /* CEPH_OSD_FLAG_BALANCE/LOCALIZE_READS */
+ int con_modes[2]; /* CEPH_CON_MODE_* */
/*
* any type that can't be simply compared or doesn't need
@@ -64,6 +66,7 @@ struct ceph_options {
int num_mon;
char *name;
struct ceph_crypto_key *key;
+ struct rb_root crush_locs;
};
/*
@@ -73,6 +76,7 @@ struct ceph_options {
#define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000)
#define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000)
#define CEPH_OSD_REQUEST_TIMEOUT_DEFAULT 0 /* no timeout */
+#define CEPH_READ_FROM_REPLICA_DEFAULT 0 /* read from primary */
#define CEPH_MONC_HUNT_INTERVAL msecs_to_jiffies(3 * 1000)
#define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000)
@@ -80,6 +84,7 @@ struct ceph_options {
#define CEPH_MONC_HUNT_BACKOFF 2
#define CEPH_MONC_HUNT_MAX_MULT 10
+#define CEPH_MSG_MAX_CONTROL_LEN (16*1024*1024)
#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
#define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024)
@@ -101,6 +106,7 @@ enum {
CEPH_MOUNT_UNMOUNTING,
CEPH_MOUNT_UNMOUNTED,
CEPH_MOUNT_SHUTDOWN,
+ CEPH_MOUNT_RECOVER,
};
static inline unsigned long ceph_timeout_jiffies(unsigned long timeout)
@@ -147,6 +153,10 @@ struct ceph_client {
#define from_msgr(ms) container_of(ms, struct ceph_client, msgr)
+static inline bool ceph_msgr2(struct ceph_client *client)
+{
+ return client->options->con_modes[0] != CEPH_CON_MODE_UNKNOWN;
+}
/*
* snapshots
@@ -188,7 +198,7 @@ static inline int calc_pages_for(u64 off, u64 len)
#define RB_CMP3WAY(a, b) ((a) < (b) ? -1 : (a) > (b))
#define DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, cmpexp, keyexp, nodefld) \
-static void insert_##name(struct rb_root *root, type *t) \
+static bool __insert_##name(struct rb_root *root, type *t) \
{ \
struct rb_node **n = &root->rb_node; \
struct rb_node *parent = NULL; \
@@ -206,11 +216,17 @@ static void insert_##name(struct rb_root *root, type *t) \
else if (cmp > 0) \
n = &(*n)->rb_right; \
else \
- BUG(); \
+ return false; \
} \
\
rb_link_node(&t->nodefld, parent, n); \
rb_insert_color(&t->nodefld, root); \
+ return true; \
+} \
+static void __maybe_unused insert_##name(struct rb_root *root, type *t) \
+{ \
+ if (!__insert_##name(root, t)) \
+ BUG(); \
} \
static void erase_##name(struct rb_root *root, type *t) \
{ \
@@ -268,23 +284,26 @@ DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)
extern struct kmem_cache *ceph_inode_cachep;
extern struct kmem_cache *ceph_cap_cachep;
+extern struct kmem_cache *ceph_cap_snap_cachep;
extern struct kmem_cache *ceph_cap_flush_cachep;
extern struct kmem_cache *ceph_dentry_cachep;
extern struct kmem_cache *ceph_file_cachep;
extern struct kmem_cache *ceph_dir_file_cachep;
+extern struct kmem_cache *ceph_mds_request_cachep;
+extern mempool_t *ceph_wb_pagevec_pool;
/* ceph_common.c */
extern bool libceph_compatible(void *data);
extern const char *ceph_msg_type_name(int type);
extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
-extern void *ceph_kvmalloc(size_t size, gfp_t flags);
+extern int ceph_parse_fsid(const char *str, struct ceph_fsid *fsid);
struct fs_parameter;
struct fc_log;
struct ceph_options *ceph_alloc_options(void);
int ceph_parse_mon_ips(const char *buf, size_t len, struct ceph_options *opt,
- struct fc_log *l);
+ struct fc_log *l, char delim);
int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt,
struct fc_log *l);
int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
index 35d385296fbb..4c3e0648dc27 100644
--- a/include/linux/ceph/mdsmap.h
+++ b/include/linux/ceph/mdsmap.h
@@ -25,6 +25,7 @@ struct ceph_mdsmap {
u32 m_session_timeout; /* seconds */
u32 m_session_autoclose; /* seconds */
u64 m_max_file_size;
+ u64 m_max_xattr_size; /* maximum size for xattrs blob */
u32 m_max_mds; /* expected up:active mds number */
u32 m_num_active_mds; /* actual up:active mds number */
u32 possible_max_rank; /* possible max rank index */
@@ -64,7 +65,7 @@ static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
}
extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
-extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
+struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2);
extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
extern bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m);
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index c4458dc6a757..99c1726be6ee 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -3,6 +3,7 @@
#define __FS_CEPH_MESSENGER_H
#include <linux/bvec.h>
+#include <linux/crypto.h>
#include <linux/kref.h>
#include <linux/mutex.h>
#include <linux/net.h>
@@ -52,9 +53,26 @@ struct ceph_connection_operations {
int (*sign_message) (struct ceph_msg *msg);
int (*check_message_signature) (struct ceph_msg *msg);
+
+ /* msgr2 authentication exchange */
+ int (*get_auth_request)(struct ceph_connection *con,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len);
+ int (*handle_auth_reply_more)(struct ceph_connection *con,
+ void *reply, int reply_len,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len);
+ int (*handle_auth_done)(struct ceph_connection *con,
+ u64 global_id, void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len);
+ int (*handle_auth_bad_method)(struct ceph_connection *con,
+ int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt);
};
-/* use format string %s%d */
+/* use format string %s%lld */
#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
struct ceph_messenger {
@@ -175,9 +193,10 @@ struct ceph_msg_data {
#endif /* CONFIG_BLOCK */
struct ceph_bvec_iter bvec_pos;
struct {
- struct page **pages; /* NOT OWNER. */
+ struct page **pages;
size_t length; /* total # bytes */
unsigned int alignment; /* first page */
+ bool own_pages;
};
struct ceph_pagelist *pagelist;
};
@@ -188,7 +207,6 @@ struct ceph_msg_data_cursor {
struct ceph_msg_data *data; /* current data item */
size_t resid; /* bytes not yet consumed */
- bool last_piece; /* current is last piece */
bool need_crc; /* crc update needed */
union {
#ifdef CONFIG_BLOCK
@@ -234,14 +252,175 @@ struct ceph_msg {
bool more_to_follow;
bool needs_out_seq;
int front_alloc_len;
- unsigned long ack_stamp; /* tx: when we were acked */
struct ceph_msgpool *pool;
};
+/*
+ * connection states
+ */
+#define CEPH_CON_S_CLOSED 1
+#define CEPH_CON_S_PREOPEN 2
+#define CEPH_CON_S_V1_BANNER 3
+#define CEPH_CON_S_V1_CONNECT_MSG 4
+#define CEPH_CON_S_V2_BANNER_PREFIX 5
+#define CEPH_CON_S_V2_BANNER_PAYLOAD 6
+#define CEPH_CON_S_V2_HELLO 7
+#define CEPH_CON_S_V2_AUTH 8
+#define CEPH_CON_S_V2_AUTH_SIGNATURE 9
+#define CEPH_CON_S_V2_SESSION_CONNECT 10
+#define CEPH_CON_S_V2_SESSION_RECONNECT 11
+#define CEPH_CON_S_OPEN 12
+#define CEPH_CON_S_STANDBY 13
+
+/*
+ * ceph_connection flag bits
+ */
+#define CEPH_CON_F_LOSSYTX 0 /* we can close channel or drop
+ messages on errors */
+#define CEPH_CON_F_KEEPALIVE_PENDING 1 /* we need to send a keepalive */
+#define CEPH_CON_F_WRITE_PENDING 2 /* we have data ready to send */
+#define CEPH_CON_F_SOCK_CLOSED 3 /* socket state changed to closed */
+#define CEPH_CON_F_BACKOFF 4 /* need to retry queuing delayed
+ work */
+
/* ceph connection fault delay defaults, for exponential backoff */
-#define BASE_DELAY_INTERVAL (HZ/2)
-#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
+#define BASE_DELAY_INTERVAL (HZ / 4)
+#define MAX_DELAY_INTERVAL (15 * HZ)
+
+struct ceph_connection_v1_info {
+ struct kvec out_kvec[8], /* sending header/footer data */
+ *out_kvec_cur;
+ int out_kvec_left; /* kvec's left in out_kvec */
+ int out_skip; /* skip this many bytes */
+ int out_kvec_bytes; /* total bytes left */
+ bool out_more; /* there is more data after the kvecs */
+ bool out_msg_done;
+
+ struct ceph_auth_handshake *auth;
+ int auth_retry; /* true if we need a newer authorizer */
+
+ /* connection negotiation temps */
+ u8 in_banner[CEPH_BANNER_MAX_LEN];
+ struct ceph_entity_addr actual_peer_addr;
+ struct ceph_entity_addr peer_addr_for_me;
+ struct ceph_msg_connect out_connect;
+ struct ceph_msg_connect_reply in_reply;
+
+ int in_base_pos; /* bytes read */
+
+ /* message in temps */
+ u8 in_tag; /* protocol control byte */
+ struct ceph_msg_header in_hdr;
+ __le64 in_temp_ack; /* for reading an ack */
+
+ /* message out temps */
+ struct ceph_msg_header out_hdr;
+ __le64 out_temp_ack; /* for writing an ack */
+ struct ceph_timespec out_temp_keepalive2; /* for writing keepalive2
+ stamp */
+
+ u32 connect_seq; /* identify the most recent connection
+ attempt for this session */
+ u32 peer_global_seq; /* peer's global seq for this connection */
+};
+
+#define CEPH_CRC_LEN 4
+#define CEPH_GCM_KEY_LEN 16
+#define CEPH_GCM_IV_LEN sizeof(struct ceph_gcm_nonce)
+#define CEPH_GCM_BLOCK_LEN 16
+#define CEPH_GCM_TAG_LEN 16
+
+#define CEPH_PREAMBLE_LEN 32
+#define CEPH_PREAMBLE_INLINE_LEN 48
+#define CEPH_PREAMBLE_PLAIN_LEN CEPH_PREAMBLE_LEN
+#define CEPH_PREAMBLE_SECURE_LEN (CEPH_PREAMBLE_LEN + \
+ CEPH_PREAMBLE_INLINE_LEN + \
+ CEPH_GCM_TAG_LEN)
+#define CEPH_EPILOGUE_PLAIN_LEN (1 + 3 * CEPH_CRC_LEN)
+#define CEPH_EPILOGUE_SECURE_LEN (CEPH_GCM_BLOCK_LEN + CEPH_GCM_TAG_LEN)
+
+#define CEPH_FRAME_MAX_SEGMENT_COUNT 4
+
+struct ceph_frame_desc {
+ int fd_tag; /* FRAME_TAG_* */
+ int fd_seg_cnt;
+ int fd_lens[CEPH_FRAME_MAX_SEGMENT_COUNT]; /* logical */
+ int fd_aligns[CEPH_FRAME_MAX_SEGMENT_COUNT];
+};
+
+struct ceph_gcm_nonce {
+ __le32 fixed;
+ __le64 counter __packed;
+};
+
+struct ceph_connection_v2_info {
+ struct iov_iter in_iter;
+ struct kvec in_kvecs[5]; /* recvmsg */
+ struct bio_vec in_bvec; /* recvmsg (in_cursor) */
+ int in_kvec_cnt;
+ int in_state; /* IN_S_* */
+
+ struct iov_iter out_iter;
+ struct kvec out_kvecs[8]; /* sendmsg */
+ struct bio_vec out_bvec; /* sendpage (out_cursor, out_zero),
+ sendmsg (out_enc_pages) */
+ int out_kvec_cnt;
+ int out_state; /* OUT_S_* */
+
+ int out_zero; /* # of zero bytes to send */
+ bool out_iter_sendpage; /* use sendpage if possible */
+
+ struct ceph_frame_desc in_desc;
+ struct ceph_msg_data_cursor in_cursor;
+ struct ceph_msg_data_cursor out_cursor;
+
+ struct crypto_shash *hmac_tfm; /* post-auth signature */
+ struct crypto_aead *gcm_tfm; /* on-wire encryption */
+ struct aead_request *gcm_req;
+ struct crypto_wait gcm_wait;
+ struct ceph_gcm_nonce in_gcm_nonce;
+ struct ceph_gcm_nonce out_gcm_nonce;
+
+ struct page **in_enc_pages;
+ int in_enc_page_cnt;
+ int in_enc_resid;
+ int in_enc_i;
+ struct page **out_enc_pages;
+ int out_enc_page_cnt;
+ int out_enc_resid;
+ int out_enc_i;
+
+ int con_mode; /* CEPH_CON_MODE_* */
+
+ void *conn_bufs[16];
+ int conn_buf_cnt;
+
+ struct kvec in_sign_kvecs[8];
+ struct kvec out_sign_kvecs[8];
+ int in_sign_kvec_cnt;
+ int out_sign_kvec_cnt;
+
+ u64 client_cookie;
+ u64 server_cookie;
+ u64 global_seq;
+ u64 connect_seq;
+ u64 peer_global_seq;
+
+ u8 in_buf[CEPH_PREAMBLE_SECURE_LEN];
+ u8 out_buf[CEPH_PREAMBLE_SECURE_LEN];
+ struct {
+ u8 late_status; /* FRAME_LATE_STATUS_* */
+ union {
+ struct {
+ u32 front_crc;
+ u32 middle_crc;
+ u32 data_crc;
+ } __packed;
+ u8 pad[CEPH_GCM_BLOCK_LEN - 1];
+ };
+ } out_epil;
+};
/*
* A single connection with another host.
@@ -257,24 +436,16 @@ struct ceph_connection {
struct ceph_messenger *msgr;
+ int state; /* CEPH_CON_S_* */
atomic_t sock_state;
struct socket *sock;
- struct ceph_entity_addr peer_addr; /* peer address */
- struct ceph_entity_addr peer_addr_for_me;
- unsigned long flags;
- unsigned long state;
+ unsigned long flags; /* CEPH_CON_F_* */
const char *error_msg; /* error message, if any */
struct ceph_entity_name peer_name; /* peer name */
-
+ struct ceph_entity_addr peer_addr; /* peer address */
u64 peer_features;
- u32 connect_seq; /* identify the most recent connection
- attempt for this connection, client */
- u32 peer_global_seq; /* peer's global seq for this connection */
-
- struct ceph_auth_handshake *auth;
- int auth_retry; /* true if we need a newer authorizer */
struct mutex mutex;
@@ -285,50 +456,86 @@ struct ceph_connection {
u64 in_seq, in_seq_acked; /* last message received, acked */
- /* connection negotiation temps */
- char in_banner[CEPH_BANNER_MAX_LEN];
- struct ceph_msg_connect out_connect;
- struct ceph_msg_connect_reply in_reply;
- struct ceph_entity_addr actual_peer_addr;
-
- /* message out temps */
- struct ceph_msg_header out_hdr;
+ struct ceph_msg *in_msg;
struct ceph_msg *out_msg; /* sending message (== tail of
out_sent) */
- bool out_msg_done;
-
- struct kvec out_kvec[8], /* sending header/footer data */
- *out_kvec_cur;
- int out_kvec_left; /* kvec's left in out_kvec */
- int out_skip; /* skip this many bytes */
- int out_kvec_bytes; /* total bytes left */
- int out_more; /* there is more data after the kvecs */
- __le64 out_temp_ack; /* for writing an ack */
- struct ceph_timespec out_temp_keepalive2; /* for writing keepalive2
- stamp */
- /* message in temps */
- struct ceph_msg_header in_hdr;
- struct ceph_msg *in_msg;
+ struct page *bounce_page;
u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
- char in_tag; /* protocol control byte */
- int in_base_pos; /* bytes read */
- __le64 in_temp_ack; /* for reading an ack */
-
struct timespec64 last_keepalive_ack; /* keepalive2 ack stamp */
struct delayed_work work; /* send|recv work */
unsigned long delay; /* current delay interval */
+
+ union {
+ struct ceph_connection_v1_info v1;
+ struct ceph_connection_v2_info v2;
+ };
};
+extern struct page *ceph_zero_page;
+
+void ceph_con_flag_clear(struct ceph_connection *con, unsigned long con_flag);
+void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag);
+bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag);
+bool ceph_con_flag_test_and_clear(struct ceph_connection *con,
+ unsigned long con_flag);
+bool ceph_con_flag_test_and_set(struct ceph_connection *con,
+ unsigned long con_flag);
+
+void ceph_encode_my_addr(struct ceph_messenger *msgr);
+
+int ceph_tcp_connect(struct ceph_connection *con);
+int ceph_con_close_socket(struct ceph_connection *con);
+void ceph_con_reset_session(struct ceph_connection *con);
+
+u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt);
+void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq);
+void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq);
+
+void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor,
+ struct ceph_msg *msg, size_t length);
+struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
+ size_t *page_offset, size_t *length);
+void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes);
+
+u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset,
+ unsigned int length);
+
+bool ceph_addr_is_blank(const struct ceph_entity_addr *addr);
+int ceph_addr_port(const struct ceph_entity_addr *addr);
+void ceph_addr_set_port(struct ceph_entity_addr *addr, int p);
+
+void ceph_con_process_message(struct ceph_connection *con);
+int ceph_con_in_msg_alloc(struct ceph_connection *con,
+ struct ceph_msg_header *hdr, int *skip);
+void ceph_con_get_out_msg(struct ceph_connection *con);
+
+/* messenger_v1.c */
+int ceph_con_v1_try_read(struct ceph_connection *con);
+int ceph_con_v1_try_write(struct ceph_connection *con);
+void ceph_con_v1_revoke(struct ceph_connection *con);
+void ceph_con_v1_revoke_incoming(struct ceph_connection *con);
+bool ceph_con_v1_opened(struct ceph_connection *con);
+void ceph_con_v1_reset_session(struct ceph_connection *con);
+void ceph_con_v1_reset_protocol(struct ceph_connection *con);
+
+/* messenger_v2.c */
+int ceph_con_v2_try_read(struct ceph_connection *con);
+int ceph_con_v2_try_write(struct ceph_connection *con);
+void ceph_con_v2_revoke(struct ceph_connection *con);
+void ceph_con_v2_revoke_incoming(struct ceph_connection *con);
+bool ceph_con_v2_opened(struct ceph_connection *con);
+void ceph_con_v2_reset_session(struct ceph_connection *con);
+void ceph_con_v2_reset_protocol(struct ceph_connection *con);
+
extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr);
extern int ceph_parse_ips(const char *c, const char *end,
struct ceph_entity_addr *addr,
- int max_count, int *count);
-
+ int max_count, int *count, char delim);
extern int ceph_msgr_init(void);
extern void ceph_msgr_exit(void);
@@ -356,8 +563,8 @@ extern void ceph_con_keepalive(struct ceph_connection *con);
extern bool ceph_con_keepalive_expired(struct ceph_connection *con,
unsigned long interval);
-extern void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
- size_t length, size_t alignment);
+void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
+ size_t length, size_t alignment, bool own_pages);
extern void ceph_msg_data_add_pagelist(struct ceph_msg *msg,
struct ceph_pagelist *pagelist);
#ifdef CONFIG_BLOCK
diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index dbb8a6959a73..b658961156a0 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -19,7 +19,7 @@ struct ceph_monmap {
struct ceph_fsid fsid;
u32 epoch;
u32 num_mon;
- struct ceph_entity_inst mon_inst[0];
+ struct ceph_entity_inst mon_inst[];
};
struct ceph_mon_client;
@@ -142,7 +142,7 @@ int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
ceph_monc_callback_t cb, u64 private_data);
-int ceph_monc_blacklist_add(struct ceph_mon_client *monc,
+int ceph_monc_blocklist_add(struct ceph_mon_client *monc,
struct ceph_entity_addr *client_addr);
extern int ceph_monc_open_session(struct ceph_mon_client *monc);
diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h
index 9e50aede46c8..3989dcb94d3d 100644
--- a/include/linux/ceph/msgr.h
+++ b/include/linux/ceph/msgr.h
@@ -9,24 +9,45 @@
#define CEPH_MON_PORT 6789 /* default monitor port */
/*
- * client-side processes will try to bind to ports in this
- * range, simply for the benefit of tools like nmap or wireshark
- * that would like to identify the protocol.
- */
-#define CEPH_PORT_FIRST 6789
-#define CEPH_PORT_START 6800 /* non-monitors start here */
-#define CEPH_PORT_LAST 6900
-
-/*
* tcp connection banner. include a protocol version. and adjust
* whenever the wire protocol changes. try to keep this string length
* constant.
*/
#define CEPH_BANNER "ceph v027"
+#define CEPH_BANNER_LEN 9
#define CEPH_BANNER_MAX_LEN 30
/*
+ * messenger V2 connection banner prefix.
+ * The full banner string should have the form: "ceph v2\n<le16>"
+ * the 2 bytes are the length of the remaining banner.
+ */
+#define CEPH_BANNER_V2 "ceph v2\n"
+#define CEPH_BANNER_V2_LEN 8
+#define CEPH_BANNER_V2_PREFIX_LEN (CEPH_BANNER_V2_LEN + sizeof(__le16))
+
+/*
+ * messenger V2 features
+ */
+#define CEPH_MSGR2_INCARNATION_1 (0ull)
+
+#define DEFINE_MSGR2_FEATURE(bit, incarnation, name) \
+ static const uint64_t __maybe_unused CEPH_MSGR2_FEATURE_##name = (1ULL << bit); \
+ static const uint64_t __maybe_unused CEPH_MSGR2_FEATUREMASK_##name = \
+ (1ULL << bit | CEPH_MSGR2_INCARNATION_##incarnation);
+
+#define HAVE_MSGR2_FEATURE(x, name) \
+ (((x) & (CEPH_MSGR2_FEATUREMASK_##name)) == (CEPH_MSGR2_FEATUREMASK_##name))
+
+DEFINE_MSGR2_FEATURE( 0, 1, REVISION_1) // msgr2.1
+
+#define CEPH_MSGR2_SUPPORTED_FEATURES (CEPH_MSGR2_FEATURE_REVISION_1)
+
+#define CEPH_MSGR2_REQUIRED_FEATURES (CEPH_MSGR2_FEATURE_REVISION_1)
+
+
+/*
* Rollover-safe type and comparator for 32-bit sequence numbers.
* Comparator returns -1, 0, or 1.
*/
@@ -61,11 +82,18 @@ extern const char *ceph_entity_type_name(int type);
* entity_addr -- network address
*/
struct ceph_entity_addr {
- __le32 type;
+ __le32 type; /* CEPH_ENTITY_ADDR_TYPE_* */
__le32 nonce; /* unique id for process (e.g. pid) */
struct sockaddr_storage in_addr;
} __attribute__ ((packed));
+static inline bool ceph_addr_equal_no_type(const struct ceph_entity_addr *lhs,
+ const struct ceph_entity_addr *rhs)
+{
+ return !memcmp(&lhs->in_addr, &rhs->in_addr, sizeof(lhs->in_addr)) &&
+ lhs->nonce == rhs->nonce;
+}
+
struct ceph_entity_inst {
struct ceph_entity_name name;
struct ceph_entity_addr addr;
@@ -160,6 +188,24 @@ struct ceph_msg_header {
__le32 crc; /* header crc32c */
} __attribute__ ((packed));
+struct ceph_msg_header2 {
+ __le64 seq; /* message seq# for this session */
+ __le64 tid; /* transaction id */
+ __le16 type; /* message type */
+ __le16 priority; /* priority. higher value == higher priority */
+ __le16 version; /* version of message encoding */
+
+ __le32 data_pre_padding_len;
+ __le16 data_off; /* sender: include full offset;
+ receiver: mask against ~PAGE_MASK */
+
+ __le64 ack_seq;
+ __u8 flags;
+ /* oldest code we think can decode this. unknown if zero. */
+ __le16 compat_version;
+ __le16 reserved;
+} __attribute__ ((packed));
+
#define CEPH_MSG_PRIO_LOW 64
#define CEPH_MSG_PRIO_DEFAULT 127
#define CEPH_MSG_PRIO_HIGH 196
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 5a62dbd3f4c2..fb6be72104df 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -8,6 +8,7 @@
#include <linux/mempool.h>
#include <linux/rbtree.h>
#include <linux/refcount.h>
+#include <linux/ktime.h>
#include <linux/ceph/types.h>
#include <linux/ceph/osdmap.h>
@@ -135,6 +136,7 @@ struct ceph_osd_req_op {
struct {
u64 expected_object_size;
u64 expected_write_size;
+ u32 flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */
} alloc_hint;
struct {
u64 snapid;
@@ -164,6 +166,7 @@ struct ceph_osd_request_target {
bool recovery_deletes;
unsigned int flags; /* CEPH_OSD_FLAG_* */
+ bool used_replica;
bool paused;
u32 epoch;
@@ -213,6 +216,8 @@ struct ceph_osd_request {
/* internal */
unsigned long r_stamp; /* jiffies, send or check time */
unsigned long r_start_stamp; /* jiffies */
+ ktime_t r_start_latency; /* ktime_t */
+ ktime_t r_end_latency; /* ktime_t */
int r_attempts;
u32 r_map_dne_bound;
@@ -282,6 +287,9 @@ struct ceph_osd_linger_request {
rados_watcherrcb_t errcb;
void *data;
+ struct ceph_pagelist *request_pl;
+ struct page **notify_id_pages;
+
struct page ***preply_pages;
size_t *preply_len;
};
@@ -399,7 +407,7 @@ void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc);
&__oreq->r_ops[__whch].typ.fld; \
})
-extern void osd_req_op_init(struct ceph_osd_request *osd_req,
+struct ceph_osd_req_op *osd_req_op_init(struct ceph_osd_request *osd_req,
unsigned int which, u16 opcode, u32 flags);
extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *,
@@ -468,7 +476,16 @@ extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int
extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
unsigned int which,
u64 expected_object_size,
- u64 expected_write_size);
+ u64 expected_write_size,
+ u32 flags);
+extern int osd_req_op_copy_from_init(struct ceph_osd_request *req,
+ u64 src_snapid, u64 src_version,
+ struct ceph_object_id *src_oid,
+ struct ceph_object_locator *src_oloc,
+ u32 src_fadvise_flags,
+ u32 dst_fadvise_flags,
+ u32 truncate_seq, u64 truncate_size,
+ u8 copy_from_flags);
extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
struct ceph_snap_context *snapc,
@@ -490,9 +507,8 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
extern void ceph_osdc_get_request(struct ceph_osd_request *req);
extern void ceph_osdc_put_request(struct ceph_osd_request *req);
-extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req,
- bool nofail);
+void ceph_osdc_start_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req);
extern void ceph_osdc_cancel_request(struct ceph_osd_request *req);
extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
struct ceph_osd_request *req);
@@ -509,34 +525,6 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
struct page *req_page, size_t req_len,
struct page **resp_pages, size_t *resp_len);
-extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
- struct ceph_vino vino,
- struct ceph_file_layout *layout,
- u64 off, u64 *plen,
- u32 truncate_seq, u64 truncate_size,
- struct page **pages, int nr_pages,
- int page_align);
-
-extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
- struct ceph_vino vino,
- struct ceph_file_layout *layout,
- struct ceph_snap_context *sc,
- u64 off, u64 len,
- u32 truncate_seq, u64 truncate_size,
- struct timespec64 *mtime,
- struct page **pages, int nr_pages);
-
-int ceph_osdc_copy_from(struct ceph_osd_client *osdc,
- u64 src_snapid, u64 src_version,
- struct ceph_object_id *src_oid,
- struct ceph_object_locator *src_oloc,
- u32 src_fadvise_flags,
- struct ceph_object_id *dst_oid,
- struct ceph_object_locator *dst_oloc,
- u32 dst_fadvise_flags,
- u32 truncate_seq, u64 truncate_size,
- u8 copy_from_flags);
-
/* watch/notify */
struct ceph_osd_linger_request *
ceph_osdc_watch(struct ceph_osd_client *osdc,
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index e081b56f1c1d..5553019c3f07 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -37,6 +37,9 @@ int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs);
#define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id
together */
#define CEPH_POOL_FLAG_FULL (1ULL << 1) /* pool is full */
+#define CEPH_POOL_FLAG_FULL_QUOTA (1ULL << 10) /* pool ran out of quota,
+ will set FULL too */
+#define CEPH_POOL_FLAG_NEARFULL (1ULL << 11) /* pool is nearfull */
struct ceph_pg_pool_info {
struct rb_node node;
@@ -134,6 +137,17 @@ int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
const char *fmt, ...);
void ceph_oid_destroy(struct ceph_object_id *oid);
+struct workspace_manager {
+ struct list_head idle_ws;
+ spinlock_t ws_lock;
+ /* Number of free workspaces */
+ int free_ws;
+ /* Total number of allocated workspaces */
+ atomic_t total_ws;
+ /* Waiters for a free workspace */
+ wait_queue_head_t ws_wait;
+};
+
struct ceph_pg_mapping {
struct rb_node node;
struct ceph_pg pgid;
@@ -181,8 +195,7 @@ struct ceph_osdmap {
* the list of osds that store+replicate them. */
struct crush_map *crush;
- struct mutex crush_workspace_mutex;
- void *crush_workspace;
+ struct workspace_manager crush_wsm;
};
static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
@@ -238,8 +251,8 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
}
struct ceph_osdmap *ceph_osdmap_alloc(void);
-extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end);
-struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2);
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2,
struct ceph_osdmap *map);
extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
@@ -299,10 +312,28 @@ bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
const struct ceph_pg *raw_pgid);
+struct crush_loc {
+ char *cl_type_name;
+ char *cl_name;
+};
+
+struct crush_loc_node {
+ struct rb_node cl_node;
+ struct crush_loc cl_loc; /* pointers into cl_data */
+ char cl_data[];
+};
+
+int ceph_parse_crush_location(char *crush_location, struct rb_root *locs);
+int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2);
+void ceph_clear_crush_locs(struct rb_root *locs);
+
+int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id,
+ struct rb_root *locs);
+
extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
u64 id);
-
extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id);
extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
+u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id);
#endif
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 59bdfd470100..43a7a1573b51 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -143,8 +143,10 @@ extern const char *ceph_osd_state_name(int s);
/*
* osd map flag bits
*/
-#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
-#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
+#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC),
+ not set since ~luminous */
+#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC),
+ not set since ~luminous */
#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
@@ -422,7 +424,7 @@ enum {
};
#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
-#define EBLACKLISTED ESHUTDOWN /* blacklisted */
+#define EBLOCKLISTED ESHUTDOWN /* blocklisted */
/* xattr comparison */
enum {
@@ -463,6 +465,19 @@ enum {
const char *ceph_osd_watch_op_name(int o);
enum {
+ CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1,
+ CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE = 2,
+ CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4,
+ CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ = 8,
+ CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY = 16,
+ CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE = 32,
+ CEPH_OSD_ALLOC_HINT_FLAG_SHORTLIVED = 64,
+ CEPH_OSD_ALLOC_HINT_FLAG_LONGLIVED = 128,
+ CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE = 256,
+ CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512,
+};
+
+enum {
CEPH_OSD_BACKOFF_OP_BLOCK = 1,
CEPH_OSD_BACKOFF_OP_ACK_BLOCK = 2,
CEPH_OSD_BACKOFF_OP_UNBLOCK = 3,
@@ -515,6 +530,7 @@ struct ceph_osd_op {
struct {
__le64 expected_object_size;
__le64 expected_write_size;
+ __le32 flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */
} __attribute__ ((packed)) alloc_hint;
struct {
__le64 snapid;