diff options
Diffstat (limited to 'include/linux/ceph')
-rw-r--r-- | include/linux/ceph/auth.h | 72 | ||||
-rw-r--r-- | include/linux/ceph/ceph_features.h | 21 | ||||
-rw-r--r-- | include/linux/ceph/ceph_fs.h | 79 | ||||
-rw-r--r-- | include/linux/ceph/debugfs.h | 14 | ||||
-rw-r--r-- | include/linux/ceph/decode.h | 8 | ||||
-rw-r--r-- | include/linux/ceph/libceph.h | 37 | ||||
-rw-r--r-- | include/linux/ceph/mdsmap.h | 3 | ||||
-rw-r--r-- | include/linux/ceph/messenger.h | 301 | ||||
-rw-r--r-- | include/linux/ceph/mon_client.h | 4 | ||||
-rw-r--r-- | include/linux/ceph/msgr.h | 66 | ||||
-rw-r--r-- | include/linux/ceph/osd_client.h | 54 | ||||
-rw-r--r-- | include/linux/ceph/osdmap.h | 41 | ||||
-rw-r--r-- | include/linux/ceph/rados.h | 22 |
13 files changed, 565 insertions, 157 deletions
diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h index 6728c2ee0205..6b138fa97db8 100644 --- a/include/linux/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -32,8 +32,6 @@ struct ceph_auth_handshake { }; struct ceph_auth_client_ops { - const char *name; - /* * true if we are authenticated and can connect to * services. @@ -52,8 +50,10 @@ struct ceph_auth_client_ops { * another request. */ int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end); - int (*handle_reply)(struct ceph_auth_client *ac, int result, - void *buf, void *end); + int (*handle_reply)(struct ceph_auth_client *ac, u64 global_id, + void *buf, void *end, u8 *session_key, + int *session_key_len, u8 *con_secret, + int *con_secret_len); /* * Create authorizer for connecting to a service, and verify @@ -69,7 +69,10 @@ struct ceph_auth_client_ops { void *challenge_buf, int challenge_buf_len); int (*verify_authorizer_reply)(struct ceph_auth_client *ac, - struct ceph_authorizer *a); + struct ceph_authorizer *a, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len); void (*invalidate_authorizer)(struct ceph_auth_client *ac, int peer_type); @@ -95,11 +98,17 @@ struct ceph_auth_client { const struct ceph_crypto_key *key; /* our secret key */ unsigned want_keys; /* which services we want */ + int preferred_mode; /* CEPH_CON_MODE_* */ + int fallback_mode; /* ditto */ + struct mutex mutex; }; -extern struct ceph_auth_client *ceph_auth_init(const char *name, - const struct ceph_crypto_key *key); +void ceph_auth_set_global_id(struct ceph_auth_client *ac, u64 global_id); + +struct ceph_auth_client *ceph_auth_init(const char *name, + const struct ceph_crypto_key *key, + const int *con_modes); extern void ceph_auth_destroy(struct ceph_auth_client *ac); extern void ceph_auth_reset(struct ceph_auth_client *ac); @@ -113,21 +122,22 @@ int ceph_auth_entity_name_encode(const char *name, void **p, void *end); extern int ceph_build_auth(struct ceph_auth_client *ac, void *msg_buf, size_t msg_len); - extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac); -extern int ceph_auth_create_authorizer(struct ceph_auth_client *ac, - int peer_type, - struct ceph_auth_handshake *auth); + +int __ceph_auth_get_authorizer(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + int peer_type, bool force_new, + int *proto, int *pref_mode, int *fallb_mode); void ceph_auth_destroy_authorizer(struct ceph_authorizer *a); -extern int ceph_auth_update_authorizer(struct ceph_auth_client *ac, - int peer_type, - struct ceph_auth_handshake *a); int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac, struct ceph_authorizer *a, void *challenge_buf, int challenge_buf_len); -extern int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, - struct ceph_authorizer *a); +int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, + struct ceph_authorizer *a, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len); extern void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type); @@ -147,4 +157,34 @@ int ceph_auth_check_message_signature(struct ceph_auth_handshake *auth, return auth->check_message_signature(auth, msg); return 0; } + +int ceph_auth_get_request(struct ceph_auth_client *ac, void *buf, int buf_len); +int ceph_auth_handle_reply_more(struct ceph_auth_client *ac, void *reply, + int reply_len, void *buf, int buf_len); +int ceph_auth_handle_reply_done(struct ceph_auth_client *ac, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len); +bool ceph_auth_handle_bad_method(struct ceph_auth_client *ac, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt); + +int ceph_auth_get_authorizer(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + int peer_type, void *buf, int *buf_len); +int ceph_auth_handle_svc_reply_more(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + void *reply, int reply_len, + void *buf, int *buf_len); +int ceph_auth_handle_svc_reply_done(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len); +bool ceph_auth_handle_bad_authorizer(struct ceph_auth_client *ac, + int peer_type, int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt); + #endif diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 39e6f4c57580..3a47acd9cc14 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -8,17 +8,18 @@ * feature. Base case is 1 (first use). */ #define CEPH_FEATURE_INCARNATION_1 (0ull) -#define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // CEPH_FEATURE_SERVER_JEWEL +#define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // SERVER_JEWEL +#define CEPH_FEATURE_INCARNATION_3 ((1ull<<57)|(1ull<<28)) // SERVER_MIMIC #define DEFINE_CEPH_FEATURE(bit, incarnation, name) \ - static const uint64_t CEPH_FEATURE_##name = (1ULL<<bit); \ - static const uint64_t CEPH_FEATUREMASK_##name = \ + static const uint64_t __maybe_unused CEPH_FEATURE_##name = (1ULL<<bit); \ + static const uint64_t __maybe_unused CEPH_FEATUREMASK_##name = \ (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation); /* this bit is ignored but still advertised by release *when* */ #define DEFINE_CEPH_FEATURE_DEPRECATED(bit, incarnation, name, when) \ - static const uint64_t DEPRECATED_CEPH_FEATURE_##name = (1ULL<<bit); \ - static const uint64_t DEPRECATED_CEPH_FEATUREMASK_##name = \ + static const uint64_t __maybe_unused DEPRECATED_CEPH_FEATURE_##name = (1ULL<<bit); \ + static const uint64_t __maybe_unused DEPRECATED_CEPH_FEATUREMASK_##name = \ (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation); /* @@ -58,7 +59,7 @@ * because 10.2.z (jewel) did not care if its peers advertised this * feature bit. * - * - In the second phase we stop advertising the the bit and call it + * - In the second phase we stop advertising the bit and call it * RETIRED. This can normally be done in the *next* major release * following the one in which we marked the feature DEPRECATED. In * the above example, for 12.0.z (luminous) we can say: @@ -75,7 +76,7 @@ DEFINE_CEPH_FEATURE( 0, 1, UID) DEFINE_CEPH_FEATURE( 1, 1, NOSRCADDR) DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS) - +DEFINE_CEPH_FEATURE( 2, 3, SERVER_NAUTILUS) DEFINE_CEPH_FEATURE( 3, 1, FLOCK) DEFINE_CEPH_FEATURE( 4, 1, SUBSCRIBE2) DEFINE_CEPH_FEATURE( 5, 1, MONNAMES) @@ -114,7 +115,7 @@ DEFINE_CEPH_FEATURE(25, 1, CRUSH_TUNABLES2) DEFINE_CEPH_FEATURE(26, 1, CREATEPOOLID) DEFINE_CEPH_FEATURE(27, 1, REPLY_CREATE_INODE) DEFINE_CEPH_FEATURE_RETIRED(28, 1, OSD_HBMSGS, HAMMER, JEWEL) -DEFINE_CEPH_FEATURE(28, 2, SERVER_M) +DEFINE_CEPH_FEATURE(28, 2, SERVER_MIMIC) DEFINE_CEPH_FEATURE(29, 1, MDSENC) DEFINE_CEPH_FEATURE(30, 1, OSDHASHPSPOOL) DEFINE_CEPH_FEATURE(31, 1, MON_SINGLE_PAXOS) // deprecate me @@ -177,13 +178,16 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin */ #define CEPH_FEATURES_SUPPORTED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ + CEPH_FEATURE_SERVER_NAUTILUS | \ CEPH_FEATURE_FLOCK | \ CEPH_FEATURE_SUBSCRIBE2 | \ + CEPH_FEATURE_MONNAMES | \ CEPH_FEATURE_RECONNECT_SEQ | \ CEPH_FEATURE_DIRLAYOUTHASH | \ CEPH_FEATURE_PGID64 | \ CEPH_FEATURE_PGPOOL3 | \ CEPH_FEATURE_OSDENC | \ + CEPH_FEATURE_MONENC | \ CEPH_FEATURE_CRUSH_TUNABLES | \ CEPH_FEATURE_SERVER_LUMINOUS | \ CEPH_FEATURE_RESEND_ON_SPLIT | \ @@ -193,6 +197,7 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin CEPH_FEATURE_MSG_AUTH | \ CEPH_FEATURE_CRUSH_TUNABLES2 | \ CEPH_FEATURE_REPLY_CREATE_INODE | \ + CEPH_FEATURE_SERVER_MIMIC | \ CEPH_FEATURE_MDSENC | \ CEPH_FEATURE_OSDHASHPSPOOL | \ CEPH_FEATURE_OSD_CACHEPOOL | \ diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index cb21c5cf12c3..49586ff26152 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -28,8 +28,8 @@ #define CEPH_INO_ROOT 1 -#define CEPH_INO_CEPH 2 /* hidden .ceph dir */ -#define CEPH_INO_DOTDOT 3 /* used by ceph fuse for parent (..) */ +#define CEPH_INO_CEPH 2 /* hidden .ceph dir */ +#define CEPH_INO_GLOBAL_SNAPREALM 3 /* global dummy snaprealm */ /* arbitrary limit on max # of monitors (cluster of 3 is typical) */ #define CEPH_MAX_MON 31 @@ -93,8 +93,19 @@ struct ceph_dir_layout { #define CEPH_AUTH_NONE 0x1 #define CEPH_AUTH_CEPHX 0x2 +#define CEPH_AUTH_MODE_NONE 0 +#define CEPH_AUTH_MODE_AUTHORIZER 1 +#define CEPH_AUTH_MODE_MON 10 + +/* msgr2 protocol modes */ +#define CEPH_CON_MODE_UNKNOWN 0x0 +#define CEPH_CON_MODE_CRC 0x1 +#define CEPH_CON_MODE_SECURE 0x2 + #define CEPH_AUTH_UID_DEFAULT ((__u64) -1) +const char *ceph_auth_proto_name(int proto); +const char *ceph_con_mode_name(int mode); /********************************************* * message layer @@ -130,6 +141,7 @@ struct ceph_dir_layout { #define CEPH_MSG_CLIENT_REQUEST 24 #define CEPH_MSG_CLIENT_REQUEST_FORWARD 25 #define CEPH_MSG_CLIENT_REPLY 26 +#define CEPH_MSG_CLIENT_METRICS 29 #define CEPH_MSG_CLIENT_CAPS 0x310 #define CEPH_MSG_CLIENT_LEASE 0x311 #define CEPH_MSG_CLIENT_SNAP 0x312 @@ -287,8 +299,11 @@ enum { CEPH_SESSION_FLUSHMSG_ACK, CEPH_SESSION_FORCE_RO, CEPH_SESSION_REJECT, + CEPH_SESSION_REQUEST_FLUSH_MDLOG, }; +#define CEPH_SESSION_BLOCKLISTED (1 << 0) /* session blocklisted */ + extern const char *ceph_session_op_name(int op); struct ceph_mds_session_head { @@ -313,6 +328,7 @@ enum { CEPH_MDS_OP_LOOKUPPARENT = 0x00103, CEPH_MDS_OP_LOOKUPINO = 0x00104, CEPH_MDS_OP_LOOKUPNAME = 0x00105, + CEPH_MDS_OP_GETVXATTR = 0x00106, CEPH_MDS_OP_SETXATTR = 0x01105, CEPH_MDS_OP_RMXATTR = 0x01106, @@ -417,12 +433,13 @@ union ceph_mds_request_args { __le32 stripe_unit; /* layout for newly created file */ __le32 stripe_count; /* ... */ __le32 object_size; - __le32 file_replication; - __le32 mask; /* CEPH_CAP_* */ - __le32 old_size; + __le32 pool; + __le32 mask; /* CEPH_CAP_* */ + __le64 old_size; } __attribute__ ((packed)) open; struct { __le32 flags; + __le32 osdmap_epoch; /* used for setting file/dir layouts */ } __attribute__ ((packed)) setxattr; struct { struct ceph_file_layout_legacy layout; @@ -444,10 +461,25 @@ union ceph_mds_request_args { } __attribute__ ((packed)) lookupino; } __attribute__ ((packed)); -#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */ -#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */ +union ceph_mds_request_args_ext { + union ceph_mds_request_args old; + struct { + __le32 mode; + __le32 uid; + __le32 gid; + struct ceph_timespec mtime; + struct ceph_timespec atime; + __le64 size, old_size; /* old_size needed by truncate */ + __le32 mask; /* CEPH_SETATTR_* */ + struct ceph_timespec btime; + } __attribute__ ((packed)) setattr_ext; +}; -struct ceph_mds_request_head { +#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */ +#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */ +#define CEPH_MDS_FLAG_ASYNC 4 /* request is asynchronous */ + +struct ceph_mds_request_head_old { __le64 oldest_client_tid; __le32 mdsmap_epoch; /* on client */ __le32 flags; /* CEPH_MDS_FLAG_* */ @@ -460,6 +492,22 @@ struct ceph_mds_request_head { union ceph_mds_request_args args; } __attribute__ ((packed)); +#define CEPH_MDS_REQUEST_HEAD_VERSION 1 + +struct ceph_mds_request_head { + __le16 version; /* struct version */ + __le64 oldest_client_tid; + __le32 mdsmap_epoch; /* on client */ + __le32 flags; /* CEPH_MDS_FLAG_* */ + __u8 num_retry, num_fwd; /* count retry, fwd attempts */ + __le16 num_releases; /* # include cap/lease release records */ + __le32 op; /* mds op code */ + __le32 caller_uid, caller_gid; + __le64 ino; /* use this ino for openc, mkdir, mknod, + etc. (if replaying) */ + union ceph_mds_request_args_ext args; +} __attribute__ ((packed)); + /* cap/lease release record */ struct ceph_mds_request_release { __le64 ino, cap_id; /* ino and unique cap id */ @@ -530,6 +578,9 @@ struct ceph_mds_reply_lease { __le32 seq; } __attribute__ ((packed)); +#define CEPH_LEASE_VALID (1 | 2) /* old and new bit values */ +#define CEPH_LEASE_PRIMARY_LINK 4 /* primary linkage */ + struct ceph_mds_reply_dirfrag { __le32 frag; /* fragment */ __le32 auth; /* auth mds, if this is a delegation point */ @@ -564,6 +615,7 @@ struct ceph_filelock { #define CEPH_FILE_MODE_RDWR 3 /* RD | WR */ #define CEPH_FILE_MODE_LAZY 4 /* lazy io */ #define CEPH_FILE_MODE_BITS 4 +#define CEPH_FILE_MODE_MASK ((1 << CEPH_FILE_MODE_BITS) - 1) int ceph_flags_to_mode(int flags); @@ -655,10 +707,19 @@ int ceph_flags_to_mode(int flags); #define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \ CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \ CEPH_CAP_PIN) +#define CEPH_CAP_ALL_FILE (CEPH_CAP_PIN | CEPH_CAP_ANY_SHARED | \ + CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL | \ + CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR) #define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \ CEPH_LOCK_IXATTR) +/* cap masks async dir operations */ +#define CEPH_CAP_DIR_CREATE CEPH_CAP_FILE_CACHE +#define CEPH_CAP_DIR_UNLINK CEPH_CAP_FILE_RD +#define CEPH_CAP_ANY_DIR_OPS (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD | \ + CEPH_CAP_FILE_WREXTEND | CEPH_CAP_FILE_LAZYIO) + int ceph_caps_for_mode(int mode); enum { @@ -707,7 +768,7 @@ struct ceph_mds_caps { __le32 xattr_len; __le64 xattr_version; - /* filelock */ + /* a union of non-export and export bodies. */ __le64 size, max_size, truncate_size; __le32 truncate_seq; struct ceph_timespec mtime, atime, ctime; diff --git a/include/linux/ceph/debugfs.h b/include/linux/ceph/debugfs.h index cf5e840eec71..8b3a1a7a953a 100644 --- a/include/linux/ceph/debugfs.h +++ b/include/linux/ceph/debugfs.h @@ -2,22 +2,8 @@ #ifndef _FS_CEPH_DEBUGFS_H #define _FS_CEPH_DEBUGFS_H -#include <linux/ceph/ceph_debug.h> #include <linux/ceph/types.h> -#define CEPH_DEFINE_SHOW_FUNC(name) \ -static int name##_open(struct inode *inode, struct file *file) \ -{ \ - return single_open(file, name, inode->i_private); \ -} \ - \ -static const struct file_operations name##_fops = { \ - .open = name##_open, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ -}; - /* debugfs.c */ extern void ceph_debugfs_init(void); extern void ceph_debugfs_cleanup(void); diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index 450384fe487c..04f3ace5787b 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -220,6 +220,8 @@ static inline void ceph_encode_timespec64(struct ceph_timespec *tv, */ #define CEPH_ENTITY_ADDR_TYPE_NONE 0 #define CEPH_ENTITY_ADDR_TYPE_LEGACY __cpu_to_le32(1) +#define CEPH_ENTITY_ADDR_TYPE_MSGR2 __cpu_to_le32(2) +#define CEPH_ENTITY_ADDR_TYPE_ANY __cpu_to_le32(3) static inline void ceph_encode_banner_addr(struct ceph_entity_addr *a) { @@ -239,6 +241,12 @@ static inline void ceph_decode_banner_addr(struct ceph_entity_addr *a) extern int ceph_decode_entity_addr(void **p, void *end, struct ceph_entity_addr *addr); +int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2, + struct ceph_entity_addr *addr); + +int ceph_entity_addr_encoding_len(const struct ceph_entity_addr *addr); +void ceph_encode_entity_addr(void **p, const struct ceph_entity_addr *addr); + /* * encoders */ diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index ec73ebc4827d..00af2c98da75 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -31,11 +31,11 @@ #define CEPH_OPT_FSID (1<<0) #define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */ #define CEPH_OPT_MYIP (1<<2) /* specified my ip */ -#define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */ -#define CEPH_OPT_NOMSGAUTH (1<<4) /* don't require msg signing feat */ -#define CEPH_OPT_TCP_NODELAY (1<<5) /* TCP_NODELAY on TCP sockets */ -#define CEPH_OPT_NOMSGSIGN (1<<6) /* don't sign msgs */ -#define CEPH_OPT_ABORT_ON_FULL (1<<7) /* abort w/ ENOSPC when full */ +#define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes (msgr1) */ +#define CEPH_OPT_TCP_NODELAY (1<<4) /* TCP_NODELAY on TCP sockets */ +#define CEPH_OPT_NOMSGSIGN (1<<5) /* don't sign msgs (msgr1) */ +#define CEPH_OPT_ABORT_ON_FULL (1<<6) /* abort w/ ENOSPC when full */ +#define CEPH_OPT_RXBOUNCE (1<<7) /* double-buffer read data */ #define CEPH_OPT_DEFAULT (CEPH_OPT_TCP_NODELAY) @@ -52,6 +52,8 @@ struct ceph_options { unsigned long osd_idle_ttl; /* jiffies */ unsigned long osd_keepalive_timeout; /* jiffies */ unsigned long osd_request_timeout; /* jiffies */ + u32 read_from_replica; /* CEPH_OSD_FLAG_BALANCE/LOCALIZE_READS */ + int con_modes[2]; /* CEPH_CON_MODE_* */ /* * any type that can't be simply compared or doesn't need @@ -64,6 +66,7 @@ struct ceph_options { int num_mon; char *name; struct ceph_crypto_key *key; + struct rb_root crush_locs; }; /* @@ -73,6 +76,7 @@ struct ceph_options { #define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000) #define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000) #define CEPH_OSD_REQUEST_TIMEOUT_DEFAULT 0 /* no timeout */ +#define CEPH_READ_FROM_REPLICA_DEFAULT 0 /* read from primary */ #define CEPH_MONC_HUNT_INTERVAL msecs_to_jiffies(3 * 1000) #define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000) @@ -80,6 +84,7 @@ struct ceph_options { #define CEPH_MONC_HUNT_BACKOFF 2 #define CEPH_MONC_HUNT_MAX_MULT 10 +#define CEPH_MSG_MAX_CONTROL_LEN (16*1024*1024) #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) #define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024) @@ -101,6 +106,7 @@ enum { CEPH_MOUNT_UNMOUNTING, CEPH_MOUNT_UNMOUNTED, CEPH_MOUNT_SHUTDOWN, + CEPH_MOUNT_RECOVER, }; static inline unsigned long ceph_timeout_jiffies(unsigned long timeout) @@ -147,6 +153,10 @@ struct ceph_client { #define from_msgr(ms) container_of(ms, struct ceph_client, msgr) +static inline bool ceph_msgr2(struct ceph_client *client) +{ + return client->options->con_modes[0] != CEPH_CON_MODE_UNKNOWN; +} /* * snapshots @@ -188,7 +198,7 @@ static inline int calc_pages_for(u64 off, u64 len) #define RB_CMP3WAY(a, b) ((a) < (b) ? -1 : (a) > (b)) #define DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, cmpexp, keyexp, nodefld) \ -static void insert_##name(struct rb_root *root, type *t) \ +static bool __insert_##name(struct rb_root *root, type *t) \ { \ struct rb_node **n = &root->rb_node; \ struct rb_node *parent = NULL; \ @@ -206,11 +216,17 @@ static void insert_##name(struct rb_root *root, type *t) \ else if (cmp > 0) \ n = &(*n)->rb_right; \ else \ - BUG(); \ + return false; \ } \ \ rb_link_node(&t->nodefld, parent, n); \ rb_insert_color(&t->nodefld, root); \ + return true; \ +} \ +static void __maybe_unused insert_##name(struct rb_root *root, type *t) \ +{ \ + if (!__insert_##name(root, t)) \ + BUG(); \ } \ static void erase_##name(struct rb_root *root, type *t) \ { \ @@ -268,23 +284,26 @@ DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) extern struct kmem_cache *ceph_inode_cachep; extern struct kmem_cache *ceph_cap_cachep; +extern struct kmem_cache *ceph_cap_snap_cachep; extern struct kmem_cache *ceph_cap_flush_cachep; extern struct kmem_cache *ceph_dentry_cachep; extern struct kmem_cache *ceph_file_cachep; extern struct kmem_cache *ceph_dir_file_cachep; +extern struct kmem_cache *ceph_mds_request_cachep; +extern mempool_t *ceph_wb_pagevec_pool; /* ceph_common.c */ extern bool libceph_compatible(void *data); extern const char *ceph_msg_type_name(int type); extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); -extern void *ceph_kvmalloc(size_t size, gfp_t flags); +extern int ceph_parse_fsid(const char *str, struct ceph_fsid *fsid); struct fs_parameter; struct fc_log; struct ceph_options *ceph_alloc_options(void); int ceph_parse_mon_ips(const char *buf, size_t len, struct ceph_options *opt, - struct fc_log *l); + struct fc_log *l, char delim); int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, struct fc_log *l); int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h index 35d385296fbb..4c3e0648dc27 100644 --- a/include/linux/ceph/mdsmap.h +++ b/include/linux/ceph/mdsmap.h @@ -25,6 +25,7 @@ struct ceph_mdsmap { u32 m_session_timeout; /* seconds */ u32 m_session_autoclose; /* seconds */ u64 m_max_file_size; + u64 m_max_xattr_size; /* maximum size for xattrs blob */ u32 m_max_mds; /* expected up:active mds number */ u32 m_num_active_mds; /* actual up:active mds number */ u32 possible_max_rank; /* possible max rank index */ @@ -64,7 +65,7 @@ static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w) } extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m); -extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end); +struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2); extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m); extern bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m); diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index c4458dc6a757..99c1726be6ee 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -3,6 +3,7 @@ #define __FS_CEPH_MESSENGER_H #include <linux/bvec.h> +#include <linux/crypto.h> #include <linux/kref.h> #include <linux/mutex.h> #include <linux/net.h> @@ -52,9 +53,26 @@ struct ceph_connection_operations { int (*sign_message) (struct ceph_msg *msg); int (*check_message_signature) (struct ceph_msg *msg); + + /* msgr2 authentication exchange */ + int (*get_auth_request)(struct ceph_connection *con, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len); + int (*handle_auth_reply_more)(struct ceph_connection *con, + void *reply, int reply_len, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len); + int (*handle_auth_done)(struct ceph_connection *con, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len); + int (*handle_auth_bad_method)(struct ceph_connection *con, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt); }; -/* use format string %s%d */ +/* use format string %s%lld */ #define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num) struct ceph_messenger { @@ -175,9 +193,10 @@ struct ceph_msg_data { #endif /* CONFIG_BLOCK */ struct ceph_bvec_iter bvec_pos; struct { - struct page **pages; /* NOT OWNER. */ + struct page **pages; size_t length; /* total # bytes */ unsigned int alignment; /* first page */ + bool own_pages; }; struct ceph_pagelist *pagelist; }; @@ -188,7 +207,6 @@ struct ceph_msg_data_cursor { struct ceph_msg_data *data; /* current data item */ size_t resid; /* bytes not yet consumed */ - bool last_piece; /* current is last piece */ bool need_crc; /* crc update needed */ union { #ifdef CONFIG_BLOCK @@ -234,14 +252,175 @@ struct ceph_msg { bool more_to_follow; bool needs_out_seq; int front_alloc_len; - unsigned long ack_stamp; /* tx: when we were acked */ struct ceph_msgpool *pool; }; +/* + * connection states + */ +#define CEPH_CON_S_CLOSED 1 +#define CEPH_CON_S_PREOPEN 2 +#define CEPH_CON_S_V1_BANNER 3 +#define CEPH_CON_S_V1_CONNECT_MSG 4 +#define CEPH_CON_S_V2_BANNER_PREFIX 5 +#define CEPH_CON_S_V2_BANNER_PAYLOAD 6 +#define CEPH_CON_S_V2_HELLO 7 +#define CEPH_CON_S_V2_AUTH 8 +#define CEPH_CON_S_V2_AUTH_SIGNATURE 9 +#define CEPH_CON_S_V2_SESSION_CONNECT 10 +#define CEPH_CON_S_V2_SESSION_RECONNECT 11 +#define CEPH_CON_S_OPEN 12 +#define CEPH_CON_S_STANDBY 13 + +/* + * ceph_connection flag bits + */ +#define CEPH_CON_F_LOSSYTX 0 /* we can close channel or drop + messages on errors */ +#define CEPH_CON_F_KEEPALIVE_PENDING 1 /* we need to send a keepalive */ +#define CEPH_CON_F_WRITE_PENDING 2 /* we have data ready to send */ +#define CEPH_CON_F_SOCK_CLOSED 3 /* socket state changed to closed */ +#define CEPH_CON_F_BACKOFF 4 /* need to retry queuing delayed + work */ + /* ceph connection fault delay defaults, for exponential backoff */ -#define BASE_DELAY_INTERVAL (HZ/2) -#define MAX_DELAY_INTERVAL (5 * 60 * HZ) +#define BASE_DELAY_INTERVAL (HZ / 4) +#define MAX_DELAY_INTERVAL (15 * HZ) + +struct ceph_connection_v1_info { + struct kvec out_kvec[8], /* sending header/footer data */ + *out_kvec_cur; + int out_kvec_left; /* kvec's left in out_kvec */ + int out_skip; /* skip this many bytes */ + int out_kvec_bytes; /* total bytes left */ + bool out_more; /* there is more data after the kvecs */ + bool out_msg_done; + + struct ceph_auth_handshake *auth; + int auth_retry; /* true if we need a newer authorizer */ + + /* connection negotiation temps */ + u8 in_banner[CEPH_BANNER_MAX_LEN]; + struct ceph_entity_addr actual_peer_addr; + struct ceph_entity_addr peer_addr_for_me; + struct ceph_msg_connect out_connect; + struct ceph_msg_connect_reply in_reply; + + int in_base_pos; /* bytes read */ + + /* message in temps */ + u8 in_tag; /* protocol control byte */ + struct ceph_msg_header in_hdr; + __le64 in_temp_ack; /* for reading an ack */ + + /* message out temps */ + struct ceph_msg_header out_hdr; + __le64 out_temp_ack; /* for writing an ack */ + struct ceph_timespec out_temp_keepalive2; /* for writing keepalive2 + stamp */ + + u32 connect_seq; /* identify the most recent connection + attempt for this session */ + u32 peer_global_seq; /* peer's global seq for this connection */ +}; + +#define CEPH_CRC_LEN 4 +#define CEPH_GCM_KEY_LEN 16 +#define CEPH_GCM_IV_LEN sizeof(struct ceph_gcm_nonce) +#define CEPH_GCM_BLOCK_LEN 16 +#define CEPH_GCM_TAG_LEN 16 + +#define CEPH_PREAMBLE_LEN 32 +#define CEPH_PREAMBLE_INLINE_LEN 48 +#define CEPH_PREAMBLE_PLAIN_LEN CEPH_PREAMBLE_LEN +#define CEPH_PREAMBLE_SECURE_LEN (CEPH_PREAMBLE_LEN + \ + CEPH_PREAMBLE_INLINE_LEN + \ + CEPH_GCM_TAG_LEN) +#define CEPH_EPILOGUE_PLAIN_LEN (1 + 3 * CEPH_CRC_LEN) +#define CEPH_EPILOGUE_SECURE_LEN (CEPH_GCM_BLOCK_LEN + CEPH_GCM_TAG_LEN) + +#define CEPH_FRAME_MAX_SEGMENT_COUNT 4 + +struct ceph_frame_desc { + int fd_tag; /* FRAME_TAG_* */ + int fd_seg_cnt; + int fd_lens[CEPH_FRAME_MAX_SEGMENT_COUNT]; /* logical */ + int fd_aligns[CEPH_FRAME_MAX_SEGMENT_COUNT]; +}; + +struct ceph_gcm_nonce { + __le32 fixed; + __le64 counter __packed; +}; + +struct ceph_connection_v2_info { + struct iov_iter in_iter; + struct kvec in_kvecs[5]; /* recvmsg */ + struct bio_vec in_bvec; /* recvmsg (in_cursor) */ + int in_kvec_cnt; + int in_state; /* IN_S_* */ + + struct iov_iter out_iter; + struct kvec out_kvecs[8]; /* sendmsg */ + struct bio_vec out_bvec; /* sendpage (out_cursor, out_zero), + sendmsg (out_enc_pages) */ + int out_kvec_cnt; + int out_state; /* OUT_S_* */ + + int out_zero; /* # of zero bytes to send */ + bool out_iter_sendpage; /* use sendpage if possible */ + + struct ceph_frame_desc in_desc; + struct ceph_msg_data_cursor in_cursor; + struct ceph_msg_data_cursor out_cursor; + + struct crypto_shash *hmac_tfm; /* post-auth signature */ + struct crypto_aead *gcm_tfm; /* on-wire encryption */ + struct aead_request *gcm_req; + struct crypto_wait gcm_wait; + struct ceph_gcm_nonce in_gcm_nonce; + struct ceph_gcm_nonce out_gcm_nonce; + + struct page **in_enc_pages; + int in_enc_page_cnt; + int in_enc_resid; + int in_enc_i; + struct page **out_enc_pages; + int out_enc_page_cnt; + int out_enc_resid; + int out_enc_i; + + int con_mode; /* CEPH_CON_MODE_* */ + + void *conn_bufs[16]; + int conn_buf_cnt; + + struct kvec in_sign_kvecs[8]; + struct kvec out_sign_kvecs[8]; + int in_sign_kvec_cnt; + int out_sign_kvec_cnt; + + u64 client_cookie; + u64 server_cookie; + u64 global_seq; + u64 connect_seq; + u64 peer_global_seq; + + u8 in_buf[CEPH_PREAMBLE_SECURE_LEN]; + u8 out_buf[CEPH_PREAMBLE_SECURE_LEN]; + struct { + u8 late_status; /* FRAME_LATE_STATUS_* */ + union { + struct { + u32 front_crc; + u32 middle_crc; + u32 data_crc; + } __packed; + u8 pad[CEPH_GCM_BLOCK_LEN - 1]; + }; + } out_epil; +}; /* * A single connection with another host. @@ -257,24 +436,16 @@ struct ceph_connection { struct ceph_messenger *msgr; + int state; /* CEPH_CON_S_* */ atomic_t sock_state; struct socket *sock; - struct ceph_entity_addr peer_addr; /* peer address */ - struct ceph_entity_addr peer_addr_for_me; - unsigned long flags; - unsigned long state; + unsigned long flags; /* CEPH_CON_F_* */ const char *error_msg; /* error message, if any */ struct ceph_entity_name peer_name; /* peer name */ - + struct ceph_entity_addr peer_addr; /* peer address */ u64 peer_features; - u32 connect_seq; /* identify the most recent connection - attempt for this connection, client */ - u32 peer_global_seq; /* peer's global seq for this connection */ - - struct ceph_auth_handshake *auth; - int auth_retry; /* true if we need a newer authorizer */ struct mutex mutex; @@ -285,50 +456,86 @@ struct ceph_connection { u64 in_seq, in_seq_acked; /* last message received, acked */ - /* connection negotiation temps */ - char in_banner[CEPH_BANNER_MAX_LEN]; - struct ceph_msg_connect out_connect; - struct ceph_msg_connect_reply in_reply; - struct ceph_entity_addr actual_peer_addr; - - /* message out temps */ - struct ceph_msg_header out_hdr; + struct ceph_msg *in_msg; struct ceph_msg *out_msg; /* sending message (== tail of out_sent) */ - bool out_msg_done; - - struct kvec out_kvec[8], /* sending header/footer data */ - *out_kvec_cur; - int out_kvec_left; /* kvec's left in out_kvec */ - int out_skip; /* skip this many bytes */ - int out_kvec_bytes; /* total bytes left */ - int out_more; /* there is more data after the kvecs */ - __le64 out_temp_ack; /* for writing an ack */ - struct ceph_timespec out_temp_keepalive2; /* for writing keepalive2 - stamp */ - /* message in temps */ - struct ceph_msg_header in_hdr; - struct ceph_msg *in_msg; + struct page *bounce_page; u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */ - char in_tag; /* protocol control byte */ - int in_base_pos; /* bytes read */ - __le64 in_temp_ack; /* for reading an ack */ - struct timespec64 last_keepalive_ack; /* keepalive2 ack stamp */ struct delayed_work work; /* send|recv work */ unsigned long delay; /* current delay interval */ + + union { + struct ceph_connection_v1_info v1; + struct ceph_connection_v2_info v2; + }; }; +extern struct page *ceph_zero_page; + +void ceph_con_flag_clear(struct ceph_connection *con, unsigned long con_flag); +void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag); +bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag); +bool ceph_con_flag_test_and_clear(struct ceph_connection *con, + unsigned long con_flag); +bool ceph_con_flag_test_and_set(struct ceph_connection *con, + unsigned long con_flag); + +void ceph_encode_my_addr(struct ceph_messenger *msgr); + +int ceph_tcp_connect(struct ceph_connection *con); +int ceph_con_close_socket(struct ceph_connection *con); +void ceph_con_reset_session(struct ceph_connection *con); + +u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt); +void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq); +void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq); + +void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, + struct ceph_msg *msg, size_t length); +struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length); +void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes); + +u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset, + unsigned int length); + +bool ceph_addr_is_blank(const struct ceph_entity_addr *addr); +int ceph_addr_port(const struct ceph_entity_addr *addr); +void ceph_addr_set_port(struct ceph_entity_addr *addr, int p); + +void ceph_con_process_message(struct ceph_connection *con); +int ceph_con_in_msg_alloc(struct ceph_connection *con, + struct ceph_msg_header *hdr, int *skip); +void ceph_con_get_out_msg(struct ceph_connection *con); + +/* messenger_v1.c */ +int ceph_con_v1_try_read(struct ceph_connection *con); +int ceph_con_v1_try_write(struct ceph_connection *con); +void ceph_con_v1_revoke(struct ceph_connection *con); +void ceph_con_v1_revoke_incoming(struct ceph_connection *con); +bool ceph_con_v1_opened(struct ceph_connection *con); +void ceph_con_v1_reset_session(struct ceph_connection *con); +void ceph_con_v1_reset_protocol(struct ceph_connection *con); + +/* messenger_v2.c */ +int ceph_con_v2_try_read(struct ceph_connection *con); +int ceph_con_v2_try_write(struct ceph_connection *con); +void ceph_con_v2_revoke(struct ceph_connection *con); +void ceph_con_v2_revoke_incoming(struct ceph_connection *con); +bool ceph_con_v2_opened(struct ceph_connection *con); +void ceph_con_v2_reset_session(struct ceph_connection *con); +void ceph_con_v2_reset_protocol(struct ceph_connection *con); + extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr); extern int ceph_parse_ips(const char *c, const char *end, struct ceph_entity_addr *addr, - int max_count, int *count); - + int max_count, int *count, char delim); extern int ceph_msgr_init(void); extern void ceph_msgr_exit(void); @@ -356,8 +563,8 @@ extern void ceph_con_keepalive(struct ceph_connection *con); extern bool ceph_con_keepalive_expired(struct ceph_connection *con, unsigned long interval); -extern void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, - size_t length, size_t alignment); +void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, + size_t length, size_t alignment, bool own_pages); extern void ceph_msg_data_add_pagelist(struct ceph_msg *msg, struct ceph_pagelist *pagelist); #ifdef CONFIG_BLOCK diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index dbb8a6959a73..b658961156a0 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -19,7 +19,7 @@ struct ceph_monmap { struct ceph_fsid fsid; u32 epoch; u32 num_mon; - struct ceph_entity_inst mon_inst[0]; + struct ceph_entity_inst mon_inst[]; }; struct ceph_mon_client; @@ -142,7 +142,7 @@ int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what, int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what, ceph_monc_callback_t cb, u64 private_data); -int ceph_monc_blacklist_add(struct ceph_mon_client *monc, +int ceph_monc_blocklist_add(struct ceph_mon_client *monc, struct ceph_entity_addr *client_addr); extern int ceph_monc_open_session(struct ceph_mon_client *monc); diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h index 9e50aede46c8..3989dcb94d3d 100644 --- a/include/linux/ceph/msgr.h +++ b/include/linux/ceph/msgr.h @@ -9,24 +9,45 @@ #define CEPH_MON_PORT 6789 /* default monitor port */ /* - * client-side processes will try to bind to ports in this - * range, simply for the benefit of tools like nmap or wireshark - * that would like to identify the protocol. - */ -#define CEPH_PORT_FIRST 6789 -#define CEPH_PORT_START 6800 /* non-monitors start here */ -#define CEPH_PORT_LAST 6900 - -/* * tcp connection banner. include a protocol version. and adjust * whenever the wire protocol changes. try to keep this string length * constant. */ #define CEPH_BANNER "ceph v027" +#define CEPH_BANNER_LEN 9 #define CEPH_BANNER_MAX_LEN 30 /* + * messenger V2 connection banner prefix. + * The full banner string should have the form: "ceph v2\n<le16>" + * the 2 bytes are the length of the remaining banner. + */ +#define CEPH_BANNER_V2 "ceph v2\n" +#define CEPH_BANNER_V2_LEN 8 +#define CEPH_BANNER_V2_PREFIX_LEN (CEPH_BANNER_V2_LEN + sizeof(__le16)) + +/* + * messenger V2 features + */ +#define CEPH_MSGR2_INCARNATION_1 (0ull) + +#define DEFINE_MSGR2_FEATURE(bit, incarnation, name) \ + static const uint64_t __maybe_unused CEPH_MSGR2_FEATURE_##name = (1ULL << bit); \ + static const uint64_t __maybe_unused CEPH_MSGR2_FEATUREMASK_##name = \ + (1ULL << bit | CEPH_MSGR2_INCARNATION_##incarnation); + +#define HAVE_MSGR2_FEATURE(x, name) \ + (((x) & (CEPH_MSGR2_FEATUREMASK_##name)) == (CEPH_MSGR2_FEATUREMASK_##name)) + +DEFINE_MSGR2_FEATURE( 0, 1, REVISION_1) // msgr2.1 + +#define CEPH_MSGR2_SUPPORTED_FEATURES (CEPH_MSGR2_FEATURE_REVISION_1) + +#define CEPH_MSGR2_REQUIRED_FEATURES (CEPH_MSGR2_FEATURE_REVISION_1) + + +/* * Rollover-safe type and comparator for 32-bit sequence numbers. * Comparator returns -1, 0, or 1. */ @@ -61,11 +82,18 @@ extern const char *ceph_entity_type_name(int type); * entity_addr -- network address */ struct ceph_entity_addr { - __le32 type; + __le32 type; /* CEPH_ENTITY_ADDR_TYPE_* */ __le32 nonce; /* unique id for process (e.g. pid) */ struct sockaddr_storage in_addr; } __attribute__ ((packed)); +static inline bool ceph_addr_equal_no_type(const struct ceph_entity_addr *lhs, + const struct ceph_entity_addr *rhs) +{ + return !memcmp(&lhs->in_addr, &rhs->in_addr, sizeof(lhs->in_addr)) && + lhs->nonce == rhs->nonce; +} + struct ceph_entity_inst { struct ceph_entity_name name; struct ceph_entity_addr addr; @@ -160,6 +188,24 @@ struct ceph_msg_header { __le32 crc; /* header crc32c */ } __attribute__ ((packed)); +struct ceph_msg_header2 { + __le64 seq; /* message seq# for this session */ + __le64 tid; /* transaction id */ + __le16 type; /* message type */ + __le16 priority; /* priority. higher value == higher priority */ + __le16 version; /* version of message encoding */ + + __le32 data_pre_padding_len; + __le16 data_off; /* sender: include full offset; + receiver: mask against ~PAGE_MASK */ + + __le64 ack_seq; + __u8 flags; + /* oldest code we think can decode this. unknown if zero. */ + __le16 compat_version; + __le16 reserved; +} __attribute__ ((packed)); + #define CEPH_MSG_PRIO_LOW 64 #define CEPH_MSG_PRIO_DEFAULT 127 #define CEPH_MSG_PRIO_HIGH 196 diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 5a62dbd3f4c2..fb6be72104df 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -8,6 +8,7 @@ #include <linux/mempool.h> #include <linux/rbtree.h> #include <linux/refcount.h> +#include <linux/ktime.h> #include <linux/ceph/types.h> #include <linux/ceph/osdmap.h> @@ -135,6 +136,7 @@ struct ceph_osd_req_op { struct { u64 expected_object_size; u64 expected_write_size; + u32 flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */ } alloc_hint; struct { u64 snapid; @@ -164,6 +166,7 @@ struct ceph_osd_request_target { bool recovery_deletes; unsigned int flags; /* CEPH_OSD_FLAG_* */ + bool used_replica; bool paused; u32 epoch; @@ -213,6 +216,8 @@ struct ceph_osd_request { /* internal */ unsigned long r_stamp; /* jiffies, send or check time */ unsigned long r_start_stamp; /* jiffies */ + ktime_t r_start_latency; /* ktime_t */ + ktime_t r_end_latency; /* ktime_t */ int r_attempts; u32 r_map_dne_bound; @@ -282,6 +287,9 @@ struct ceph_osd_linger_request { rados_watcherrcb_t errcb; void *data; + struct ceph_pagelist *request_pl; + struct page **notify_id_pages; + struct page ***preply_pages; size_t *preply_len; }; @@ -399,7 +407,7 @@ void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc); &__oreq->r_ops[__whch].typ.fld; \ }) -extern void osd_req_op_init(struct ceph_osd_request *osd_req, +struct ceph_osd_req_op *osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u32 flags); extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *, @@ -468,7 +476,16 @@ extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, unsigned int which, u64 expected_object_size, - u64 expected_write_size); + u64 expected_write_size, + u32 flags); +extern int osd_req_op_copy_from_init(struct ceph_osd_request *req, + u64 src_snapid, u64 src_version, + struct ceph_object_id *src_oid, + struct ceph_object_locator *src_oloc, + u32 src_fadvise_flags, + u32 dst_fadvise_flags, + u32 truncate_seq, u64 truncate_size, + u8 copy_from_flags); extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, @@ -490,9 +507,8 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, extern void ceph_osdc_get_request(struct ceph_osd_request *req); extern void ceph_osdc_put_request(struct ceph_osd_request *req); -extern int ceph_osdc_start_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req, - bool nofail); +void ceph_osdc_start_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req); extern void ceph_osdc_cancel_request(struct ceph_osd_request *req); extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req); @@ -509,34 +525,6 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, struct page *req_page, size_t req_len, struct page **resp_pages, size_t *resp_len); -extern int ceph_osdc_readpages(struct ceph_osd_client *osdc, - struct ceph_vino vino, - struct ceph_file_layout *layout, - u64 off, u64 *plen, - u32 truncate_seq, u64 truncate_size, - struct page **pages, int nr_pages, - int page_align); - -extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, - struct ceph_vino vino, - struct ceph_file_layout *layout, - struct ceph_snap_context *sc, - u64 off, u64 len, - u32 truncate_seq, u64 truncate_size, - struct timespec64 *mtime, - struct page **pages, int nr_pages); - -int ceph_osdc_copy_from(struct ceph_osd_client *osdc, - u64 src_snapid, u64 src_version, - struct ceph_object_id *src_oid, - struct ceph_object_locator *src_oloc, - u32 src_fadvise_flags, - struct ceph_object_id *dst_oid, - struct ceph_object_locator *dst_oloc, - u32 dst_fadvise_flags, - u32 truncate_seq, u64 truncate_size, - u8 copy_from_flags); - /* watch/notify */ struct ceph_osd_linger_request * ceph_osdc_watch(struct ceph_osd_client *osdc, diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index e081b56f1c1d..5553019c3f07 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -37,6 +37,9 @@ int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs); #define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id together */ #define CEPH_POOL_FLAG_FULL (1ULL << 1) /* pool is full */ +#define CEPH_POOL_FLAG_FULL_QUOTA (1ULL << 10) /* pool ran out of quota, + will set FULL too */ +#define CEPH_POOL_FLAG_NEARFULL (1ULL << 11) /* pool is nearfull */ struct ceph_pg_pool_info { struct rb_node node; @@ -134,6 +137,17 @@ int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp, const char *fmt, ...); void ceph_oid_destroy(struct ceph_object_id *oid); +struct workspace_manager { + struct list_head idle_ws; + spinlock_t ws_lock; + /* Number of free workspaces */ + int free_ws; + /* Total number of allocated workspaces */ + atomic_t total_ws; + /* Waiters for a free workspace */ + wait_queue_head_t ws_wait; +}; + struct ceph_pg_mapping { struct rb_node node; struct ceph_pg pgid; @@ -181,8 +195,7 @@ struct ceph_osdmap { * the list of osds that store+replicate them. */ struct crush_map *crush; - struct mutex crush_workspace_mutex; - void *crush_workspace; + struct workspace_manager crush_wsm; }; static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd) @@ -238,8 +251,8 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid) } struct ceph_osdmap *ceph_osdmap_alloc(void); -extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end); -struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, +struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2); +struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2, struct ceph_osdmap *map); extern void ceph_osdmap_destroy(struct ceph_osdmap *map); @@ -299,10 +312,28 @@ bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap, int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, const struct ceph_pg *raw_pgid); +struct crush_loc { + char *cl_type_name; + char *cl_name; +}; + +struct crush_loc_node { + struct rb_node cl_node; + struct crush_loc cl_loc; /* pointers into cl_data */ + char cl_data[]; +}; + +int ceph_parse_crush_location(char *crush_location, struct rb_root *locs); +int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2); +void ceph_clear_crush_locs(struct rb_root *locs); + +int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id, + struct rb_root *locs); + extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id); - extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id); extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name); +u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id); #endif diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 59bdfd470100..43a7a1573b51 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -143,8 +143,10 @@ extern const char *ceph_osd_state_name(int s); /* * osd map flag bits */ -#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */ -#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */ +#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC), + not set since ~luminous */ +#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC), + not set since ~luminous */ #define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */ #define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */ #define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */ @@ -422,7 +424,7 @@ enum { }; #define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ -#define EBLACKLISTED ESHUTDOWN /* blacklisted */ +#define EBLOCKLISTED ESHUTDOWN /* blocklisted */ /* xattr comparison */ enum { @@ -463,6 +465,19 @@ enum { const char *ceph_osd_watch_op_name(int o); enum { + CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1, + CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE = 2, + CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4, + CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ = 8, + CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY = 16, + CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE = 32, + CEPH_OSD_ALLOC_HINT_FLAG_SHORTLIVED = 64, + CEPH_OSD_ALLOC_HINT_FLAG_LONGLIVED = 128, + CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE = 256, + CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512, +}; + +enum { CEPH_OSD_BACKOFF_OP_BLOCK = 1, CEPH_OSD_BACKOFF_OP_ACK_BLOCK = 2, CEPH_OSD_BACKOFF_OP_UNBLOCK = 3, @@ -515,6 +530,7 @@ struct ceph_osd_op { struct { __le64 expected_object_size; __le64 expected_write_size; + __le32 flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */ } __attribute__ ((packed)) alloc_hint; struct { __le64 snapid; |