diff options
Diffstat (limited to 'fs')
96 files changed, 2497 insertions, 1346 deletions
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index e34fa20acf61..9a21269b7234 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -260,8 +260,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = rs.bavail; buf->f_files = rs.files; buf->f_ffree = rs.ffree; - buf->f_fsid.val[0] = rs.fsid & 0xFFFFFFFFUL; - buf->f_fsid.val[1] = (rs.fsid >> 32) & 0xFFFFFFFFUL; + buf->f_fsid = u64_to_fsid(rs.fsid); buf->f_namelen = rs.namelen; } if (res != -ENOSYS) diff --git a/fs/Makefile b/fs/Makefile index 7bb2a05fda1f..999d1a23f036 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -14,7 +14,7 @@ obj-y := open.o read_write.o file_table.o super.o \ pnode.o splice.o sync.o utimes.o d_path.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ - kernel_read_file.o + kernel_read_file.o remap_range.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o block_dev.o direct-io.o mpage.o diff --git a/fs/adfs/super.c b/fs/adfs/super.c index d553bb5bc17a..bdbd26e571ed 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -210,8 +210,7 @@ static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = sbi->s_namelen; buf->f_bsize = sb->s_blocksize; buf->f_ffree = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); return 0; } diff --git a/fs/affs/super.c b/fs/affs/super.c index a100cd9950c8..c6c2a513ec92 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -620,8 +620,7 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = AFFS_SB(sb)->s_partition_size - AFFS_SB(sb)->s_reserved; buf->f_bfree = free; buf->f_bavail = free; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); buf->f_namelen = AFFSNAMEMAX; return 0; } diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 2482032021ca..c1ba13d19024 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -963,8 +963,7 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = buf->f_bfree; buf->f_files = 0; /* UNKNOWN */ buf->f_ffree = 0; /* UNKNOWN */ - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); buf->f_namelen = BEFS_NAME_LEN; befs_debug(sb, "<--- %s", __func__); diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index f8ce1368218b..3ac7611ef7ce 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -229,8 +229,7 @@ static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bfree = buf->f_bavail = info->si_freeb; buf->f_files = info->si_lasti + 1 - BFS_ROOT_INO; buf->f_ffree = info->si_freei; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); buf->f_namelen = BFS_NAMELEN; return 0; } diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 3080cda9e824..8bda092e60c5 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -121,7 +121,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object, _debug("reissue read"); ret = bmapping->a_ops->readpage(NULL, backpage); if (ret < 0) - goto unlock_discard; + goto discard; } /* but the page may have been read before the monitor was installed, so @@ -138,6 +138,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object, unlock_discard: unlock_page(backpage); +discard: spin_lock_irq(&object->work_lock); list_del(&monitor->op_link); spin_unlock_irq(&object->work_lock); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 2516304379d3..33ba6f0aa55c 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -104,8 +104,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) le64_to_cpu(*((__le64 *)&monc->monmap->fsid + 1)); mutex_unlock(&monc->mutex); - buf->f_fsid.val[0] = fsid & 0xffffffff; - buf->f_fsid.val[1] = fsid >> 32; + buf->f_fsid = u64_to_fsid(fsid); return 0; } diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 51bae9340842..cd17d0e50f2a 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -10,7 +10,7 @@ cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \ cifs_unicode.o nterr.o cifsencrypt.o \ readdir.o ioctl.o sess.o export.o smb1ops.o winucase.o \ smb2ops.o smb2maperror.o smb2transport.o \ - smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o + smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o cifs-$(CONFIG_CIFS_XATTR) += xattr.o diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index 689162e2e175..3150c19cdc2f 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -530,8 +530,8 @@ decode_negTokenInit(unsigned char *security_blob, int length, return 0; } else if ((cls != ASN1_CTX) || (con != ASN1_CON) || (tag != ASN1_EOC)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 0\n", - cls, con, tag, end, *end); + cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 0\n", + cls, con, tag, end); return 0; } @@ -541,8 +541,8 @@ decode_negTokenInit(unsigned char *security_blob, int length, return 0; } else if ((cls != ASN1_UNI) || (con != ASN1_CON) || (tag != ASN1_SEQ)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 1\n", - cls, con, tag, end, *end); + cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 1\n", + cls, con, tag, end); return 0; } @@ -552,8 +552,8 @@ decode_negTokenInit(unsigned char *security_blob, int length, return 0; } else if ((cls != ASN1_CTX) || (con != ASN1_CON) || (tag != ASN1_EOC)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 0\n", - cls, con, tag, end, *end); + cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 0\n", + cls, con, tag, end); return 0; } @@ -564,8 +564,8 @@ decode_negTokenInit(unsigned char *security_blob, int length, return 0; } else if ((cls != ASN1_UNI) || (con != ASN1_CON) || (tag != ASN1_SEQ)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 1\n", - cls, con, tag, end, *end); + cifs_dbg(FYI, "cls = %d con = %d tag = %d sequence_end = %p exit 1\n", + cls, con, tag, sequence_end); return 0; } diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 498777d859eb..9bd03a231032 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -488,7 +488,13 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, else if (map_chars == SFM_MAP_UNI_RSVD) { bool end_of_string; - if (i == srclen - 1) + /** + * Remap spaces and periods found at the end of every + * component of the path. The special cases of '.' and + * '..' do not need to be dealt with explicitly because + * they are addressed in namei.c:link_path_walk(). + **/ + if ((i == srclen - 1) || (source[i+1] == '\\')) end_of_string = true; else end_of_string = false; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index fcff14ef1c70..23b21e943652 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -338,7 +338,7 @@ invalidate_key: goto out_key_put; } -static int +int sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, struct cifs_fattr *fattr, uint sidtype) { @@ -359,7 +359,8 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, return -EIO; } - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) { + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) || + (cifs_sb_master_tcon(cifs_sb)->posix_extensions)) { uint32_t unix_id; bool is_group; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 0fb99d25e8a8..472cb7777e3e 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -71,6 +71,8 @@ bool enable_oplocks = true; bool linuxExtEnabled = true; bool lookupCacheEnabled = true; bool disable_legacy_dialects; /* false by default */ +bool enable_gcm_256; /* false by default, change when more servers support it */ +bool require_gcm_256; /* false by default */ unsigned int global_secflags = CIFSSEC_DEF; /* unsigned int ntlmv2_support = 0; */ unsigned int sign_CIFS_PDUs = 1; @@ -104,6 +106,12 @@ MODULE_PARM_DESC(slow_rsp_threshold, "Amount of time (in seconds) to wait " module_param(enable_oplocks, bool, 0644); MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1"); +module_param(enable_gcm_256, bool, 0644); +MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. Default: n/N/0"); + +module_param(require_gcm_256, bool, 0644); +MODULE_PARM_DESC(require_gcm_256, "Require strongest (256 bit) GCM encryption. Default: n/N/0"); + module_param(disable_legacy_dialects, bool, 0644); MODULE_PARM_DESC(disable_legacy_dialects, "To improve security it may be " "helpful to restrict the ability to " diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 99b3180c613a..905d03863721 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -156,5 +156,5 @@ extern int cifs_truncate_page(struct address_space *mapping, loff_t from); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.28" +#define CIFS_VERSION "2.29" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index b565d83ba89e..484ec2d8c5c9 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -195,18 +195,6 @@ struct smb_rqst { unsigned int rq_tailsz; /* length of last page */ }; -enum smb_version { - Smb_1 = 1, - Smb_20, - Smb_21, - Smb_30, - Smb_302, - Smb_311, - Smb_3any, - Smb_default, - Smb_version_err -}; - struct mid_q_entry; struct TCP_Server_Info; struct cifsFileInfo; @@ -310,6 +298,10 @@ struct smb_version_operations { /* query file data from the server */ int (*query_file_info)(const unsigned int, struct cifs_tcon *, struct cifs_fid *, FILE_ALL_INFO *); + /* query reparse tag from srv to determine which type of special file */ + int (*query_reparse_tag)(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *path, + __u32 *reparse_tag); /* get server index number */ int (*get_srv_inum)(const unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const char *, @@ -510,6 +502,8 @@ struct smb_version_operations { struct fiemap_extent_info *, u64, u64); /* version specific llseek implementation */ loff_t (*llseek)(struct file *, struct cifs_tcon *, loff_t, int); + /* Check for STATUS_IO_TIMEOUT */ + bool (*is_status_io_timeout)(char *buf); }; struct smb_version_values { @@ -1954,6 +1948,8 @@ extern bool lookupCacheEnabled; extern unsigned int global_secflags; /* if on, session setup sent with more secure ntlmssp2 challenge/resp */ extern unsigned int sign_CIFS_PDUs; /* enable smb packet signing */ +extern bool enable_gcm_256; /* allow optional negotiate of strongest signing (aes-gcm-256) */ +extern bool require_gcm_256; /* require use of strongest signing (aes-gcm-256) */ extern bool linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/ extern unsigned int CIFSMaxBufSize; /* max size not including hdr */ extern unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index bb68cbf81074..24c6f36177ba 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -209,6 +209,8 @@ extern int cifs_set_file_info(struct inode *inode, struct iattr *attrs, extern int cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, const unsigned int xid); +extern int sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, + struct cifs_fattr *fattr, uint sidtype); extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct inode *inode, bool get_mode_from_special_sid, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index a5731dd6e656..c38156f324dd 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -61,6 +61,7 @@ #ifdef CONFIG_CIFS_DFS_UPCALL #include "dfs_cache.h" #endif +#include "fs_context.h" extern mempool_t *cifs_req_poolp; extern bool disable_legacy_dialects; @@ -69,6 +70,9 @@ extern bool disable_legacy_dialects; #define TLINK_ERROR_EXPIRE (1 * HZ) #define TLINK_IDLE_EXPIRE (600 * HZ) +/* Drop the connection to not overload the server */ +#define NUM_STATUS_IO_TIMEOUT 5 + enum { /* Mount options that take no arguments */ Opt_user_xattr, Opt_nouser_xattr, @@ -276,66 +280,6 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_err, NULL } }; -enum { - Opt_sec_krb5, Opt_sec_krb5i, Opt_sec_krb5p, - Opt_sec_ntlmsspi, Opt_sec_ntlmssp, - Opt_ntlm, Opt_sec_ntlmi, Opt_sec_ntlmv2, - Opt_sec_ntlmv2i, Opt_sec_lanman, - Opt_sec_none, - - Opt_sec_err -}; - -static const match_table_t cifs_secflavor_tokens = { - { Opt_sec_krb5, "krb5" }, - { Opt_sec_krb5i, "krb5i" }, - { Opt_sec_krb5p, "krb5p" }, - { Opt_sec_ntlmsspi, "ntlmsspi" }, - { Opt_sec_ntlmssp, "ntlmssp" }, - { Opt_ntlm, "ntlm" }, - { Opt_sec_ntlmi, "ntlmi" }, - { Opt_sec_ntlmv2, "nontlm" }, - { Opt_sec_ntlmv2, "ntlmv2" }, - { Opt_sec_ntlmv2i, "ntlmv2i" }, - { Opt_sec_lanman, "lanman" }, - { Opt_sec_none, "none" }, - - { Opt_sec_err, NULL } -}; - -/* cache flavors */ -enum { - Opt_cache_loose, - Opt_cache_strict, - Opt_cache_none, - Opt_cache_ro, - Opt_cache_rw, - Opt_cache_err -}; - -static const match_table_t cifs_cacheflavor_tokens = { - { Opt_cache_loose, "loose" }, - { Opt_cache_strict, "strict" }, - { Opt_cache_none, "none" }, - { Opt_cache_ro, "ro" }, - { Opt_cache_rw, "singleclient" }, - { Opt_cache_err, NULL } -}; - -static const match_table_t cifs_smb_version_tokens = { - { Smb_1, SMB1_VERSION_STRING }, - { Smb_20, SMB20_VERSION_STRING}, - { Smb_21, SMB21_VERSION_STRING }, - { Smb_30, SMB30_VERSION_STRING }, - { Smb_302, SMB302_VERSION_STRING }, - { Smb_302, ALT_SMB302_VERSION_STRING }, - { Smb_311, SMB311_VERSION_STRING }, - { Smb_311, ALT_SMB311_VERSION_STRING }, - { Smb_3any, SMB3ANY_VERSION_STRING }, - { Smb_default, SMBDEFAULT_VERSION_STRING }, - { Smb_version_err, NULL } -}; - static int ip_connect(struct TCP_Server_Info *server); static int generic_ip_connect(struct TCP_Server_Info *server); static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink); @@ -1117,7 +1061,7 @@ cifs_demultiplex_thread(void *p) struct task_struct *task_to_wake = NULL; struct mid_q_entry *mids[MAX_COMPOUND]; char *bufs[MAX_COMPOUND]; - unsigned int noreclaim_flag; + unsigned int noreclaim_flag, num_io_timeout = 0; noreclaim_flag = memalloc_noreclaim_save(); cifs_dbg(FYI, "Demultiplex PID: %d\n", task_pid_nr(current)); @@ -1213,6 +1157,16 @@ next_pdu: continue; } + if (server->ops->is_status_io_timeout && + server->ops->is_status_io_timeout(buf)) { + num_io_timeout++; + if (num_io_timeout > NUM_STATUS_IO_TIMEOUT) { + cifs_reconnect(server); + num_io_timeout = 0; + continue; + } + } + server->lstrp = jiffies; for (i = 0; i < num_mids; i++) { @@ -1359,177 +1313,6 @@ static int get_option_gid(substring_t args[], kgid_t *result) return 0; } -static int cifs_parse_security_flavors(char *value, - struct smb_vol *vol) -{ - - substring_t args[MAX_OPT_ARGS]; - - /* - * With mount options, the last one should win. Reset any existing - * settings back to default. - */ - vol->sectype = Unspecified; - vol->sign = false; - - switch (match_token(value, cifs_secflavor_tokens, args)) { - case Opt_sec_krb5p: - cifs_dbg(VFS, "sec=krb5p is not supported!\n"); - return 1; - case Opt_sec_krb5i: - vol->sign = true; - fallthrough; - case Opt_sec_krb5: - vol->sectype = Kerberos; - break; - case Opt_sec_ntlmsspi: - vol->sign = true; - fallthrough; - case Opt_sec_ntlmssp: - vol->sectype = RawNTLMSSP; - break; - case Opt_sec_ntlmi: - vol->sign = true; - fallthrough; - case Opt_ntlm: - vol->sectype = NTLM; - break; - case Opt_sec_ntlmv2i: - vol->sign = true; - fallthrough; - case Opt_sec_ntlmv2: - vol->sectype = NTLMv2; - break; -#ifdef CONFIG_CIFS_WEAK_PW_HASH - case Opt_sec_lanman: - vol->sectype = LANMAN; - break; -#endif - case Opt_sec_none: - vol->nullauth = 1; - break; - default: - cifs_dbg(VFS, "bad security option: %s\n", value); - return 1; - } - - return 0; -} - -static int -cifs_parse_cache_flavor(char *value, struct smb_vol *vol) -{ - substring_t args[MAX_OPT_ARGS]; - - switch (match_token(value, cifs_cacheflavor_tokens, args)) { - case Opt_cache_loose: - vol->direct_io = false; - vol->strict_io = false; - vol->cache_ro = false; - vol->cache_rw = false; - break; - case Opt_cache_strict: - vol->direct_io = false; - vol->strict_io = true; - vol->cache_ro = false; - vol->cache_rw = false; - break; - case Opt_cache_none: - vol->direct_io = true; - vol->strict_io = false; - vol->cache_ro = false; - vol->cache_rw = false; - break; - case Opt_cache_ro: - vol->direct_io = false; - vol->strict_io = false; - vol->cache_ro = true; - vol->cache_rw = false; - break; - case Opt_cache_rw: - vol->direct_io = false; - vol->strict_io = false; - vol->cache_ro = false; - vol->cache_rw = true; - break; - default: - cifs_dbg(VFS, "bad cache= option: %s\n", value); - return 1; - } - return 0; -} - -static int -cifs_parse_smb_version(char *value, struct smb_vol *vol, bool is_smb3) -{ - substring_t args[MAX_OPT_ARGS]; - - switch (match_token(value, cifs_smb_version_tokens, args)) { -#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY - case Smb_1: - if (disable_legacy_dialects) { - cifs_dbg(VFS, "mount with legacy dialect disabled\n"); - return 1; - } - if (is_smb3) { - cifs_dbg(VFS, "vers=1.0 (cifs) not permitted when mounting with smb3\n"); - return 1; - } - cifs_dbg(VFS, "Use of the less secure dialect vers=1.0 is not recommended unless required for access to very old servers\n"); - vol->ops = &smb1_operations; - vol->vals = &smb1_values; - break; - case Smb_20: - if (disable_legacy_dialects) { - cifs_dbg(VFS, "mount with legacy dialect disabled\n"); - return 1; - } - if (is_smb3) { - cifs_dbg(VFS, "vers=2.0 not permitted when mounting with smb3\n"); - return 1; - } - vol->ops = &smb20_operations; - vol->vals = &smb20_values; - break; -#else - case Smb_1: - cifs_dbg(VFS, "vers=1.0 (cifs) mount not permitted when legacy dialects disabled\n"); - return 1; - case Smb_20: - cifs_dbg(VFS, "vers=2.0 mount not permitted when legacy dialects disabled\n"); - return 1; -#endif /* CIFS_ALLOW_INSECURE_LEGACY */ - case Smb_21: - vol->ops = &smb21_operations; - vol->vals = &smb21_values; - break; - case Smb_30: - vol->ops = &smb30_operations; - vol->vals = &smb30_values; - break; - case Smb_302: - vol->ops = &smb30_operations; /* currently identical with 3.0 */ - vol->vals = &smb302_values; - break; - case Smb_311: - vol->ops = &smb311_operations; - vol->vals = &smb311_values; - break; - case Smb_3any: - vol->ops = &smb30_operations; /* currently identical with 3.0 */ - vol->vals = &smb3any_values; - break; - case Smb_default: - vol->ops = &smb30_operations; /* currently identical with 3.0 */ - vol->vals = &smbdefault_values; - break; - default: - cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value); - return 1; - } - return 0; -} - /* * Parse a devname into substrings and populate the vol->UNC and vol->prepath * fields with the result. Returns 0 on success and an error otherwise. @@ -3595,7 +3378,10 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) */ tcon->retry = volume_info->retry; tcon->nocase = volume_info->nocase; - tcon->nohandlecache = volume_info->nohandlecache; + if (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING) + tcon->nohandlecache = volume_info->nohandlecache; + else + tcon->nohandlecache = 1; tcon->nodelete = volume_info->nodelete; tcon->local_lease = volume_info->local_lease; INIT_LIST_HEAD(&tcon->pending_opens); @@ -3889,13 +3675,21 @@ generic_ip_connect(struct TCP_Server_Info *server) saddr = (struct sockaddr *) &server->dstaddr; if (server->dstaddr.ss_family == AF_INET6) { - sport = ((struct sockaddr_in6 *) saddr)->sin6_port; + struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)&server->dstaddr; + + sport = ipv6->sin6_port; slen = sizeof(struct sockaddr_in6); sfamily = AF_INET6; + cifs_dbg(FYI, "%s: connecting to [%pI6]:%d\n", __func__, &ipv6->sin6_addr, + ntohs(sport)); } else { - sport = ((struct sockaddr_in *) saddr)->sin_port; + struct sockaddr_in *ipv4 = (struct sockaddr_in *)&server->dstaddr; + + sport = ipv4->sin_port; slen = sizeof(struct sockaddr_in); sfamily = AF_INET; + cifs_dbg(FYI, "%s: connecting to %pI4:%d\n", __func__, &ipv4->sin_addr, + ntohs(sport)); } if (socket == NULL) { diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c new file mode 100644 index 000000000000..ad6c2fed4055 --- /dev/null +++ b/fs/cifs/fs_context.c @@ -0,0 +1,221 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2020, Microsoft Corporation. + * + * Author(s): Steve French <stfrench@microsoft.com> + * David Howells <dhowells@redhat.com> + */ + +#include "cifsglob.h" +#include "cifs_debug.h" +#include "fs_context.h" + +static const match_table_t cifs_smb_version_tokens = { + { Smb_1, SMB1_VERSION_STRING }, + { Smb_20, SMB20_VERSION_STRING}, + { Smb_21, SMB21_VERSION_STRING }, + { Smb_30, SMB30_VERSION_STRING }, + { Smb_302, SMB302_VERSION_STRING }, + { Smb_302, ALT_SMB302_VERSION_STRING }, + { Smb_311, SMB311_VERSION_STRING }, + { Smb_311, ALT_SMB311_VERSION_STRING }, + { Smb_3any, SMB3ANY_VERSION_STRING }, + { Smb_default, SMBDEFAULT_VERSION_STRING }, + { Smb_version_err, NULL } +}; + +int +cifs_parse_smb_version(char *value, struct smb_vol *vol, bool is_smb3) +{ + substring_t args[MAX_OPT_ARGS]; + + switch (match_token(value, cifs_smb_version_tokens, args)) { +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY + case Smb_1: + if (disable_legacy_dialects) { + cifs_dbg(VFS, "mount with legacy dialect disabled\n"); + return 1; + } + if (is_smb3) { + cifs_dbg(VFS, "vers=1.0 (cifs) not permitted when mounting with smb3\n"); + return 1; + } + cifs_dbg(VFS, "Use of the less secure dialect vers=1.0 is not recommended unless required for access to very old servers\n"); + vol->ops = &smb1_operations; + vol->vals = &smb1_values; + break; + case Smb_20: + if (disable_legacy_dialects) { + cifs_dbg(VFS, "mount with legacy dialect disabled\n"); + return 1; + } + if (is_smb3) { + cifs_dbg(VFS, "vers=2.0 not permitted when mounting with smb3\n"); + return 1; + } + vol->ops = &smb20_operations; + vol->vals = &smb20_values; + break; +#else + case Smb_1: + cifs_dbg(VFS, "vers=1.0 (cifs) mount not permitted when legacy dialects disabled\n"); + return 1; + case Smb_20: + cifs_dbg(VFS, "vers=2.0 mount not permitted when legacy dialects disabled\n"); + return 1; +#endif /* CIFS_ALLOW_INSECURE_LEGACY */ + case Smb_21: + vol->ops = &smb21_operations; + vol->vals = &smb21_values; + break; + case Smb_30: + vol->ops = &smb30_operations; + vol->vals = &smb30_values; + break; + case Smb_302: + vol->ops = &smb30_operations; /* currently identical with 3.0 */ + vol->vals = &smb302_values; + break; + case Smb_311: + vol->ops = &smb311_operations; + vol->vals = &smb311_values; + break; + case Smb_3any: + vol->ops = &smb30_operations; /* currently identical with 3.0 */ + vol->vals = &smb3any_values; + break; + case Smb_default: + vol->ops = &smb30_operations; /* currently identical with 3.0 */ + vol->vals = &smbdefault_values; + break; + default: + cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value); + return 1; + } + return 0; +} + +static const match_table_t cifs_secflavor_tokens = { + { Opt_sec_krb5, "krb5" }, + { Opt_sec_krb5i, "krb5i" }, + { Opt_sec_krb5p, "krb5p" }, + { Opt_sec_ntlmsspi, "ntlmsspi" }, + { Opt_sec_ntlmssp, "ntlmssp" }, + { Opt_ntlm, "ntlm" }, + { Opt_sec_ntlmi, "ntlmi" }, + { Opt_sec_ntlmv2, "nontlm" }, + { Opt_sec_ntlmv2, "ntlmv2" }, + { Opt_sec_ntlmv2i, "ntlmv2i" }, + { Opt_sec_lanman, "lanman" }, + { Opt_sec_none, "none" }, + + { Opt_sec_err, NULL } +}; + +int cifs_parse_security_flavors(char *value, struct smb_vol *vol) +{ + + substring_t args[MAX_OPT_ARGS]; + + /* + * With mount options, the last one should win. Reset any existing + * settings back to default. + */ + vol->sectype = Unspecified; + vol->sign = false; + + switch (match_token(value, cifs_secflavor_tokens, args)) { + case Opt_sec_krb5p: + cifs_dbg(VFS, "sec=krb5p is not supported!\n"); + return 1; + case Opt_sec_krb5i: + vol->sign = true; + fallthrough; + case Opt_sec_krb5: + vol->sectype = Kerberos; + break; + case Opt_sec_ntlmsspi: + vol->sign = true; + fallthrough; + case Opt_sec_ntlmssp: + vol->sectype = RawNTLMSSP; + break; + case Opt_sec_ntlmi: + vol->sign = true; + fallthrough; + case Opt_ntlm: + vol->sectype = NTLM; + break; + case Opt_sec_ntlmv2i: + vol->sign = true; + fallthrough; + case Opt_sec_ntlmv2: + vol->sectype = NTLMv2; + break; +#ifdef CONFIG_CIFS_WEAK_PW_HASH + case Opt_sec_lanman: + vol->sectype = LANMAN; + break; +#endif + case Opt_sec_none: + vol->nullauth = 1; + break; + default: + cifs_dbg(VFS, "bad security option: %s\n", value); + return 1; + } + + return 0; +} + +static const match_table_t cifs_cacheflavor_tokens = { + { Opt_cache_loose, "loose" }, + { Opt_cache_strict, "strict" }, + { Opt_cache_none, "none" }, + { Opt_cache_ro, "ro" }, + { Opt_cache_rw, "singleclient" }, + { Opt_cache_err, NULL } +}; + +int +cifs_parse_cache_flavor(char *value, struct smb_vol *vol) +{ + substring_t args[MAX_OPT_ARGS]; + + switch (match_token(value, cifs_cacheflavor_tokens, args)) { + case Opt_cache_loose: + vol->direct_io = false; + vol->strict_io = false; + vol->cache_ro = false; + vol->cache_rw = false; + break; + case Opt_cache_strict: + vol->direct_io = false; + vol->strict_io = true; + vol->cache_ro = false; + vol->cache_rw = false; + break; + case Opt_cache_none: + vol->direct_io = true; + vol->strict_io = false; + vol->cache_ro = false; + vol->cache_rw = false; + break; + case Opt_cache_ro: + vol->direct_io = false; + vol->strict_io = false; + vol->cache_ro = true; + vol->cache_rw = false; + break; + case Opt_cache_rw: + vol->direct_io = false; + vol->strict_io = false; + vol->cache_ro = false; + vol->cache_rw = true; + break; + default: + cifs_dbg(VFS, "bad cache= option: %s\n", value); + return 1; + } + return 0; +} diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h new file mode 100644 index 000000000000..886208a1b0ef --- /dev/null +++ b/fs/cifs/fs_context.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2020, Microsoft Corporation. + * + * Author(s): Steve French <stfrench@microsoft.com> + * David Howells <dhowells@redhat.com> + */ + +#ifndef _FS_CONTEXT_H +#define _FS_CONTEXT_H + +#include <linux/parser.h> +#include "cifsglob.h" + +enum smb_version { + Smb_1 = 1, + Smb_20, + Smb_21, + Smb_30, + Smb_302, + Smb_311, + Smb_3any, + Smb_default, + Smb_version_err +}; + +int cifs_parse_smb_version(char *value, struct smb_vol *vol, bool is_smb3); + +enum { + Opt_cache_loose, + Opt_cache_strict, + Opt_cache_none, + Opt_cache_ro, + Opt_cache_rw, + Opt_cache_err +}; + +int cifs_parse_cache_flavor(char *value, struct smb_vol *vol); + +enum cifs_sec_param { + Opt_sec_krb5, + Opt_sec_krb5i, + Opt_sec_krb5p, + Opt_sec_ntlmsspi, + Opt_sec_ntlmssp, + Opt_ntlm, + Opt_sec_ntlmi, + Opt_sec_ntlmv2, + Opt_sec_ntlmv2i, + Opt_sec_lanman, + Opt_sec_none, + + Opt_sec_err +}; + +int cifs_parse_security_flavors(char *value, struct smb_vol *vol); + +#endif diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 1f75b25e559a..9ee5f304592f 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -656,7 +656,7 @@ smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct smb311_posix_qinfo * static void cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, struct super_block *sb, bool adjust_tz, - bool symlink) + bool symlink, u32 reparse_tag) { struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); @@ -684,8 +684,22 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, fattr->cf_createtime = le64_to_cpu(info->CreationTime); fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks); - - if (symlink) { + if (reparse_tag == IO_REPARSE_TAG_LX_SYMLINK) { + fattr->cf_mode |= S_IFLNK | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_LNK; + } else if (reparse_tag == IO_REPARSE_TAG_LX_FIFO) { + fattr->cf_mode |= S_IFIFO | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_FIFO; + } else if (reparse_tag == IO_REPARSE_TAG_AF_UNIX) { + fattr->cf_mode |= S_IFSOCK | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_SOCK; + } else if (reparse_tag == IO_REPARSE_TAG_LX_CHR) { + fattr->cf_mode |= S_IFCHR | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_CHR; + } else if (reparse_tag == IO_REPARSE_TAG_LX_BLK) { + fattr->cf_mode |= S_IFBLK | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_BLK; + } else if (symlink) { /* TODO add more reparse tag checks */ fattr->cf_mode = S_IFLNK; fattr->cf_dtype = DT_LNK; } else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { @@ -740,8 +754,9 @@ cifs_get_file_info(struct file *filp) rc = server->ops->query_file_info(xid, tcon, &cfile->fid, &find_data); switch (rc) { case 0: + /* TODO: add support to query reparse tag */ cifs_all_info_to_fattr(&fattr, &find_data, inode->i_sb, false, - false); + false, 0 /* no reparse tag */); break; case -EREMOTE: cifs_create_dfs_fattr(&fattr, inode->i_sb); @@ -910,12 +925,13 @@ cifs_get_inode_info(struct inode **inode, struct cifs_sb_info *cifs_sb = CIFS_SB(sb); bool adjust_tz = false; struct cifs_fattr fattr = {0}; - bool symlink = false; + bool is_reparse_point = false; FILE_ALL_INFO *data = in_data; FILE_ALL_INFO *tmp_data = NULL; void *smb1_backup_rsp_buf = NULL; int rc = 0; int tmprc = 0; + __u32 reparse_tag = 0; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -938,8 +954,8 @@ cifs_get_inode_info(struct inode **inode, goto out; } rc = server->ops->query_path_info(xid, tcon, cifs_sb, - full_path, tmp_data, - &adjust_tz, &symlink); + full_path, tmp_data, + &adjust_tz, &is_reparse_point); data = tmp_data; } @@ -949,7 +965,19 @@ cifs_get_inode_info(struct inode **inode, switch (rc) { case 0: - cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz, symlink); + /* + * If the file is a reparse point, it is more complicated + * since we have to check if its reparse tag matches a known + * special file type e.g. symlink or fifo or char etc. + */ + if ((le32_to_cpu(data->Attributes) & ATTR_REPARSE) && + server->ops->query_reparse_tag) { + rc = server->ops->query_reparse_tag(xid, tcon, cifs_sb, + full_path, &reparse_tag); + cifs_dbg(FYI, "reparse tag 0x%x\n", reparse_tag); + } + cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz, + is_reparse_point, reparse_tag); break; case -EREMOTE: /* DFS link, no metadata available on this server */ @@ -2883,13 +2911,18 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs) { struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb); + int rc, retries = 0; - if (pTcon->unix_ext) - return cifs_setattr_unix(direntry, attrs); - - return cifs_setattr_nounix(direntry, attrs); + do { + if (pTcon->unix_ext) + rc = cifs_setattr_unix(direntry, attrs); + else + rc = cifs_setattr_nounix(direntry, attrs); + retries++; + } while (is_retryable_error(rc) && retries < 2); /* BB: add cifs_setattr_legacy for really old servers */ + return rc; } #if 0 diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 6df0922e7e30..799be3a5d25e 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -168,10 +168,33 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) fattr->cf_uid = cifs_sb->mnt_uid; fattr->cf_gid = cifs_sb->mnt_gid; + /* + * The IO_REPARSE_TAG_LX_ tags originally were used by WSL but they + * are preferred by the Linux client in some cases since, unlike + * the NFS reparse tag (or EAs), they don't require an extra query + * to determine which type of special file they represent. + * TODO: go through all documented reparse tags to see if we can + * reasonably map some of them to directories vs. files vs. symlinks + */ if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode; fattr->cf_dtype = DT_DIR; - } else { + } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_SYMLINK) { + fattr->cf_mode |= S_IFLNK | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_LNK; + } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_FIFO) { + fattr->cf_mode |= S_IFIFO | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_FIFO; + } else if (fattr->cf_cifstag == IO_REPARSE_TAG_AF_UNIX) { + fattr->cf_mode |= S_IFSOCK | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_SOCK; + } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_CHR) { + fattr->cf_mode |= S_IFCHR | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_CHR; + } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_BLK) { + fattr->cf_mode |= S_IFBLK | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_BLK; + } else { /* TODO: should we mark some other reparse points (like DFSR) as directories? */ fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode; fattr->cf_dtype = DT_REG; } @@ -267,9 +290,8 @@ cifs_posix_to_fattr(struct cifs_fattr *fattr, struct smb2_posix_info *info, if (reparse_file_needs_reval(fattr)) fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; - /* TODO map SIDs */ - fattr->cf_uid = cifs_sb->mnt_uid; - fattr->cf_gid = cifs_sb->mnt_gid; + sid_to_id(cifs_sb, &parsed.owner, fattr, SIDOWNER); + sid_to_id(cifs_sb, &parsed.group, fattr, SIDGROUP); } static void __dir_info_to_fattr(struct cifs_fattr *fattr, const void *info) @@ -360,11 +382,11 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb, */ static int -initiate_cifs_search(const unsigned int xid, struct file *file) +initiate_cifs_search(const unsigned int xid, struct file *file, + char *full_path) { __u16 search_flags; int rc = 0; - char *full_path = NULL; struct cifsFileInfo *cifsFile; struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); struct tcon_link *tlink = NULL; @@ -400,12 +422,6 @@ initiate_cifs_search(const unsigned int xid, struct file *file) cifsFile->invalidHandle = true; cifsFile->srch_inf.endOfSearch = false; - full_path = build_path_from_dentry(file_dentry(file)); - if (full_path == NULL) { - rc = -ENOMEM; - goto error_exit; - } - cifs_dbg(FYI, "Full path: %s start at: %lld\n", full_path, file->f_pos); ffirst_retry: @@ -444,7 +460,6 @@ ffirst_retry: goto ffirst_retry; } error_exit: - kfree(full_path); cifs_put_tlink(tlink); return rc; } @@ -688,7 +703,8 @@ static int cifs_save_resume_key(const char *current_entry, */ static int find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, - struct file *file, char **current_entry, int *num_to_ret) + struct file *file, char *full_path, + char **current_entry, int *num_to_ret) { __u16 search_flags; int rc = 0; @@ -741,7 +757,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, ntwrk_buf_start); cfile->srch_inf.ntwrk_buf_start = NULL; } - rc = initiate_cifs_search(xid, file); + rc = initiate_cifs_search(xid, file, full_path); if (rc) { cifs_dbg(FYI, "error %d reinitiating a search on rewind\n", rc); @@ -925,15 +941,22 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) char *tmp_buf = NULL; char *end_of_smb; unsigned int max_len; + char *full_path = NULL; xid = get_xid(); + full_path = build_path_from_dentry(file_dentry(file)); + if (full_path == NULL) { + rc = -ENOMEM; + goto rddir2_exit; + } + /* * Ensure FindFirst doesn't fail before doing filldir() for '.' and * '..'. Otherwise we won't be able to notify VFS in case of failure. */ if (file->private_data == NULL) { - rc = initiate_cifs_search(xid, file); + rc = initiate_cifs_search(xid, file, full_path); cifs_dbg(FYI, "initiate cifs search rc %d\n", rc); if (rc) goto rddir2_exit; @@ -960,8 +983,8 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) } */ tcon = tlink_tcon(cifsFile->tlink); - rc = find_cifs_entry(xid, tcon, ctx->pos, file, ¤t_entry, - &num_to_fill); + rc = find_cifs_entry(xid, tcon, ctx->pos, file, full_path, + ¤t_entry, &num_to_fill); if (rc) { cifs_dbg(FYI, "fce error %d\n", rc); goto rddir2_exit; @@ -1019,6 +1042,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) kfree(tmp_buf); rddir2_exit: + kfree(full_path); free_xid(xid); return rc; } diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h index cf20f0b5d836..99a1951a01ec 100644 --- a/fs/cifs/smb2glob.h +++ b/fs/cifs/smb2glob.h @@ -58,6 +58,7 @@ #define SMB2_HMACSHA256_SIZE (32) #define SMB2_CMACAES_SIZE (16) #define SMB3_SIGNKEY_SIZE (16) +#define SMB3_GCM256_CRYPTKEY_SIZE (32) /* Maximum buffer size value we can send with 1 credit */ #define SMB2_MAX_BUFFER_SIZE 65536 diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index eba01d0908dd..1f900b81c34a 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -506,17 +506,17 @@ move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src) int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, - FILE_ALL_INFO *data, bool *adjust_tz, bool *symlink) + FILE_ALL_INFO *data, bool *adjust_tz, bool *reparse) { int rc; struct smb2_file_all_info *smb2_data; __u32 create_options = 0; - struct cifs_fid fid; bool no_cached_open = tcon->nohandlecache; struct cifsFileInfo *cfile; + struct cached_fid *cfid = NULL; *adjust_tz = false; - *symlink = false; + *reparse = false; smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, GFP_KERNEL); @@ -525,7 +525,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, /* If it is a root and its handle is cached then use it */ if (!strlen(full_path) && !no_cached_open) { - rc = open_shroot(xid, tcon, cifs_sb, &fid); + rc = open_shroot(xid, tcon, cifs_sb, &cfid); if (rc) goto out; @@ -533,12 +533,13 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, move_smb2_info_to_cifs(data, &tcon->crfid.file_all_info); } else { - rc = SMB2_query_info(xid, tcon, fid.persistent_fid, - fid.volatile_fid, smb2_data); + rc = SMB2_query_info(xid, tcon, + cfid->fid->persistent_fid, + cfid->fid->volatile_fid, smb2_data); if (!rc) move_smb2_info_to_cifs(data, smb2_data); } - close_shroot(&tcon->crfid); + close_shroot(cfid); goto out; } @@ -547,7 +548,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, smb2_data, SMB2_OP_QUERY_INFO, cfile); if (rc == -EOPNOTSUPP) { - *symlink = true; + *reparse = true; create_options |= OPEN_REPARSE_POINT; /* Failed on a symbolic link - query a reparse point info */ @@ -569,7 +570,7 @@ out: int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, - struct smb311_posix_qinfo *data, bool *adjust_tz, bool *symlink) + struct smb311_posix_qinfo *data, bool *adjust_tz, bool *reparse) { int rc; __u32 create_options = 0; @@ -577,7 +578,7 @@ smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct smb311_posix_qinfo *smb2_data; *adjust_tz = false; - *symlink = false; + *reparse = false; /* BB TODO: Make struct larger when add support for parsing owner SIDs */ smb2_data = kzalloc(sizeof(struct smb311_posix_qinfo), @@ -598,7 +599,7 @@ smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, ACL_NO_MODE, smb2_data, SMB2_OP_POSIX_QUERY_INFO, cfile); if (rc == -EOPNOTSUPP) { /* BB TODO: When support for special files added to Samba re-verify this path */ - *symlink = true; + *reparse = true; create_options |= OPEN_REPARSE_POINT; /* Failed on a symbolic link - query a reparse point info */ diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index 7fde3775cb57..c775682ee973 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -488,7 +488,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_PIPE_CONNECTED, -EIO, "STATUS_PIPE_CONNECTED"}, {STATUS_PIPE_LISTENING, -EIO, "STATUS_PIPE_LISTENING"}, {STATUS_INVALID_READ_MODE, -EIO, "STATUS_INVALID_READ_MODE"}, - {STATUS_IO_TIMEOUT, -ETIMEDOUT, "STATUS_IO_TIMEOUT"}, + {STATUS_IO_TIMEOUT, -EAGAIN, "STATUS_IO_TIMEOUT"}, {STATUS_FILE_FORCED_CLOSED, -EIO, "STATUS_FILE_FORCED_CLOSED"}, {STATUS_PROFILING_NOT_STARTED, -EIO, "STATUS_PROFILING_NOT_STARTED"}, {STATUS_PROFILING_NOT_STOPPED, -EIO, "STATUS_PROFILING_NOT_STOPPED"}, @@ -814,7 +814,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_INVALID_VARIANT, -EIO, "STATUS_INVALID_VARIANT"}, {STATUS_DOMAIN_CONTROLLER_NOT_FOUND, -EIO, "STATUS_DOMAIN_CONTROLLER_NOT_FOUND"}, - {STATUS_ACCOUNT_LOCKED_OUT, -EIO, "STATUS_ACCOUNT_LOCKED_OUT"}, + {STATUS_ACCOUNT_LOCKED_OUT, -EACCES, "STATUS_ACCOUNT_LOCKED_OUT"}, {STATUS_HANDLE_NOT_CLOSABLE, -EIO, "STATUS_HANDLE_NOT_CLOSABLE"}, {STATUS_CONNECTION_REFUSED, -EIO, "STATUS_CONNECTION_REFUSED"}, {STATUS_GRACEFUL_DISCONNECT, -EIO, "STATUS_GRACEFUL_DISCONNECT"}, diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index d44df8f95bcd..504766cb6c19 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -72,7 +72,7 @@ smb2_add_credits(struct TCP_Server_Info *server, /* eg found case where write overlapping reconnect messed up credits */ if (((optype & CIFS_OP_MASK) == CIFS_NEG_OP) && (*val != 0)) trace_smb3_reconnect_with_invalid_credits(server->CurrentMid, - server->hostname, *val); + server->hostname, *val, add); if ((instance == 0) || (instance == server->reconnect_instance)) *val += add; else @@ -121,6 +121,8 @@ smb2_add_credits(struct TCP_Server_Info *server, cifs_dbg(FYI, "disabling oplocks\n"); break; default: + trace_smb3_add_credits(server->CurrentMid, + server->hostname, rc, add); cifs_dbg(FYI, "add %u credits total=%d\n", add, rc); } } @@ -651,7 +653,8 @@ smb2_cached_lease_break(struct work_struct *work) * Open the directory at the root of a share */ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, struct cifs_fid *pfid) + struct cifs_sb_info *cifs_sb, + struct cached_fid **cfid) { struct cifs_ses *ses = tcon->ses; struct TCP_Server_Info *server = ses->server; @@ -666,11 +669,12 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, int rc, flags = 0; __le16 utf16_path = 0; /* Null - since an open of top of share */ u8 oplock = SMB2_OPLOCK_LEVEL_II; + struct cifs_fid *pfid; mutex_lock(&tcon->crfid.fid_mutex); if (tcon->crfid.is_valid) { cifs_dbg(FYI, "found a cached root file handle\n"); - memcpy(pfid, tcon->crfid.fid, sizeof(struct cifs_fid)); + *cfid = &tcon->crfid; kref_get(&tcon->crfid.refcount); mutex_unlock(&tcon->crfid.fid_mutex); return 0; @@ -691,6 +695,7 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, if (!server->ops->new_lease_key) return -EIO; + pfid = tcon->crfid.fid; server->ops->new_lease_key(pfid); memset(rqst, 0, sizeof(rqst)); @@ -820,6 +825,8 @@ oshr_free: SMB2_query_info_free(&rqst[1]); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + if (rc == 0) + *cfid = &tcon->crfid; return rc; } @@ -833,6 +840,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_fid fid; bool no_cached_open = tcon->nohandlecache; + struct cached_fid *cfid = NULL; oparms.tcon = tcon; oparms.desired_access = FILE_READ_ATTRIBUTES; @@ -841,12 +849,14 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - if (no_cached_open) + if (no_cached_open) { rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL, NULL, NULL); - else - rc = open_shroot(xid, tcon, cifs_sb, &fid); - + } else { + rc = open_shroot(xid, tcon, cifs_sb, &cfid); + if (rc == 0) + memcpy(&fid, cfid->fid, sizeof(struct cifs_fid)); + } if (rc) return; @@ -863,7 +873,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, if (no_cached_open) SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); else - close_shroot(&tcon->crfid); + close_shroot(cfid); } static void @@ -2346,6 +2356,17 @@ smb2_is_session_expired(char *buf) return true; } +static bool +smb2_is_status_io_timeout(char *buf) +{ + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + + if (shdr->Status == STATUS_IO_TIMEOUT) + return true; + else + return false; +} + static int smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid, struct cifsInodeInfo *cinode) @@ -3013,6 +3034,133 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, return rc; } +int +smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + __u32 *tag) +{ + int rc; + __le16 *utf16_path = NULL; + __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + struct cifs_open_parms oparms; + struct cifs_fid fid; + struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); + int flags = 0; + struct smb_rqst rqst[3]; + int resp_buftype[3]; + struct kvec rsp_iov[3]; + struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; + struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; + struct kvec close_iov[1]; + struct smb2_ioctl_rsp *ioctl_rsp; + struct reparse_data_buffer *reparse_buf; + u32 plen; + + cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); + + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + memset(rqst, 0, sizeof(rqst)); + resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; + memset(rsp_iov, 0, sizeof(rsp_iov)); + + utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); + if (!utf16_path) + return -ENOMEM; + + /* + * setup smb2open - TODO add optimization to call cifs_get_readable_path + * to see if there is a handle already open that we can use + */ + memset(&open_iov, 0, sizeof(open_iov)); + rqst[0].rq_iov = open_iov; + rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; + + memset(&oparms, 0, sizeof(oparms)); + oparms.tcon = tcon; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.disposition = FILE_OPEN; + oparms.create_options = cifs_create_options(cifs_sb, OPEN_REPARSE_POINT); + oparms.fid = &fid; + oparms.reconnect = false; + + rc = SMB2_open_init(tcon, server, + &rqst[0], &oplock, &oparms, utf16_path); + if (rc) + goto query_rp_exit; + smb2_set_next_command(tcon, &rqst[0]); + + + /* IOCTL */ + memset(&io_iov, 0, sizeof(io_iov)); + rqst[1].rq_iov = io_iov; + rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE; + + rc = SMB2_ioctl_init(tcon, server, + &rqst[1], fid.persistent_fid, + fid.volatile_fid, FSCTL_GET_REPARSE_POINT, + true /* is_fctl */, NULL, 0, + CIFSMaxBufSize - + MAX_SMB2_CREATE_RESPONSE_SIZE - + MAX_SMB2_CLOSE_RESPONSE_SIZE); + if (rc) + goto query_rp_exit; + + smb2_set_next_command(tcon, &rqst[1]); + smb2_set_related(&rqst[1]); + + + /* Close */ + memset(&close_iov, 0, sizeof(close_iov)); + rqst[2].rq_iov = close_iov; + rqst[2].rq_nvec = 1; + + rc = SMB2_close_init(tcon, server, + &rqst[2], COMPOUND_FID, COMPOUND_FID, false); + if (rc) + goto query_rp_exit; + + smb2_set_related(&rqst[2]); + + rc = compound_send_recv(xid, tcon->ses, server, + flags, 3, rqst, + resp_buftype, rsp_iov); + + ioctl_rsp = rsp_iov[1].iov_base; + + /* + * Open was successful and we got an ioctl response. + */ + if (rc == 0) { + /* See MS-FSCC 2.3.23 */ + + reparse_buf = (struct reparse_data_buffer *) + ((char *)ioctl_rsp + + le32_to_cpu(ioctl_rsp->OutputOffset)); + plen = le32_to_cpu(ioctl_rsp->OutputCount); + + if (plen + le32_to_cpu(ioctl_rsp->OutputOffset) > + rsp_iov[1].iov_len) { + cifs_tcon_dbg(FYI, "srv returned invalid ioctl len: %d\n", + plen); + rc = -EIO; + goto query_rp_exit; + } + *tag = le32_to_cpu(reparse_buf->ReparseTag); + } + + query_rp_exit: + kfree(utf16_path); + SMB2_open_free(&rqst[0]); + SMB2_ioctl_free(&rqst[1]); + SMB2_close_free(&rqst[2]); + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + return rc; +} + static struct cifs_ntsd * get_smb2_acl_by_fid(struct cifs_sb_info *cifs_sb, const struct cifs_fid *cifsfid, u32 *pacllen) @@ -3072,7 +3220,12 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb, oparms.tcon = tcon; oparms.desired_access = READ_CONTROL; oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); + /* + * When querying an ACL, even if the file is a symlink we want to open + * the source not the target, and so the protocol requires that the + * client specify this flag when opening a reparse point + */ + oparms.create_options = cifs_create_options(cifs_sb, 0) | OPEN_REPARSE_POINT; oparms.fid = &fid; oparms.reconnect = false; @@ -3801,10 +3954,11 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len, tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM; tr_hdr->OriginalMessageSize = cpu_to_le32(orig_len); tr_hdr->Flags = cpu_to_le16(0x01); - if (cipher_type == SMB2_ENCRYPTION_AES128_GCM) - get_random_bytes(&tr_hdr->Nonce, SMB3_AES128GCM_NONCE); + if ((cipher_type == SMB2_ENCRYPTION_AES128_GCM) || + (cipher_type == SMB2_ENCRYPTION_AES256_GCM)) + get_random_bytes(&tr_hdr->Nonce, SMB3_AES_GCM_NONCE); else - get_random_bytes(&tr_hdr->Nonce, SMB3_AES128CCM_NONCE); + get_random_bytes(&tr_hdr->Nonce, SMB3_AES_CCM_NONCE); memcpy(&tr_hdr->SessionId, &shdr->SessionId, 8); } @@ -3924,7 +4078,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, if (rc) { cifs_server_dbg(VFS, "%s: Could not get %scryption key\n", __func__, enc ? "en" : "de"); - return 0; + return rc; } rc = smb3_crypto_aead_allocate(server); @@ -3935,7 +4089,12 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, tfm = enc ? server->secmech.ccmaesencrypt : server->secmech.ccmaesdecrypt; - rc = crypto_aead_setkey(tfm, key, SMB3_SIGN_KEY_SIZE); + + if (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM) + rc = crypto_aead_setkey(tfm, key, SMB3_GCM256_CRYPTKEY_SIZE); + else + rc = crypto_aead_setkey(tfm, key, SMB3_SIGN_KEY_SIZE); + if (rc) { cifs_server_dbg(VFS, "%s: Failed to set aead key %d\n", __func__, rc); return rc; @@ -3973,11 +4132,12 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, goto free_sg; } - if (server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) - memcpy(iv, (char *)tr_hdr->Nonce, SMB3_AES128GCM_NONCE); + if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) || + (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) + memcpy(iv, (char *)tr_hdr->Nonce, SMB3_AES_GCM_NONCE); else { iv[0] = 3; - memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES128CCM_NONCE); + memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES_CCM_NONCE); } aead_request_set_crypt(req, sg, sg, crypt_len, iv); @@ -4103,7 +4263,8 @@ smb3_is_transform_hdr(void *buf) static int decrypt_raw_data(struct TCP_Server_Info *server, char *buf, unsigned int buf_data_size, struct page **pages, - unsigned int npages, unsigned int page_data_size) + unsigned int npages, unsigned int page_data_size, + bool is_offloaded) { struct kvec iov[2]; struct smb_rqst rqst = {NULL}; @@ -4129,7 +4290,8 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, memmove(buf, iov[1].iov_base, buf_data_size); - server->total_read = buf_data_size + page_data_size; + if (!is_offloaded) + server->total_read = buf_data_size + page_data_size; return rc; } @@ -4342,7 +4504,7 @@ static void smb2_decrypt_offload(struct work_struct *work) struct mid_q_entry *mid; rc = decrypt_raw_data(dw->server, dw->buf, dw->server->vals->read_rsp_size, - dw->ppages, dw->npages, dw->len); + dw->ppages, dw->npages, dw->len, true); if (rc) { cifs_dbg(VFS, "error decrypting rc=%d\n", rc); goto free_pages; @@ -4448,7 +4610,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, non_offloaded_decrypt: rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size, - pages, npages, len); + pages, npages, len, false); if (rc) goto free_pages; @@ -4504,7 +4666,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server, server->total_read += length; buf_size = pdu_length - sizeof(struct smb2_transform_hdr); - length = decrypt_raw_data(server, buf, buf_size, NULL, 0, 0); + length = decrypt_raw_data(server, buf, buf_size, NULL, 0, 0, false); if (length) return length; @@ -4809,6 +4971,7 @@ struct smb_version_operations smb20_operations = { .make_node = smb2_make_node, .fiemap = smb3_fiemap, .llseek = smb3_llseek, + .is_status_io_timeout = smb2_is_status_io_timeout, }; struct smb_version_operations smb21_operations = { @@ -4909,6 +5072,7 @@ struct smb_version_operations smb21_operations = { .make_node = smb2_make_node, .fiemap = smb3_fiemap, .llseek = smb3_llseek, + .is_status_io_timeout = smb2_is_status_io_timeout, }; struct smb_version_operations smb30_operations = { @@ -4949,6 +5113,8 @@ struct smb_version_operations smb30_operations = { .can_echo = smb2_can_echo, .echo = SMB2_echo, .query_path_info = smb2_query_path_info, + /* WSL tags introduced long after smb2.1, enable for SMB3, 3.11 only */ + .query_reparse_tag = smb2_query_reparse_tag, .get_srv_inum = smb2_get_srv_inum, .query_file_info = smb2_query_file_info, .set_path_size = smb2_set_path_size, @@ -5019,6 +5185,7 @@ struct smb_version_operations smb30_operations = { .make_node = smb2_make_node, .fiemap = smb3_fiemap, .llseek = smb3_llseek, + .is_status_io_timeout = smb2_is_status_io_timeout, }; struct smb_version_operations smb311_operations = { @@ -5059,6 +5226,7 @@ struct smb_version_operations smb311_operations = { .can_echo = smb2_can_echo, .echo = SMB2_echo, .query_path_info = smb2_query_path_info, + .query_reparse_tag = smb2_query_reparse_tag, .get_srv_inum = smb2_get_srv_inum, .query_file_info = smb2_query_file_info, .set_path_size = smb2_set_path_size, @@ -5130,6 +5298,7 @@ struct smb_version_operations smb311_operations = { .make_node = smb2_make_node, .fiemap = smb3_fiemap, .llseek = smb3_llseek, + .is_status_io_timeout = smb2_is_status_io_timeout, }; struct smb_version_values smb20_values = { diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 96c172d94fba..445e80862865 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -449,10 +449,22 @@ static void build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt) { pneg_ctxt->ContextType = SMB2_ENCRYPTION_CAPABILITIES; - pneg_ctxt->DataLength = cpu_to_le16(6); /* Cipher Count + two ciphers */ - pneg_ctxt->CipherCount = cpu_to_le16(2); - pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM; - pneg_ctxt->Ciphers[1] = SMB2_ENCRYPTION_AES128_CCM; + if (require_gcm_256) { + pneg_ctxt->DataLength = cpu_to_le16(4); /* Cipher Count + 1 cipher */ + pneg_ctxt->CipherCount = cpu_to_le16(1); + pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES256_GCM; + } else if (enable_gcm_256) { + pneg_ctxt->DataLength = cpu_to_le16(8); /* Cipher Count + 3 ciphers */ + pneg_ctxt->CipherCount = cpu_to_le16(3); + pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM; + pneg_ctxt->Ciphers[1] = SMB2_ENCRYPTION_AES256_GCM; + pneg_ctxt->Ciphers[2] = SMB2_ENCRYPTION_AES128_CCM; + } else { + pneg_ctxt->DataLength = cpu_to_le16(6); /* Cipher Count + 2 ciphers */ + pneg_ctxt->CipherCount = cpu_to_le16(2); + pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM; + pneg_ctxt->Ciphers[1] = SMB2_ENCRYPTION_AES128_CCM; + } } static unsigned int @@ -598,8 +610,29 @@ static int decode_encrypt_ctx(struct TCP_Server_Info *server, return -EINVAL; } cifs_dbg(FYI, "SMB311 cipher type:%d\n", le16_to_cpu(ctxt->Ciphers[0])); - if ((ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_CCM) && - (ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_GCM)) { + if (require_gcm_256) { + if (ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES256_GCM) { + cifs_dbg(VFS, "Server does not support requested encryption type (AES256 GCM)\n"); + return -EOPNOTSUPP; + } + } else if (ctxt->Ciphers[0] == 0) { + /* + * e.g. if server only supported AES256_CCM (very unlikely) + * or server supported no encryption types or had all disabled. + * Since GLOBAL_CAP_ENCRYPTION will be not set, in the case + * in which mount requested encryption ("seal") checks later + * on during tree connection will return proper rc, but if + * seal not requested by client, since server is allowed to + * return 0 to indicate no supported cipher, we can't fail here + */ + server->cipher_type = 0; + server->capabilities &= ~SMB2_GLOBAL_CAP_ENCRYPTION; + pr_warn_once("Server does not support requested encryption types\n"); + return 0; + } else if ((ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_CCM) && + (ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_GCM) && + (ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES256_GCM)) { + /* server returned a cipher we didn't ask for */ pr_warn_once("Invalid SMB3.11 cipher returned\n"); return -EINVAL; } @@ -1948,9 +1981,11 @@ smb2_parse_contexts(struct TCP_Server_Info *server, unsigned int next; unsigned int remaining; char *name; - const char smb3_create_tag_posix[] = {0x93, 0xAD, 0x25, 0x50, 0x9C, - 0xB4, 0x11, 0xE7, 0xB4, 0x23, 0x83, - 0xDE, 0x96, 0x8B, 0xCD, 0x7C}; + static const char smb3_create_tag_posix[] = { + 0x93, 0xAD, 0x25, 0x50, 0x9C, + 0xB4, 0x11, 0xE7, 0xB4, 0x23, 0x83, + 0xDE, 0x96, 0x8B, 0xCD, 0x7C + }; *oplock = 0; data_offset = (char *)rsp + le32_to_cpu(rsp->CreateContextsOffset); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index c3f1baf5bde2..f05f9b12f689 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -128,8 +128,8 @@ struct smb2_sync_pdu { __le16 StructureSize2; /* size of wct area (varies, request specific) */ } __packed; -#define SMB3_AES128CCM_NONCE 11 -#define SMB3_AES128GCM_NONCE 12 +#define SMB3_AES_CCM_NONCE 11 +#define SMB3_AES_GCM_NONCE 12 /* Transform flags (for 3.0 dialect this flag indicates CCM */ #define TRANSFORM_FLAG_ENCRYPTED 0x0001 @@ -153,10 +153,14 @@ struct smb2_compression_transform_hdr { } __packed; /* See MS-SMB2 2.2.42.1 */ +#define SMB2_COMPRESSION_FLAG_NONE 0x0000 +#define SMB2_COMPRESSION_FLAG_CHAINED 0x0001 + struct compression_payload_header { - __le16 AlgorithmId; - __le16 Reserved; - __le32 Length; + __le16 CompressionAlgorithm; + __le16 Flags; + __le32 Length; /* length of compressed playload including field below if present */ + /* __le32 OriginalPayloadSize; */ /* optional */ } __packed; /* See MS-SMB2 2.2.42.2 */ @@ -167,6 +171,26 @@ struct compression_pattern_payload_v1 { __le32 Repetitions; } __packed; +/* See MS-SMB2 2.2.43 */ +struct smb2_rdma_transform { + __le16 RdmaDescriptorOffset; + __le16 RdmaDescriptorLength; + __le32 Channel; /* for values see channel description in smb2 read above */ + __le16 TransformCount; + __le16 Reserved1; + __le32 Reserved2; +} __packed; + +struct smb2_rdma_encryption_transform { + __le16 TransformType; + __le16 SignatureLength; + __le16 NonceLength; + __u16 Reserved; + __u8 Signature[]; /* variable length */ + /* u8 Nonce[] */ + /* followed by padding */ +} __packed; + /* * SMB2 flag definitions */ @@ -297,6 +321,9 @@ struct smb2_negotiate_req { #define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2) #define SMB2_COMPRESSION_CAPABILITIES cpu_to_le16(3) #define SMB2_NETNAME_NEGOTIATE_CONTEXT_ID cpu_to_le16(5) +#define SMB2_TRANSPORT_CAPABILITIES cpu_to_le16(6) +#define SMB2_RDMA_TRANSFORM_CAPABILITIES cpu_to_le16(7) +#define SMB2_SIGNING_CAPABILITIES cpu_to_le16(8) #define SMB2_POSIX_EXTENSIONS_AVAILABLE cpu_to_le16(0x100) struct smb2_neg_context { @@ -325,6 +352,9 @@ struct smb2_preauth_neg_context { /* Encryption Algorithms Ciphers */ #define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001) #define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002) +/* we currently do not request AES256_CCM since presumably GCM faster */ +#define SMB2_ENCRYPTION_AES256_CCM cpu_to_le16(0x0003) +#define SMB2_ENCRYPTION_AES256_GCM cpu_to_le16(0x0004) /* Min encrypt context data is one cipher so 2 bytes + 2 byte count field */ #define MIN_ENCRYPT_CTXT_DATA_LEN 4 @@ -332,8 +362,9 @@ struct smb2_encryption_neg_context { __le16 ContextType; /* 2 */ __le16 DataLength; __le32 Reserved; - __le16 CipherCount; /* AES-128-GCM and AES-128-CCM */ - __le16 Ciphers[2]; + /* CipherCount usally 2, but can be 3 when AES256-GCM enabled */ + __le16 CipherCount; /* AES128-GCM and AES128-CCM by default */ + __le16 Ciphers[3]; } __packed; /* See MS-SMB2 2.2.3.1.3 */ @@ -351,10 +382,10 @@ struct smb2_encryption_neg_context { struct smb2_compression_capabilities_context { __le16 ContextType; /* 3 */ __le16 DataLength; - __u32 Flags; + __u32 Reserved; __le16 CompressionAlgorithmCount; __u16 Padding; - __u32 Reserved1; + __u32 Flags; __le16 CompressionAlgorithms[3]; } __packed; @@ -363,12 +394,44 @@ struct smb2_compression_capabilities_context { * Its struct simply contains NetName, an array of Unicode characters */ struct smb2_netname_neg_context { - __le16 ContextType; /* 0x100 */ + __le16 ContextType; /* 5 */ __le16 DataLength; __le32 Reserved; __le16 NetName[]; /* hostname of target converted to UCS-2 */ } __packed; +/* + * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6 + * and 2.2.4.1.5 + */ + +/* RDMA Transform IDs */ +#define SMB2_RDMA_TRANSFORM_NONE 0x0000 +#define SMB2_RDMA_TRANSFORM_ENCRYPTION 0x0001 + +struct smb2_rdma_transform_capabilities_context { + __le16 ContextType; /* 7 */ + __le16 DataLength; + __u32 Reserved; + __le16 TransformCount; + __u16 Reserved1; + __u32 Reserved2; + __le16 RDMATransformIds[1]; +} __packed; + +/* Signing algorithms */ +#define SIGNING_ALG_HMAC_SHA256 0 +#define SIGNING_ALG_AES_CMAC 1 +#define SIGNING_ALG_AES_GMAC 2 + +struct smb2_signing_capabilities { + __le16 ContextType; /* 8 */ + __le16 DataLength; + __u32 Reserved; + __le16 SigningAlgorithmCount; + __le16 SigningAlgorithms[]; +} __packed; + #define POSIX_CTXT_DATA_LEN 16 struct smb2_posix_neg_context { __le16 ContextType; /* 0x100 */ @@ -936,6 +999,31 @@ struct copychunk_ioctl_rsp { __le32 TotalBytesWritten; } __packed; +/* See MS-FSCC 2.3.29 and 2.3.30 */ +struct get_retrieval_pointer_count_req { + __le64 StartingVcn; /* virtual cluster number (signed) */ +} __packed; + +struct get_retrieval_pointer_count_rsp { + __le32 ExtentCount; +} __packed; + +/* + * See MS-FSCC 2.3.33 and 2.3.34 + * request is the same as get_retrieval_point_count_req struct above + */ +struct smb3_extents { + __le64 NextVcn; + __le64 Lcn; /* logical cluster number */ +} __packed; + +struct get_retrieval_pointers_refcount_rsp { + __le32 ExtentCount; + __u32 Reserved; + __le64 StartingVcn; + struct smb3_extents extents[]; +} __packed; + struct fsctl_set_integrity_information_req { __le16 ChecksumAlgorithm; __le16 Reserved; @@ -1178,6 +1266,7 @@ struct smb2_flush_rsp { #define SMB2_CHANNEL_NONE cpu_to_le32(0x00000000) #define SMB2_CHANNEL_RDMA_V1 cpu_to_le32(0x00000001) /* SMB3 or later */ #define SMB2_CHANNEL_RDMA_V1_INVALIDATE cpu_to_le32(0x00000002) /* >= SMB3.02 */ +#define SMB2_CHANNEL_RDMA_TRANSFORM cpu_to_le32(0x00000003) /* >= SMB3.02, only used on write */ /* SMB2 read request without RFC1001 length at the beginning */ struct smb2_read_plain_req { @@ -1197,6 +1286,10 @@ struct smb2_read_plain_req { __u8 Buffer[1]; } __packed; +/* Read flags */ +#define SMB2_READFLAG_RESPONSE_NONE 0x00000000 +#define SMB2_READFLAG_RESPONSE_RDMA_TRANSFORM 0x00000001 + struct smb2_read_rsp { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 17 */ @@ -1204,7 +1297,7 @@ struct smb2_read_rsp { __u8 Reserved; __le32 DataLength; __le32 DataRemaining; - __u32 Reserved2; + __u32 Flags; __u8 Buffer[1]; } __packed; @@ -1572,6 +1665,7 @@ struct smb2_file_rename_info { /* encoding of request for level 10 */ __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ __le32 FileNameLength; char FileName[]; /* New name to be assigned */ + /* padding - overall struct size must be >= 24 so filename + pad >= 6 */ } __packed; /* level 10 Set */ struct smb2_file_link_info { /* encoding of request for level 11 */ @@ -1623,6 +1717,11 @@ struct smb2_file_eof_info { /* encoding of request for level 10 */ __le64 EndOfFile; /* new end of file value */ } __packed; /* level 20 Set */ +struct smb2_file_reparse_point_info { + __le64 IndexNumber; + __le32 Tag; +} __packed; + struct smb2_file_network_open_info { __le64 CreationTime; __le64 LastAccessTime; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 2f8ecbf54214..d4110447ee3a 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -70,12 +70,16 @@ extern int smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid); extern int open_shroot(unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, struct cifs_fid *pfid); + struct cifs_sb_info *cifs_sb, + struct cached_fid **cfid); extern void close_shroot(struct cached_fid *cfid); extern void close_shroot_lease(struct cached_fid *cfid); extern void close_shroot_lease_locked(struct cached_fid *cfid); extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src); +extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *path, + __u32 *reparse_tag); extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, FILE_ALL_INFO *data, diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index c0348e3b1695..ebccd71cc60a 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -849,12 +849,13 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server) struct crypto_aead *tfm; if (!server->secmech.ccmaesencrypt) { - if (server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) + if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) || + (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) tfm = crypto_alloc_aead("gcm(aes)", 0, 0); else tfm = crypto_alloc_aead("ccm(aes)", 0, 0); if (IS_ERR(tfm)) { - cifs_server_dbg(VFS, "%s: Failed to alloc encrypt aead\n", + cifs_server_dbg(VFS, "%s: Failed alloc encrypt aead\n", __func__); return PTR_ERR(tfm); } @@ -862,7 +863,8 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server) } if (!server->secmech.ccmaesdecrypt) { - if (server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) + if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) || + (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) tfm = crypto_alloc_aead("gcm(aes)", 0, 0); else tfm = crypto_alloc_aead("ccm(aes)", 0, 0); diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h index 1ff28529cf4b..a0e84747f567 100644 --- a/fs/cifs/smbfsctl.h +++ b/fs/cifs/smbfsctl.h @@ -103,6 +103,8 @@ #define FSCTL_SET_ZERO_ON_DEALLOC 0x00090194 /* BB add struct */ #define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */ #define FSCTL_GET_INTEGRITY_INFORMATION 0x0009027C +#define FSCTL_GET_RETRIEVAL_POINTERS_AND_REFCOUNT 0x000903d3 +#define FSCTL_GET_RETRIEVAL_POINTER_COUNT 0x0009042b #define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF #define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */ #define FSCTL_FILE_LEVEL_TRIM 0x00098208 /* BB add struct */ diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index eef4b08c7208..90e0fab69bb8 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -878,33 +878,39 @@ DEFINE_SMB3_RECONNECT_EVENT(partial_send_reconnect); DECLARE_EVENT_CLASS(smb3_credit_class, TP_PROTO(__u64 currmid, char *hostname, - int credits), - TP_ARGS(currmid, hostname, credits), + int credits, + int credits_to_add), + TP_ARGS(currmid, hostname, credits, credits_to_add), TP_STRUCT__entry( __field(__u64, currmid) __field(char *, hostname) __field(int, credits) + __field(int, credits_to_add) ), TP_fast_assign( __entry->currmid = currmid; __entry->hostname = hostname; __entry->credits = credits; + __entry->credits_to_add = credits_to_add; ), - TP_printk("server=%s current_mid=0x%llx credits=%d", + TP_printk("server=%s current_mid=0x%llx credits=%d credits_to_add=%d", __entry->hostname, __entry->currmid, - __entry->credits) + __entry->credits, + __entry->credits_to_add) ) #define DEFINE_SMB3_CREDIT_EVENT(name) \ DEFINE_EVENT(smb3_credit_class, smb3_##name, \ TP_PROTO(__u64 currmid, \ char *hostname, \ - int credits), \ - TP_ARGS(currmid, hostname, credits)) + int credits, \ + int credits_to_add), \ + TP_ARGS(currmid, hostname, credits, credits_to_add)) DEFINE_SMB3_CREDIT_EVENT(reconnect_with_invalid_credits); DEFINE_SMB3_CREDIT_EVENT(credit_timeout); +DEFINE_SMB3_CREDIT_EVENT(add_credits); #endif /* _CIFS_TRACE_H */ diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index ac7632482736..e27e255d40dd 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -563,7 +563,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, cifs_num_waiters_dec(server); if (!rc) { trace_smb3_credit_timeout(server->CurrentMid, - server->hostname, num_credits); + server->hostname, num_credits, 0); cifs_server_dbg(VFS, "wait timed out after %d ms\n", timeout); return -ENOTSUPP; @@ -604,7 +604,8 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, if (!rc) { trace_smb3_credit_timeout( server->CurrentMid, - server->hostname, num_credits); + server->hostname, num_credits, + 0); cifs_server_dbg(VFS, "wait timed out after %d ms\n", timeout); return -ENOTSUPP; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 912308600d39..4b90cfd1ec36 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -690,8 +690,7 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = 0; buf->f_files = CRAMFS_SB(sb)->files; buf->f_ffree = 0; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); buf->f_namelen = CRAMFS_MAXPATHLEN; return 0; } diff --git a/fs/efs/super.c b/fs/efs/super.c index a4a945d0ac6a..62b155b9366b 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -342,8 +342,7 @@ static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { sbi->inode_blocks * (EFS_BLOCKSIZE / sizeof(struct efs_dinode)); buf->f_ffree = sbi->inode_free; /* free inodes */ - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */ return 0; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index b9a09806512a..be10b16ea66e 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -561,8 +561,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = EROFS_NAME_LEN; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); return 0; } diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 3ffdce5c7384..87be5bfc31eb 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -89,8 +89,7 @@ static int exfat_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = sbi->num_clusters - 2; /* clu 0 & 1 */ buf->f_bfree = buf->f_blocks - sbi->used_clusters; buf->f_bavail = buf->f_bfree; - buf->f_fsid.val[0] = (unsigned int)id; - buf->f_fsid.val[1] = (unsigned int)(id >> 32); + buf->f_fsid = u64_to_fsid(id); /* Unicode utf16 255 characters */ buf->f_namelen = EXFAT_MAX_FILE_LEN * NLS_MAX_CHARSET_SIZE; return 0; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 7fab2b3b5b39..09f1fe676972 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1455,8 +1455,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) buf->f_namelen = EXT2_NAME_LEN; fsid = le64_to_cpup((void *)es->s_uuid) ^ le64_to_cpup((void *)es->s_uuid + sizeof(u64)); - buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; - buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + buf->f_fsid = u64_to_fsid(fsid); spin_unlock(&sbi->s_lock); return 0; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0f5a31498393..ef4734b40e2a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6144,8 +6144,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = EXT4_NAME_LEN; fsid = le64_to_cpup((void *)es->s_uuid) ^ le64_to_cpup((void *)es->s_uuid + sizeof(u64)); - buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; - buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + buf->f_fsid = u64_to_fsid(fsid); #ifdef CONFIG_QUOTA if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) && diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 0c958fed3392..00eff2f51807 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1442,8 +1442,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) } buf->f_namelen = F2FS_NAME_LEN; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); #ifdef CONFIG_QUOTA if (is_inode_flag_set(dentry->d_inode, FI_PROJ_INHERIT) && diff --git a/fs/fat/inode.c b/fs/fat/inode.c index a0cf99debb1e..bab9b202b496 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -836,8 +836,7 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = sbi->max_cluster - FAT_START_ENT; buf->f_bfree = sbi->free_clusters; buf->f_bavail = sbi->free_clusters; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); buf->f_namelen = (sbi->options.isvfat ? FAT_LFN_LEN : 12) * NLS_MAX_CHARSET_SIZE; diff --git a/fs/file_table.c b/fs/file_table.c index 656647f9575a..709ada3151da 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -339,7 +339,7 @@ void fput_many(struct file *file, unsigned int refs) if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { init_task_work(&file->f_u.fu_rcuhead, ____fput); - if (!task_work_add(task, &file->f_u.fu_rcuhead, true)) + if (!task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME)) return; /* * After this task has run exit_task_work(), diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index d4af283fc888..9cd2ecad07db 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -91,22 +91,13 @@ static int gfs2_writepage(struct page *page, struct writeback_control *wbc) struct inode *inode = page->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_SHIFT; - unsigned offset; + struct iomap_writepage_ctx wpc = { }; if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) goto out; if (current->journal_info) goto redirty; - /* Is the page fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_SIZE-1); - if (page->index > end_index || (page->index == end_index && !offset)) { - page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); - goto out; - } - - return nobh_writepage(page, gfs2_get_block_noalloc, wbc); + return iomap_writepage(page, wbc, &wpc, &gfs2_writeback_ops); redirty: redirty_page_for_writepage(wbc, page); @@ -115,11 +106,16 @@ out: return 0; } -/* This is the same as calling block_write_full_page, but it also +/** + * gfs2_write_jdata_page - gfs2 jdata-specific version of block_write_full_page + * @page: The page to write + * @wbc: The writeback control + * + * This is the same as calling block_write_full_page, but it also * writes pages outside of i_size */ -static int gfs2_write_full_page(struct page *page, get_block_t *get_block, - struct writeback_control *wbc) +static int gfs2_write_jdata_page(struct page *page, + struct writeback_control *wbc) { struct inode * const inode = page->mapping->host; loff_t i_size = i_size_read(inode); @@ -137,7 +133,7 @@ static int gfs2_write_full_page(struct page *page, get_block_t *get_block, if (page->index == end_index && offset) zero_user_segment(page, offset, PAGE_SIZE); - return __block_write_full_page(inode, page, get_block, wbc, + return __block_write_full_page(inode, page, gfs2_get_block_noalloc, wbc, end_buffer_async_write); } @@ -166,7 +162,7 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w } gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize); } - return gfs2_write_full_page(page, gfs2_get_block_noalloc, wbc); + return gfs2_write_jdata_page(page, wbc); } /** @@ -208,7 +204,8 @@ static int gfs2_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); - int ret = mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); + struct iomap_writepage_ctx wpc = { }; + int ret; /* * Even if we didn't write any pages here, we might still be holding @@ -216,9 +213,9 @@ static int gfs2_writepages(struct address_space *mapping, * want balance_dirty_pages() to loop indefinitely trying to write out * pages held in the ail that it can't find. */ + ret = iomap_writepages(mapping, wbc, &wpc, &gfs2_writeback_ops); if (ret == 0) set_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags); - return ret; } @@ -470,12 +467,13 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) static int __gfs2_readpage(void *file, struct page *page) { - struct gfs2_inode *ip = GFS2_I(page->mapping->host); - struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); int error; - if (i_blocksize(page->mapping->host) == PAGE_SIZE && - !page_has_buffers(page)) { + if (!gfs2_is_jdata(ip) || + (i_blocksize(inode) == PAGE_SIZE && !page_has_buffers(page))) { error = iomap_readpage(page, &gfs2_iomap_ops); } else if (gfs2_is_stuffed(ip)) { error = stuffed_readpage(ip, page); @@ -563,8 +561,12 @@ static void gfs2_readahead(struct readahead_control *rac) struct inode *inode = rac->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); - if (!gfs2_is_stuffed(ip)) + if (gfs2_is_stuffed(ip)) + ; + else if (gfs2_is_jdata(ip)) mpage_readahead(rac, gfs2_block_map); + else + iomap_readahead(rac, &gfs2_iomap_ops); } /** @@ -621,7 +623,8 @@ out: static int jdata_set_page_dirty(struct page *page) { - SetPageChecked(page); + if (current->journal_info) + SetPageChecked(page); return __set_page_dirty_buffers(page); } @@ -663,8 +666,11 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh) if (bd) { if (!list_empty(&bd->bd_list) && !buffer_pinned(bh)) list_del_init(&bd->bd_list); - else + else { + spin_lock(&sdp->sd_ail_lock); gfs2_remove_from_journal(bh, REMOVE_JDATA); + spin_unlock(&sdp->sd_ail_lock); + } } bh->b_bdev = NULL; clear_buffer_mapped(bh); @@ -736,7 +742,6 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) */ gfs2_log_lock(sdp); - spin_lock(&sdp->sd_ail_lock); head = bh = page_buffers(page); do { if (atomic_read(&bh->b_count)) @@ -748,7 +753,6 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) goto cannot_release; bh = bh->b_this_page; } while(bh != head); - spin_unlock(&sdp->sd_ail_lock); head = bh = page_buffers(page); do { @@ -774,7 +778,6 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) return try_to_free_buffers(page); cannot_release: - spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); return 0; } @@ -784,12 +787,13 @@ static const struct address_space_operations gfs2_aops = { .writepages = gfs2_writepages, .readpage = gfs2_readpage, .readahead = gfs2_readahead, + .set_page_dirty = iomap_set_page_dirty, + .releasepage = iomap_releasepage, + .invalidatepage = iomap_invalidatepage, .bmap = gfs2_bmap, - .invalidatepage = gfs2_invalidatepage, - .releasepage = gfs2_releasepage, .direct_IO = noop_direct_IO, - .migratepage = buffer_migrate_page, - .is_partially_uptodate = block_is_partially_uptodate, + .migratepage = iomap_migrate_page, + .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 0f69fbd4af66..8dff9cbd0a87 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -56,7 +56,6 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, u64 block, struct page *page) { struct inode *inode = &ip->i_inode; - struct buffer_head *bh; int release = 0; if (!page || page->index) { @@ -80,20 +79,21 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, SetPageUptodate(page); } - if (!page_has_buffers(page)) - create_empty_buffers(page, BIT(inode->i_blkbits), - BIT(BH_Uptodate)); + if (gfs2_is_jdata(ip)) { + struct buffer_head *bh; - bh = page_buffers(page); + if (!page_has_buffers(page)) + create_empty_buffers(page, BIT(inode->i_blkbits), + BIT(BH_Uptodate)); - if (!buffer_mapped(bh)) - map_bh(bh, inode->i_sb, block); + bh = page_buffers(page); + if (!buffer_mapped(bh)) + map_bh(bh, inode->i_sb, block); - set_buffer_uptodate(bh); - if (gfs2_is_jdata(ip)) + set_buffer_uptodate(bh); gfs2_trans_add_data(ip->i_gl, bh); - else { - mark_buffer_dirty(bh); + } else { + set_page_dirty(page); gfs2_ordered_add_inode(ip); } @@ -1158,7 +1158,8 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, struct metapath mp = { .mp_aheight = 1, }; int ret; - iomap->flags |= IOMAP_F_BUFFER_HEAD; + if (gfs2_is_jdata(ip)) + iomap->flags |= IOMAP_F_BUFFER_HEAD; trace_gfs2_iomap_start(ip, pos, length, flags); if (gfs2_iomap_need_write_lock(flags)) { @@ -1291,6 +1292,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, loff_t length = bh_map->b_size; struct metapath mp = { .mp_aheight = 1, }; struct iomap iomap = { }; + int flags = create ? IOMAP_WRITE : 0; int ret; clear_buffer_mapped(bh_map); @@ -1298,15 +1300,14 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, clear_buffer_boundary(bh_map); trace_gfs2_bmap(ip, bh_map, lblock, create, 1); - if (create) { - ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, &iomap, &mp); - if (!ret && iomap.type == IOMAP_HOLE) + ret = gfs2_iomap_get(inode, pos, length, flags, &iomap, &mp); + if (!ret && iomap.type == IOMAP_HOLE) { + if (create) ret = gfs2_iomap_alloc(inode, &iomap, &mp); - release_metapath(&mp); - } else { - ret = gfs2_iomap_get(inode, pos, length, 0, &iomap, &mp); - release_metapath(&mp); + else + ret = -ENODATA; } + release_metapath(&mp); if (ret) goto out; @@ -2518,3 +2519,26 @@ out: gfs2_trans_end(sdp); return error; } + +static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode, + loff_t offset) +{ + struct metapath mp = { .mp_aheight = 1, }; + int ret; + + if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(inode)))) + return -EIO; + + if (offset >= wpc->iomap.offset && + offset < wpc->iomap.offset + wpc->iomap.length) + return 0; + + memset(&wpc->iomap, 0, sizeof(wpc->iomap)); + ret = gfs2_iomap_get(inode, offset, INT_MAX, 0, &wpc->iomap, &mp); + release_metapath(&mp); + return ret; +} + +const struct iomap_writeback_ops gfs2_writeback_ops = { + .map_blocks = gfs2_map_blocks, +}; diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index b88fd45ab79f..aed4632d47d3 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -44,6 +44,7 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip, } extern const struct iomap_ops gfs2_iomap_ops; +extern const struct iomap_writeback_ops gfs2_writeback_ops; extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); extern int gfs2_block_map(struct inode *inode, sector_t lblock, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index f13b136654ca..5441c17562c5 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -270,7 +270,12 @@ static void __gfs2_glock_put(struct gfs2_glock *gl) gfs2_glock_remove_from_lru(gl); spin_unlock(&gl->gl_lockref.lock); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); - GLOCK_BUG_ON(gl, mapping && mapping->nrpages && !gfs2_withdrawn(sdp)); + if (mapping) { + truncate_inode_pages_final(mapping); + if (!gfs2_withdrawn(sdp)) + GLOCK_BUG_ON(gl, mapping->nrpages || + mapping->nrexceptional); + } trace_gfs2_glock_put(gl); sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); } @@ -453,9 +458,6 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state) else gl->gl_lockref.count--; } - if (held1 && held2 && list_empty(&gl->gl_holders)) - clear_bit(GLF_QUEUED, &gl->gl_flags); - if (new_state != gl->gl_target) /* shorten our minimum hold time */ gl->gl_hold_time = max(gl->gl_hold_time - GL_GLOCK_HOLD_DECR, @@ -1049,7 +1051,8 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_object = NULL; gl->gl_hold_time = GL_GLOCK_DFT_HOLD; INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); - INIT_DELAYED_WORK(&gl->gl_delete, delete_work_func); + if (gl->gl_name.ln_type == LM_TYPE_IOPEN) + INIT_DELAYED_WORK(&gl->gl_delete, delete_work_func); mapping = gfs2_glock2aspace(gl); if (mapping) { @@ -1345,7 +1348,6 @@ fail: if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt)) insert_pt = &gh2->gh_list; } - set_bit(GLF_QUEUED, &gl->gl_flags); trace_gfs2_glock_queue(gh, 1); gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT); @@ -1646,16 +1648,15 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) unsigned long now = jiffies; gfs2_glock_hold(gl); + spin_lock(&gl->gl_lockref.lock); holdtime = gl->gl_tchange + gl->gl_hold_time; - if (test_bit(GLF_QUEUED, &gl->gl_flags) && + if (!list_empty(&gl->gl_holders) && gl->gl_name.ln_type == LM_TYPE_INODE) { if (time_before(now, holdtime)) delay = holdtime - now; if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) delay = gl->gl_hold_time; } - - spin_lock(&gl->gl_lockref.lock); handle_callback(gl, state, delay, true); __gfs2_glock_queue_work(gl, delay); spin_unlock(&gl->gl_lockref.lock); @@ -1842,10 +1843,9 @@ static struct shrinker glock_shrinker = { }; /** - * examine_bucket - Call a function for glock in a hash bucket + * glock_hash_walk - Call a function for glock in a hash bucket * @examiner: the function * @sdp: the filesystem - * @bucket: the bucket * * Note that the function can be called multiple times on the same * object. So the user must ensure that the function can cope with @@ -1901,9 +1901,11 @@ bool gfs2_delete_work_queued(const struct gfs2_glock *gl) static void flush_delete_work(struct gfs2_glock *gl) { - if (cancel_delayed_work(&gl->gl_delete)) { - queue_delayed_work(gfs2_delete_workqueue, - &gl->gl_delete, 0); + if (gl->gl_name.ln_type == LM_TYPE_IOPEN) { + if (cancel_delayed_work(&gl->gl_delete)) { + queue_delayed_work(gfs2_delete_workqueue, + &gl->gl_delete, 0); + } } gfs2_glock_queue_work(gl, 0); } @@ -2100,7 +2102,7 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) *p++ = 'I'; if (test_bit(GLF_FROZEN, gflags)) *p++ = 'F'; - if (test_bit(GLF_QUEUED, gflags)) + if (!list_empty(&gl->gl_holders)) *p++ = 'q'; if (test_bit(GLF_LRU, gflags)) *p++ = 'L'; @@ -2415,7 +2417,7 @@ static const struct seq_operations gfs2_glstats_seq_ops = { .show = gfs2_glstats_seq_show, }; -static const struct seq_operations gfs2_sbstats_seq_ops = { +static const struct seq_operations gfs2_sbstats_sops = { .start = gfs2_sbstats_seq_start, .next = gfs2_sbstats_seq_next, .stop = gfs2_sbstats_seq_stop, @@ -2468,16 +2470,6 @@ static int gfs2_glstats_open(struct inode *inode, struct file *file) return __gfs2_glocks_open(inode, file, &gfs2_glstats_seq_ops); } -static int gfs2_sbstats_open(struct inode *inode, struct file *file) -{ - int ret = seq_open(file, &gfs2_sbstats_seq_ops); - if (ret == 0) { - struct seq_file *seq = file->private_data; - seq->private = inode->i_private; /* sdp */ - } - return ret; -} - static const struct file_operations gfs2_glocks_fops = { .owner = THIS_MODULE, .open = gfs2_glocks_open, @@ -2494,13 +2486,7 @@ static const struct file_operations gfs2_glstats_fops = { .release = gfs2_glocks_release, }; -static const struct file_operations gfs2_sbstats_fops = { - .owner = THIS_MODULE, - .open = gfs2_sbstats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(gfs2_sbstats); void gfs2_create_debugfs_file(struct gfs2_sbd *sdp) { diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index de1d5f1d9ff8..aa3f5236befb 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -178,6 +178,9 @@ static int rgrp_go_sync(struct gfs2_glock *gl) struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct address_space *mapping = &sdp->sd_aspace; struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); + const unsigned bsize = sdp->sd_sb.sb_bsize; + loff_t start = (rgd->rd_addr * bsize) & PAGE_MASK; + loff_t end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1; int error; if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) @@ -186,18 +189,13 @@ static int rgrp_go_sync(struct gfs2_glock *gl) gfs2_log_flush(sdp, gl, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_RGRP_GO_SYNC); - filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end); - error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end); - WARN_ON_ONCE(error); + filemap_fdatawrite_range(mapping, start, end); + error = filemap_fdatawait_range(mapping, start, end); + WARN_ON_ONCE(error && !gfs2_withdrawn(sdp)); mapping_set_error(mapping, error); if (!error) error = gfs2_ail_empty_gl(gl); - - spin_lock(&gl->gl_lockref.lock); - rgd = gl->gl_object; - if (rgd) - gfs2_free_clones(rgd); - spin_unlock(&gl->gl_lockref.lock); + gfs2_free_clones(rgd); return error; } @@ -216,15 +214,23 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct address_space *mapping = &sdp->sd_aspace; struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); + const unsigned bsize = sdp->sd_sb.sb_bsize; + loff_t start = (rgd->rd_addr * bsize) & PAGE_MASK; + loff_t end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1; - if (rgd) - gfs2_rgrp_brelse(rgd); - + gfs2_rgrp_brelse(rgd); WARN_ON_ONCE(!(flags & DIO_METADATA)); - truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end); + truncate_inode_pages_range(mapping, start, end); + rgd->rd_flags &= ~GFS2_RDF_UPTODATE; +} + +static void gfs2_rgrp_go_dump(struct seq_file *seq, struct gfs2_glock *gl, + const char *fs_id_buf) +{ + struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); if (rgd) - rgd->rd_flags &= ~GFS2_RDF_UPTODATE; + gfs2_rgrp_dump(seq, rgd, fs_id_buf); } static struct gfs2_inode *gfs2_glock2inode(struct gfs2_glock *gl) @@ -712,7 +718,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_sync = rgrp_go_sync, .go_inval = rgrp_go_inval, .go_lock = gfs2_rgrp_go_lock, - .go_dump = gfs2_rgrp_dump, + .go_dump = gfs2_rgrp_go_dump, .go_type = LM_TYPE_RGRP, .go_flags = GLOF_LVB, }; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index ca2ec02436ec..d7707307f4b1 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -41,6 +41,10 @@ struct gfs2_log_header_host { u32 lh_flags; /* GFS2_LOG_HEAD_... */ u32 lh_tail; /* Block number of log tail */ u32 lh_blkno; + + s64 lh_local_total; + s64 lh_local_free; + s64 lh_local_dinodes; }; /* @@ -340,7 +344,6 @@ enum { GLF_REPLY_PENDING = 9, GLF_INITIAL = 10, GLF_FROZEN = 11, - GLF_QUEUED = 12, GLF_LRU = 13, GLF_OBJECT = 14, /* Used only for tracing */ GLF_BLOCKING = 15, @@ -378,17 +381,10 @@ struct gfs2_glock { atomic_t gl_ail_count; atomic_t gl_revokes; struct delayed_work gl_work; - union { - /* For iopen glocks only */ - struct { - struct delayed_work gl_delete; - u64 gl_no_formal_ino; - }; - /* For rgrp glocks only */ - struct { - loff_t start; - loff_t end; - } gl_vm; + /* For iopen glocks only */ + struct { + struct delayed_work gl_delete; + u64 gl_no_formal_ino; }; struct rcu_head gl_rcu; struct rhash_head gl_node; @@ -701,10 +697,18 @@ struct gfs2_pcpu_lkstats { struct gfs2_lkstats lkstats[10]; }; +/* List of local (per node) statfs inodes */ +struct local_statfs_inode { + struct list_head si_list; + struct inode *si_sc_inode; + unsigned int si_jid; /* journal id this statfs inode corresponds to */ +}; + struct gfs2_sbd { struct super_block *sd_vfs; struct gfs2_pcpu_lkstats __percpu *sd_lkstats; struct kobject sd_kobj; + struct completion sd_kobj_unregister; unsigned long sd_flags; /* SDF_... */ struct gfs2_sb_host sd_sb; @@ -751,6 +755,7 @@ struct gfs2_sbd { struct inode *sd_jindex; struct inode *sd_statfs_inode; struct inode *sd_sc_inode; + struct list_head sd_sc_inodes_list; struct inode *sd_qc_inode; struct inode *sd_rindex; struct inode *sd_quota_inode; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 3763c9ff1406..9133b3178677 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -70,7 +70,7 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct) * */ -static void gfs2_remove_from_ail(struct gfs2_bufdata *bd) +void gfs2_remove_from_ail(struct gfs2_bufdata *bd) { bd->bd_tr = NULL; list_del_init(&bd->bd_ail_st_list); @@ -244,13 +244,15 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp) * @tr: the transaction * @max_revokes: If nonzero, issue revokes for the bd items for written buffers * + * returns: the transaction's count of remaining active items */ -static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr, +static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr, int *max_revokes) { struct gfs2_bufdata *bd, *s; struct buffer_head *bh; + int active_count = 0; list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list, bd_ail_st_list) { @@ -265,8 +267,10 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr, * If the ail buffer is not busy and caught an error, flag it * for others. */ - if (!sdp->sd_log_error && buffer_busy(bh)) + if (!sdp->sd_log_error && buffer_busy(bh)) { + active_count++; continue; + } if (!buffer_uptodate(bh) && !cmpxchg(&sdp->sd_log_error, 0, -EIO)) { gfs2_io_error_bh(sdp, bh); @@ -285,6 +289,7 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr, } list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list); } + return active_count; } /** @@ -303,8 +308,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes) spin_lock(&sdp->sd_ail_lock); list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) { - gfs2_ail1_empty_one(sdp, tr, &max_revokes); - if (list_empty(&tr->tr_ail1_list) && oldest_tr) + if (!gfs2_ail1_empty_one(sdp, tr, &max_revokes) && oldest_tr) list_move(&tr->tr_list, &sdp->sd_ail2_list); else oldest_tr = 0; @@ -716,16 +720,24 @@ void gfs2_write_revokes(struct gfs2_sbd *sdp) atomic_dec(&sdp->sd_log_blks_free); /* If no blocks have been reserved, we need to also * reserve a block for the header */ - if (!sdp->sd_log_blks_reserved) + if (!sdp->sd_log_blks_reserved) { atomic_dec(&sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, -2); + } else { + trace_gfs2_log_blocks(sdp, -1); + } } gfs2_ail1_empty(sdp, max_revokes); gfs2_log_unlock(sdp); if (!sdp->sd_log_num_revoke) { atomic_inc(&sdp->sd_log_blks_free); - if (!sdp->sd_log_blks_reserved) + if (!sdp->sd_log_blks_reserved) { atomic_inc(&sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, 2); + } else { + trace_gfs2_log_blocks(sdp, 1); + } } } @@ -902,7 +914,7 @@ static void empty_ail1_list(struct gfs2_sbd *sdp) } /** - * drain_bd - drain the buf and databuf queue for a failed transaction + * trans_drain - drain the buf and databuf queue for a failed transaction * @tr: the transaction to drain * * When this is called, we're taking an error exit for a log write that failed @@ -954,10 +966,8 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) goto out; /* Log might have been flushed while we waited for the flush lock */ - if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) { - up_write(&sdp->sd_log_flush_lock); - return; - } + if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) + goto out; trace_gfs2_log_flush(sdp, 1, flags); if (flags & GFS2_LOG_HEAD_FLUSH_SHUTDOWN) @@ -971,25 +981,25 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) if (unlikely (state == SFS_FROZEN)) if (gfs2_assert_withdraw_delayed(sdp, !tr->tr_num_buf_new && !tr->tr_num_databuf_new)) - goto out; + goto out_withdraw; } if (unlikely(state == SFS_FROZEN)) if (gfs2_assert_withdraw_delayed(sdp, !sdp->sd_log_num_revoke)) - goto out; + goto out_withdraw; if (gfs2_assert_withdraw_delayed(sdp, sdp->sd_log_num_revoke == sdp->sd_log_committed_revoke)) - goto out; + goto out_withdraw; gfs2_ordered_write(sdp); if (gfs2_withdrawn(sdp)) - goto out; + goto out_withdraw; lops_before_commit(sdp, tr); if (gfs2_withdrawn(sdp)) - goto out; + goto out_withdraw; gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE); if (gfs2_withdrawn(sdp)) - goto out; + goto out_withdraw; if (sdp->sd_log_head != sdp->sd_log_flush_head) { log_flush_wait(sdp); @@ -1000,7 +1010,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) log_write_header(sdp, flags); } if (gfs2_withdrawn(sdp)) - goto out; + goto out_withdraw; lops_after_commit(sdp, tr); gfs2_log_lock(sdp); @@ -1020,7 +1030,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) if (!sdp->sd_log_idle) { empty_ail1_list(sdp); if (gfs2_withdrawn(sdp)) - goto out; + goto out_withdraw; atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ trace_gfs2_log_blocks(sdp, -1); log_write_header(sdp, flags); @@ -1033,27 +1043,30 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) atomic_set(&sdp->sd_freeze_state, SFS_FROZEN); } -out: - if (gfs2_withdrawn(sdp)) { - trans_drain(tr); - /** - * If the tr_list is empty, we're withdrawing during a log - * flush that targets a transaction, but the transaction was - * never queued onto any of the ail lists. Here we add it to - * ail1 just so that ail_drain() will find and free it. - */ - spin_lock(&sdp->sd_ail_lock); - if (tr && list_empty(&tr->tr_list)) - list_add(&tr->tr_list, &sdp->sd_ail1_list); - spin_unlock(&sdp->sd_ail_lock); - ail_drain(sdp); /* frees all transactions */ - tr = NULL; - } - +out_end: trace_gfs2_log_flush(sdp, 0, flags); +out: up_write(&sdp->sd_log_flush_lock); - gfs2_trans_free(sdp, tr); + if (gfs2_withdrawing(sdp)) + gfs2_withdraw(sdp); + return; + +out_withdraw: + trans_drain(tr); + /** + * If the tr_list is empty, we're withdrawing during a log + * flush that targets a transaction, but the transaction was + * never queued onto any of the ail lists. Here we add it to + * ail1 just so that ail_drain() will find and free it. + */ + spin_lock(&sdp->sd_ail_lock); + if (tr && list_empty(&tr->tr_list)) + list_add(&tr->tr_list, &sdp->sd_ail1_list); + spin_unlock(&sdp->sd_ail_lock); + ail_drain(sdp); /* frees all transactions */ + tr = NULL; + goto out_end; } /** diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 8965c751a303..79f97290146e 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -63,7 +63,7 @@ static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) extern void gfs2_ordered_del_inode(struct gfs2_inode *ip); extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct); - +extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); extern void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index ed1da4323967..ed69298dd824 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -823,7 +823,7 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start, * */ -static void gfs2_meta_sync(struct gfs2_glock *gl) +void gfs2_meta_sync(struct gfs2_glock *gl) { struct address_space *mapping = gfs2_glock2aspace(gl); struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 9c5e4e491e03..4a3d8aecdf82 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -27,6 +27,7 @@ extern void gfs2_log_submit_bio(struct bio **biop, int opf); extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, bool keep_cache); +extern void gfs2_meta_sync(struct gfs2_glock *gl); static inline unsigned int buf_limit(struct gfs2_sbd *sdp) { diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 9856cc2e0795..2db573e31f78 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -348,38 +348,109 @@ void gfs2_remove_from_journal(struct buffer_head *bh, int meta) brelse(bh); } if (bd) { - spin_lock(&sdp->sd_ail_lock); if (bd->bd_tr) { gfs2_trans_add_revoke(sdp, bd); } else if (was_pinned) { bh->b_private = NULL; kmem_cache_free(gfs2_bufdata_cachep, bd); + } else if (!list_empty(&bd->bd_ail_st_list) && + !list_empty(&bd->bd_ail_gl_list)) { + gfs2_remove_from_ail(bd); } - spin_unlock(&sdp->sd_ail_lock); } clear_buffer_dirty(bh); clear_buffer_uptodate(bh); } /** - * gfs2_meta_wipe - make inode's buffers so they aren't dirty/pinned anymore + * gfs2_ail1_wipe - remove deleted/freed buffers from the ail1 list + * @sdp: superblock + * @bstart: starting block address of buffers to remove + * @blen: length of buffers to be removed + * + * This function is called from gfs2_journal wipe, whose job is to remove + * buffers, corresponding to deleted blocks, from the journal. If we find any + * bufdata elements on the system ail1 list, they haven't been written to + * the journal yet. So we remove them. + */ +static void gfs2_ail1_wipe(struct gfs2_sbd *sdp, u64 bstart, u32 blen) +{ + struct gfs2_trans *tr, *s; + struct gfs2_bufdata *bd, *bs; + struct buffer_head *bh; + u64 end = bstart + blen; + + gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry_safe(tr, s, &sdp->sd_ail1_list, tr_list) { + list_for_each_entry_safe(bd, bs, &tr->tr_ail1_list, + bd_ail_st_list) { + bh = bd->bd_bh; + if (bh->b_blocknr < bstart || bh->b_blocknr >= end) + continue; + + gfs2_remove_from_journal(bh, REMOVE_JDATA); + } + } + spin_unlock(&sdp->sd_ail_lock); + gfs2_log_unlock(sdp); +} + +static struct buffer_head *gfs2_getjdatabuf(struct gfs2_inode *ip, u64 blkno) +{ + struct address_space *mapping = ip->i_inode.i_mapping; + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct page *page; + struct buffer_head *bh; + unsigned int shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift; + unsigned long index = blkno >> shift; /* convert block to page */ + unsigned int bufnum = blkno - (index << shift); + + page = find_get_page_flags(mapping, index, FGP_LOCK|FGP_ACCESSED); + if (!page) + return NULL; + if (!page_has_buffers(page)) { + unlock_page(page); + put_page(page); + return NULL; + } + /* Locate header for our buffer within our page */ + for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page) + /* Do nothing */; + get_bh(bh); + unlock_page(page); + put_page(page); + return bh; +} + +/** + * gfs2_journal_wipe - make inode's buffers so they aren't dirty/pinned anymore * @ip: the inode who owns the buffers * @bstart: the first buffer in the run * @blen: the number of buffers in the run * */ -void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) +void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head *bh; + int ty; + gfs2_ail1_wipe(sdp, bstart, blen); while (blen) { + ty = REMOVE_META; bh = gfs2_getbuf(ip->i_gl, bstart, NO_CREATE); + if (!bh && gfs2_is_jdata(ip)) { + bh = gfs2_getjdatabuf(ip, bstart); + ty = REMOVE_JDATA; + } if (bh) { lock_buffer(bh); gfs2_log_lock(sdp); - gfs2_remove_from_journal(bh, REMOVE_META); + spin_lock(&sdp->sd_ail_lock); + gfs2_remove_from_journal(bh, ty); + spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); unlock_buffer(bh); brelse(bh); diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index eafb74e861c6..4a8c01929b79 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -60,7 +60,7 @@ enum { }; extern void gfs2_remove_from_journal(struct buffer_head *bh, int meta); -extern void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); extern int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, struct buffer_head **bhp); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 6d18d2c91add..7a7e3c10a9a9 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -110,6 +110,8 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) spin_lock_init(&sdp->sd_trunc_lock); spin_lock_init(&sdp->sd_bitmap_lock); + INIT_LIST_HEAD(&sdp->sd_sc_inodes_list); + mapping = &sdp->sd_aspace; address_space_init_once(mapping); @@ -169,15 +171,19 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent) return -EINVAL; } - /* If format numbers match exactly, we're done. */ - - if (sb->sb_fs_format == GFS2_FORMAT_FS && - sb->sb_multihost_format == GFS2_FORMAT_MULTI) - return 0; + if (sb->sb_fs_format != GFS2_FORMAT_FS || + sb->sb_multihost_format != GFS2_FORMAT_MULTI) { + fs_warn(sdp, "Unknown on-disk format, unable to mount\n"); + return -EINVAL; + } - fs_warn(sdp, "Unknown on-disk format, unable to mount\n"); + if (sb->sb_bsize < 512 || sb->sb_bsize > PAGE_SIZE || + (sb->sb_bsize & (sb->sb_bsize - 1))) { + pr_warn("Invalid superblock size\n"); + return -EINVAL; + } - return -EINVAL; + return 0; } static void end_bio_io_page(struct bio *bio) @@ -604,6 +610,90 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) return error; } +/** + * init_statfs - look up and initialize master and local (per node) statfs inodes + * @sdp: The GFS2 superblock + * + * This should be called after the jindex is initialized in init_journal() and + * before gfs2_journal_recovery() is called because we need to be able to write + * to these inodes during recovery. + * + * Returns: errno + */ +static int init_statfs(struct gfs2_sbd *sdp) +{ + int error = 0; + struct inode *master = d_inode(sdp->sd_master_dir); + struct inode *pn = NULL; + char buf[30]; + struct gfs2_jdesc *jd; + struct gfs2_inode *ip; + + sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs"); + if (IS_ERR(sdp->sd_statfs_inode)) { + error = PTR_ERR(sdp->sd_statfs_inode); + fs_err(sdp, "can't read in statfs inode: %d\n", error); + goto fail; + } + + pn = gfs2_lookup_simple(master, "per_node"); + if (IS_ERR(pn)) { + error = PTR_ERR(pn); + fs_err(sdp, "can't find per_node directory: %d\n", error); + goto put_statfs; + } + + /* For each jid, lookup the corresponding local statfs inode in the + * per_node metafs directory and save it in the sdp->sd_sc_inodes_list. */ + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + struct local_statfs_inode *lsi = + kmalloc(sizeof(struct local_statfs_inode), GFP_NOFS); + if (!lsi) { + error = -ENOMEM; + goto free_local; + } + sprintf(buf, "statfs_change%u", jd->jd_jid); + lsi->si_sc_inode = gfs2_lookup_simple(pn, buf); + if (IS_ERR(lsi->si_sc_inode)) { + error = PTR_ERR(lsi->si_sc_inode); + fs_err(sdp, "can't find local \"sc\" file#%u: %d\n", + jd->jd_jid, error); + goto free_local; + } + lsi->si_jid = jd->jd_jid; + if (jd->jd_jid == sdp->sd_jdesc->jd_jid) + sdp->sd_sc_inode = lsi->si_sc_inode; + + list_add_tail(&lsi->si_list, &sdp->sd_sc_inodes_list); + } + + iput(pn); + ip = GFS2_I(sdp->sd_sc_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, + &sdp->sd_sc_gh); + if (error) { + fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); + goto free_local; + } + return 0; + +free_local: + free_local_statfs_inodes(sdp); + iput(pn); +put_statfs: + iput(sdp->sd_statfs_inode); +fail: + return error; +} + +/* Uninitialize and free up memory used by the list of statfs inodes */ +static void uninit_statfs(struct gfs2_sbd *sdp) +{ + gfs2_glock_dq_uninit(&sdp->sd_sc_gh); + free_local_statfs_inodes(sdp); + iput(sdp->sd_statfs_inode); +} + static int init_journal(struct gfs2_sbd *sdp, int undo) { struct inode *master = d_inode(sdp->sd_master_dir); @@ -690,6 +780,11 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) } trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free)); + /* Lookup statfs inodes here so journal recovery can use them. */ + error = init_statfs(sdp); + if (error) + goto fail_jinode_gh; + if (sdp->sd_lockstruct.ls_first) { unsigned int x; for (x = 0; x < sdp->sd_journals; x++) { @@ -698,14 +793,14 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) if (sdp->sd_args.ar_spectator) { error = check_journal_clean(sdp, jd, true); if (error) - goto fail_jinode_gh; + goto fail_statfs; continue; } error = gfs2_recover_journal(jd, true); if (error) { fs_err(sdp, "error recovering journal %u: %d\n", x, error); - goto fail_jinode_gh; + goto fail_statfs; } } @@ -714,7 +809,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) error = gfs2_recover_journal(sdp->sd_jdesc, true); if (error) { fs_err(sdp, "error recovering my journal: %d\n", error); - goto fail_jinode_gh; + goto fail_statfs; } } @@ -725,6 +820,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) INIT_WORK(&sdp->sd_freeze_work, gfs2_freeze_func); return 0; +fail_statfs: + uninit_statfs(sdp); fail_jinode_gh: /* A withdraw may have done dq/uninit so now we need to check it */ if (!sdp->sd_args.ar_spectator && @@ -758,20 +855,12 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) if (error) goto fail; - /* Read in the master statfs inode */ - sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs"); - if (IS_ERR(sdp->sd_statfs_inode)) { - error = PTR_ERR(sdp->sd_statfs_inode); - fs_err(sdp, "can't read in statfs inode: %d\n", error); - goto fail_journal; - } - /* Read in the resource index inode */ sdp->sd_rindex = gfs2_lookup_simple(master, "rindex"); if (IS_ERR(sdp->sd_rindex)) { error = PTR_ERR(sdp->sd_rindex); fs_err(sdp, "can't get resource index inode: %d\n", error); - goto fail_statfs; + goto fail_journal; } sdp->sd_rindex_uptodate = 0; @@ -800,8 +889,6 @@ fail_qinode: fail_rindex: gfs2_clear_rgrpd(sdp); iput(sdp->sd_rindex); -fail_statfs: - iput(sdp->sd_statfs_inode); fail_journal: init_journal(sdp, UNDO); fail: @@ -829,14 +916,6 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) return error; } - sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid); - sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf); - if (IS_ERR(sdp->sd_sc_inode)) { - error = PTR_ERR(sdp->sd_sc_inode); - fs_err(sdp, "can't find local \"sc\" file: %d\n", error); - goto fail; - } - sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid); sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf); if (IS_ERR(sdp->sd_qc_inode)) { @@ -848,33 +927,21 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) iput(pn); pn = NULL; - ip = GFS2_I(sdp->sd_sc_inode); - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, - &sdp->sd_sc_gh); - if (error) { - fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); - goto fail_qc_i; - } - ip = GFS2_I(sdp->sd_qc_inode); error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &sdp->sd_qc_gh); if (error) { fs_err(sdp, "can't lock local \"qc\" file: %d\n", error); - goto fail_ut_gh; + goto fail_qc_i; } return 0; fail_qc_gh: gfs2_glock_dq_uninit(&sdp->sd_qc_gh); -fail_ut_gh: - gfs2_glock_dq_uninit(&sdp->sd_sc_gh); fail_qc_i: iput(sdp->sd_qc_inode); fail_ut_i: - iput(sdp->sd_sc_inode); -fail: iput(pn); return error; } @@ -1062,26 +1129,14 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) } error = init_names(sdp, silent); - if (error) { - /* In this case, we haven't initialized sysfs, so we have to - manually free the sdp. */ - free_sbd(sdp); - sb->s_fs_info = NULL; - return error; - } + if (error) + goto fail_free; snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s", sdp->sd_table_name); error = gfs2_sys_fs_add(sdp); - /* - * If we hit an error here, gfs2_sys_fs_add will have called function - * kobject_put which causes the sysfs usage count to go to zero, which - * causes sysfs to call function gfs2_sbd_release, which frees sdp. - * Subsequent error paths here will call gfs2_sys_fs_del, which also - * kobject_put to free sdp. - */ if (error) - return error; + goto fail_free; gfs2_create_debugfs_file(sdp); @@ -1179,9 +1234,9 @@ fail_lm: gfs2_lm_unmount(sdp); fail_debug: gfs2_delete_debugfs_file(sdp); - /* gfs2_sys_fs_del must be the last thing we do, since it causes - * sysfs to call function gfs2_sbd_release, which frees sdp. */ gfs2_sys_fs_del(sdp); +fail_free: + free_sbd(sdp); sb->s_fs_info = NULL; return error; } diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 390ea79d682c..b5cbe21efdfb 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -144,6 +144,10 @@ int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh, head->lh_tail = be32_to_cpu(lh->lh_tail); head->lh_blkno = be32_to_cpu(lh->lh_blkno); + head->lh_local_total = be64_to_cpu(lh->lh_local_total); + head->lh_local_free = be64_to_cpu(lh->lh_local_free); + head->lh_local_dinodes = be64_to_cpu(lh->lh_local_dinodes); + return 0; } /** @@ -292,6 +296,109 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, sdp->sd_lockstruct.ls_ops->lm_recovery_result(sdp, jid, message); } +/** + * update_statfs_inode - Update the master statfs inode or zero out the local + * statfs inode for a given journal. + * @jd: The journal + * @head: If NULL, @inode is the local statfs inode and we need to zero it out. + * Otherwise, it @head contains the statfs change info that needs to be + * synced to the master statfs inode (pointed to by @inode). + * @inode: statfs inode to update. + */ +static int update_statfs_inode(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, + struct inode *inode) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_inode *ip; + struct buffer_head *bh; + struct gfs2_statfs_change_host sc; + int error = 0; + + BUG_ON(!inode); + ip = GFS2_I(inode); + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + goto out; + + spin_lock(&sdp->sd_statfs_spin); + + if (head) { /* Update the master statfs inode */ + gfs2_statfs_change_in(&sc, bh->b_data + sizeof(struct gfs2_dinode)); + sc.sc_total += head->lh_local_total; + sc.sc_free += head->lh_local_free; + sc.sc_dinodes += head->lh_local_dinodes; + gfs2_statfs_change_out(&sc, bh->b_data + sizeof(struct gfs2_dinode)); + + fs_info(sdp, "jid=%u: Updated master statfs Total:%lld, " + "Free:%lld, Dinodes:%lld after change " + "[%+lld,%+lld,%+lld]\n", jd->jd_jid, sc.sc_total, + sc.sc_free, sc.sc_dinodes, head->lh_local_total, + head->lh_local_free, head->lh_local_dinodes); + } else { /* Zero out the local statfs inode */ + memset(bh->b_data + sizeof(struct gfs2_dinode), 0, + sizeof(struct gfs2_statfs_change)); + /* If it's our own journal, reset any in-memory changes too */ + if (jd->jd_jid == sdp->sd_lockstruct.ls_jid) { + memset(&sdp->sd_statfs_local, 0, + sizeof(struct gfs2_statfs_change_host)); + } + } + spin_unlock(&sdp->sd_statfs_spin); + + mark_buffer_dirty(bh); + brelse(bh); + gfs2_meta_sync(ip->i_gl); + +out: + return error; +} + +/** + * recover_local_statfs - Update the master and local statfs changes for this + * journal. + * + * Previously, statfs updates would be read in from the local statfs inode and + * synced to the master statfs inode during recovery. + * + * We now use the statfs updates in the journal head to update the master statfs + * inode instead of reading in from the local statfs inode. To preserve backward + * compatibility with kernels that can't do this, we still need to keep the + * local statfs inode up to date by writing changes to it. At some point in the + * future, we can do away with the local statfs inodes altogether and keep the + * statfs changes solely in the journal. + * + * @jd: the journal + * @head: the journal head + * + * Returns: errno + */ +static void recover_local_statfs(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head) +{ + int error; + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (!head->lh_local_total && !head->lh_local_free + && !head->lh_local_dinodes) /* No change */ + goto zero_local; + + /* First update the master statfs inode with the changes we + * found in the journal. */ + error = update_statfs_inode(jd, head, sdp->sd_statfs_inode); + if (error) + goto out; + +zero_local: + /* Zero out the local statfs inode so any changes in there + * are not re-recovered. */ + error = update_statfs_inode(jd, NULL, + find_local_statfs_inode(sdp, jd->jd_jid)); +out: + return; +} + void gfs2_recover_func(struct work_struct *work) { struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); @@ -411,6 +518,7 @@ void gfs2_recover_func(struct work_struct *work) goto fail_gunlock_thaw; } + recover_local_statfs(jd, &head); clean_journal(jd, &head); up_read(&sdp->sd_log_flush_lock); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 074f228ea839..ee491bb9c1cc 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -878,7 +878,6 @@ static int rgd_insert(struct gfs2_rgrpd *rgd) static int read_rindex_entry(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - const unsigned bsize = sdp->sd_sb.sb_bsize; loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); struct gfs2_rindex buf; int error; @@ -924,9 +923,6 @@ static int read_rindex_entry(struct gfs2_inode *ip) spin_unlock(&sdp->sd_rindex_spin); if (!error) { glock_set_object(rgd->rd_gl, rgd); - rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK; - rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr + - rgd->rd_length) * bsize) - 1; return 0; } @@ -2209,20 +2205,17 @@ static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd, /** * gfs2_rgrp_dump - print out an rgrp * @seq: The iterator - * @gl: The glock in question + * @rgd: The rgrp in question * @fs_id_buf: pointer to file system id (if requested) * */ -void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_glock *gl, +void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd, const char *fs_id_buf) { - struct gfs2_rgrpd *rgd = gl->gl_object; struct gfs2_blkreserv *trs; const struct rb_node *n; - if (rgd == NULL) - return; gfs2_print_dbg(seq, "%s R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n", fs_id_buf, (unsigned long long)rgd->rd_addr, rgd->rd_flags, @@ -2253,7 +2246,7 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) (unsigned long long)rgd->rd_addr); fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n"); sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname); - gfs2_rgrp_dump(NULL, rgd->rd_gl, fs_id_buf); + gfs2_rgrp_dump(NULL, rgd, fs_id_buf); rgd->rd_flags |= GFS2_RDF_ERROR; } @@ -2445,8 +2438,8 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); /* Directories keep their data in the metadata address space */ - if (meta || ip->i_depth) - gfs2_meta_wipe(ip, bstart, blen); + if (meta || ip->i_depth || gfs2_is_jdata(ip)) + gfs2_journal_wipe(ip, bstart, blen); } /** @@ -2502,7 +2495,7 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) gfs2_statfs_change(sdp, 0, +1, -1); trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE); gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid); - gfs2_meta_wipe(ip, ip->i_no_addr, 1); + gfs2_journal_wipe(ip, ip->i_no_addr, 1); } /** diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index a1d7e14fc55b..9a587ada51ed 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -67,7 +67,7 @@ extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist); extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); -extern void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_glock *gl, +extern void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd, const char *fs_id_buf); extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, struct buffer_head *bh, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 9f4d9e7be839..b285192bd6b3 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -44,6 +44,12 @@ #include "xattr.h" #include "lops.h" +enum dinode_demise { + SHOULD_DELETE_DINODE, + SHOULD_NOT_DELETE_DINODE, + SHOULD_DEFER_EVICTION, +}; + /** * gfs2_jindex_free - Clear all the journal index information * @sdp: The GFS2 superblock @@ -224,7 +230,7 @@ void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) sc->sc_dinodes = be64_to_cpu(str->sc_dinodes); } -static void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf) +void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf) { struct gfs2_statfs_change *str = buf; @@ -702,6 +708,8 @@ restart: if (error) gfs2_io_error(sdp); } + WARN_ON(gfs2_withdrawing(sdp)); + /* At this point, we're through modifying the disk */ /* Release stuff */ @@ -721,7 +729,7 @@ restart: gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); gfs2_glock_dq_uninit(&sdp->sd_sc_gh); gfs2_glock_dq_uninit(&sdp->sd_qc_gh); - iput(sdp->sd_sc_inode); + free_local_statfs_inodes(sdp); iput(sdp->sd_qc_inode); } @@ -736,6 +744,7 @@ restart: /* At this point, we're through participating in the lockspace */ gfs2_sys_fs_del(sdp); + free_sbd(sdp); } /** @@ -1309,109 +1318,98 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode) } /** - * gfs2_evict_inode - Remove an inode from cache + * evict_should_delete - determine whether the inode is eligible for deletion * @inode: The inode to evict * - * There are three cases to consider: - * 1. i_nlink == 0, we are final opener (and must deallocate) - * 2. i_nlink == 0, we are not the final opener (and cannot deallocate) - * 3. i_nlink > 0 - * - * If the fs is read only, then we have to treat all cases as per #3 - * since we are unable to do any deallocation. The inode will be - * deallocated by the next read/write node to attempt an allocation - * in the same resource group + * This function determines whether the evicted inode is eligible to be deleted + * and locks the inode glock. * - * We have to (at the moment) hold the inodes main lock to cover - * the gap between unlocking the shared lock on the iopen lock and - * taking the exclusive lock. I'd rather do a shared -> exclusive - * conversion on the iopen lock, but we can change that later. This - * is safe, just less efficient. + * Returns: the fate of the dinode */ - -static void gfs2_evict_inode(struct inode *inode) +static enum dinode_demise evict_should_delete(struct inode *inode, + struct gfs2_holder *gh) { + struct gfs2_inode *ip = GFS2_I(inode); struct super_block *sb = inode->i_sb; struct gfs2_sbd *sdp = sb->s_fs_info; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder gh; - struct address_space *metamapping; - int error; - - if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) { - clear_inode(inode); - return; - } - - if (inode->i_nlink || sb_rdonly(sb)) - goto out; + int ret; if (test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) { BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl)); - gfs2_holder_mark_uninitialized(&gh); - goto out_delete; + goto should_delete; } if (test_bit(GIF_DEFERRED_DELETE, &ip->i_flags)) - goto out; + return SHOULD_DEFER_EVICTION; /* Deletes should never happen under memory pressure anymore. */ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) - goto out; + return SHOULD_DEFER_EVICTION; /* Must not read inode block until block type has been verified */ - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); - if (unlikely(error)) { + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, gh); + if (unlikely(ret)) { glock_clear_object(ip->i_iopen_gh.gh_gl, ip); ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&ip->i_iopen_gh); - goto out; + return SHOULD_DEFER_EVICTION; } if (gfs2_inode_already_deleted(ip->i_gl, ip->i_no_formal_ino)) - goto out_truncate; - error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); - if (error) - goto out_truncate; + return SHOULD_NOT_DELETE_DINODE; + ret = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); + if (ret) + return SHOULD_NOT_DELETE_DINODE; if (test_bit(GIF_INVALID, &ip->i_flags)) { - error = gfs2_inode_refresh(ip); - if (error) - goto out_truncate; + ret = gfs2_inode_refresh(ip); + if (ret) + return SHOULD_NOT_DELETE_DINODE; } /* * The inode may have been recreated in the meantime. */ if (inode->i_nlink) - goto out_truncate; + return SHOULD_NOT_DELETE_DINODE; -out_delete: +should_delete: if (gfs2_holder_initialized(&ip->i_iopen_gh) && test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { if (!gfs2_upgrade_iopen_glock(inode)) { gfs2_holder_uninit(&ip->i_iopen_gh); - goto out_truncate; + return SHOULD_NOT_DELETE_DINODE; } } + return SHOULD_DELETE_DINODE; +} + +/** + * evict_unlinked_inode - delete the pieces of an unlinked evicted inode + * @inode: The inode to evict + */ +static int evict_unlinked_inode(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + int ret; if (S_ISDIR(inode->i_mode) && (ip->i_diskflags & GFS2_DIF_EXHASH)) { - error = gfs2_dir_exhash_dealloc(ip); - if (error) - goto out_unlock; + ret = gfs2_dir_exhash_dealloc(ip); + if (ret) + goto out; } if (ip->i_eattr) { - error = gfs2_ea_dealloc(ip); - if (error) - goto out_unlock; + ret = gfs2_ea_dealloc(ip); + if (ret) + goto out; } if (!gfs2_is_stuffed(ip)) { - error = gfs2_file_dealloc(ip); - if (error) - goto out_unlock; + ret = gfs2_file_dealloc(ip); + if (ret) + goto out; } /* We're about to clear the bitmap for the dinode, but as soon as we @@ -1419,11 +1417,24 @@ out_delete: location and try to set gl_object again. We clear gl_object here so that subsequent inode creates don't see an old gl_object. */ glock_clear_object(ip->i_gl, ip); - error = gfs2_dinode_dealloc(ip); + ret = gfs2_dinode_dealloc(ip); gfs2_inode_remember_delete(ip->i_gl, ip->i_no_formal_ino); - goto out_unlock; +out: + return ret; +} + +/* + * evict_linked_inode - evict an inode whose dinode has not been unlinked + * @inode: The inode to evict + */ +static int evict_linked_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_inode *ip = GFS2_I(inode); + struct address_space *metamapping; + int ret; -out_truncate: gfs2_log_flush(sdp, ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_EVICT_INODE); metamapping = gfs2_glock2aspace(ip->i_gl); @@ -1434,15 +1445,63 @@ out_truncate: write_inode_now(inode, 1); gfs2_ail_flush(ip->i_gl, 0); - error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); - if (error) - goto out_unlock; + ret = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); + if (ret) + return ret; + /* Needs to be done before glock release & also in a transaction */ truncate_inode_pages(&inode->i_data, 0); truncate_inode_pages(metamapping, 0); gfs2_trans_end(sdp); + return 0; +} + +/** + * gfs2_evict_inode - Remove an inode from cache + * @inode: The inode to evict + * + * There are three cases to consider: + * 1. i_nlink == 0, we are final opener (and must deallocate) + * 2. i_nlink == 0, we are not the final opener (and cannot deallocate) + * 3. i_nlink > 0 + * + * If the fs is read only, then we have to treat all cases as per #3 + * since we are unable to do any deallocation. The inode will be + * deallocated by the next read/write node to attempt an allocation + * in the same resource group + * + * We have to (at the moment) hold the inodes main lock to cover + * the gap between unlocking the shared lock on the iopen lock and + * taking the exclusive lock. I'd rather do a shared -> exclusive + * conversion on the iopen lock, but we can change that later. This + * is safe, just less efficient. + */ + +static void gfs2_evict_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) { + clear_inode(inode); + return; + } + + if (inode->i_nlink || sb_rdonly(sb)) + goto out; + + gfs2_holder_mark_uninitialized(&gh); + ret = evict_should_delete(inode, &gh); + if (ret == SHOULD_DEFER_EVICTION) + goto out; + if (ret == SHOULD_DELETE_DINODE) + ret = evict_unlinked_inode(inode); + else + ret = evict_linked_inode(inode); -out_unlock: if (gfs2_rs_active(&ip->i_res)) gfs2_rs_deltree(&ip->i_res); @@ -1450,8 +1509,8 @@ out_unlock: glock_clear_object(ip->i_gl, ip); gfs2_glock_dq_uninit(&gh); } - if (error && error != GLR_TRYFAILED && error != -EROFS) - fs_warn(sdp, "gfs2_evict_inode: %d\n", error); + if (ret && ret != GLR_TRYFAILED && ret != -EROFS) + fs_warn(sdp, "gfs2_evict_inode: %d\n", ret); out: truncate_inode_pages_final(&inode->i_data); if (ip->i_qadata) @@ -1502,6 +1561,35 @@ static void gfs2_free_inode(struct inode *inode) kmem_cache_free(gfs2_inode_cachep, GFS2_I(inode)); } +extern void free_local_statfs_inodes(struct gfs2_sbd *sdp) +{ + struct local_statfs_inode *lsi, *safe; + + /* Run through the statfs inodes list to iput and free memory */ + list_for_each_entry_safe(lsi, safe, &sdp->sd_sc_inodes_list, si_list) { + if (lsi->si_jid == sdp->sd_jdesc->jd_jid) + sdp->sd_sc_inode = NULL; /* belongs to this node */ + if (lsi->si_sc_inode) + iput(lsi->si_sc_inode); + list_del(&lsi->si_list); + kfree(lsi); + } +} + +extern struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp, + unsigned int index) +{ + struct local_statfs_inode *lsi; + + /* Return the local (per node) statfs inode in the + * sdp->sd_sc_inodes_list corresponding to the 'index'. */ + list_for_each_entry(lsi, &sdp->sd_sc_inodes_list, si_list) { + if (lsi->si_jid == index) + return lsi->si_sc_inode; + } + return NULL; +} + const struct super_operations gfs2_super_ops = { .alloc_inode = gfs2_alloc_inode, .free_inode = gfs2_free_inode, diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 51900554ed81..c9fb2a654181 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -37,11 +37,16 @@ extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, s64 dinodes); extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf); +extern void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, + void *buf); extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, struct buffer_head *l_bh); extern int gfs2_statfs_sync(struct super_block *sb, int type); extern void gfs2_freeze_func(struct work_struct *work); +extern void free_local_statfs_inodes(struct gfs2_sbd *sdp); +extern struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp, + unsigned int index); extern void free_sbd(struct gfs2_sbd *sdp); extern struct file_system_type gfs2_fs_type; diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index d28c41bd69b0..c3e72dba7418 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -303,7 +303,7 @@ static void gfs2_sbd_release(struct kobject *kobj) { struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); - free_sbd(sdp); + complete(&sdp->sd_kobj_unregister); } static struct kobj_type gfs2_ktype = { @@ -655,6 +655,7 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) sprintf(ro, "RDONLY=%d", sb_rdonly(sb)); sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0); + init_completion(&sdp->sd_kobj_unregister); sdp->sd_kobj.kset = gfs2_kset; error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL, "%s", sdp->sd_table_name); @@ -685,6 +686,7 @@ fail_tune: fail_reg: fs_err(sdp, "error %d adding sysfs files\n", error); kobject_put(&sdp->sd_kobj); + wait_for_completion(&sdp->sd_kobj_unregister); sb->s_fs_info = NULL; return error; } @@ -695,6 +697,7 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp) sysfs_remove_group(&sdp->sd_kobj, &tune_group); sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); kobject_put(&sdp->sd_kobj); + wait_for_completion(&sdp->sd_kobj_unregister); } static int gfs2_uevent(struct kset *kset, struct kobject *kobj, diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index e0025258107a..0b2f858d9a8c 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -56,7 +56,6 @@ {(1UL << GLF_REPLY_PENDING), "r" }, \ {(1UL << GLF_INITIAL), "I" }, \ {(1UL << GLF_FROZEN), "F" }, \ - {(1UL << GLF_QUEUED), "q" }, \ {(1UL << GLF_LRU), "L" }, \ {(1UL << GLF_OBJECT), "o" }, \ {(1UL << GLF_BLOCKING), "b" }) @@ -388,15 +387,17 @@ TRACE_EVENT(gfs2_log_blocks, TP_STRUCT__entry( __field( dev_t, dev ) __field( int, blocks ) + __field( int, blks_free ) ), TP_fast_assign( __entry->dev = sdp->sd_vfs->s_dev; __entry->blocks = blocks; + __entry->blks_free = atomic_read(&sdp->sd_log_blks_free); ), - TP_printk("%u,%u log reserve %d", MAJOR(__entry->dev), - MINOR(__entry->dev), __entry->blocks) + TP_printk("%u,%u log reserve %d %d", MAJOR(__entry->dev), + MINOR(__entry->dev), __entry->blocks, __entry->blks_free) ); /* Writing back the AIL */ diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 1cd0328cae20..0fba3bf64189 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -419,7 +419,7 @@ void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, char fs_id_buf[sizeof(sdp->sd_fsname) + 7]; sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname); - gfs2_rgrp_dump(NULL, rgd->rd_gl, fs_id_buf); + gfs2_rgrp_dump(NULL, rgd, fs_id_buf); gfs2_lm(sdp, "fatal: filesystem consistency error\n" " RG = %llu\n" diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 6d9157efe16c..d7562981b3a0 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -205,6 +205,16 @@ static inline bool gfs2_withdrawn(struct gfs2_sbd *sdp) test_bit(SDF_WITHDRAWING, &sdp->sd_flags); } +/** + * gfs2_withdrawing - check if a withdraw is pending + * @sdp: the superblock + */ +static inline bool gfs2_withdrawing(struct gfs2_sbd *sdp) +{ + return test_bit(SDF_WITHDRAWING, &sdp->sd_flags) && + !test_bit(SDF_WITHDRAWN, &sdp->sd_flags); +} + #define gfs2_tune_get(sdp, field) \ gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) diff --git a/fs/hfs/super.c b/fs/hfs/super.c index c33324686d89..44d07c9e3a7f 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -104,8 +104,7 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = buf->f_bfree; buf->f_files = HFS_SB(sb)->fs_ablocks; buf->f_ffree = HFS_SB(sb)->free_ablocks; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); buf->f_namelen = HFS_NAMELEN; return 0; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 129dca3f4b78..807119ae5adf 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -320,8 +320,7 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = buf->f_bfree; buf->f_files = 0xFFFFFFFF; buf->f_ffree = 0xFFFFFFFF - sbi->next_cnid; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); buf->f_namelen = HFSPLUS_MAX_STRLEN; return 0; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 0a677a9aaf34..a7dbfc892022 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -192,8 +192,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = sbi->sb_n_free; buf->f_files = sbi->sb_dirband_size / 4; buf->f_ffree = hpfs_get_free_dnodes(s); - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); buf->f_namelen = 254; hpfs_unlock(s); diff --git a/fs/io-wq.c b/fs/io-wq.c index 7cb3b4cb9b11..02894df7656d 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -19,7 +19,9 @@ #include <linux/task_work.h> #include <linux/blk-cgroup.h> #include <linux/audit.h> +#include <linux/cpu.h> +#include "../kernel/sched/sched.h" #include "io-wq.h" #define WORKER_IDLE_TIMEOUT (5 * HZ) @@ -123,9 +125,13 @@ struct io_wq { refcount_t refs; struct completion done; + struct hlist_node cpuhp_node; + refcount_t use_refs; }; +static enum cpuhp_state io_wq_online; + static bool io_worker_get(struct io_worker *worker) { return refcount_inc_not_zero(&worker->ref); @@ -187,7 +193,8 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) worker->blkcg_css = NULL; } #endif - + if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY) + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; return dropped_lock; } @@ -483,7 +490,10 @@ static void io_impersonate_work(struct io_worker *worker, if ((work->flags & IO_WQ_WORK_CREDS) && worker->cur_creds != work->identity->creds) io_wq_switch_creds(worker, work); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize; + if (work->flags & IO_WQ_WORK_FSIZE) + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize; + else if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY) + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; io_wq_switch_blkcg(worker, work); #ifdef CONFIG_AUDIT current->loginuid = work->identity->loginuid; @@ -1087,10 +1097,12 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) return ERR_PTR(-ENOMEM); wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL); - if (!wq->wqes) { - kfree(wq); - return ERR_PTR(-ENOMEM); - } + if (!wq->wqes) + goto err_wq; + + ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node); + if (ret) + goto err_wqes; wq->free_work = data->free_work; wq->do_work = data->do_work; @@ -1098,6 +1110,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) /* caller must already hold a reference to this */ wq->user = data->user; + ret = -ENOMEM; for_each_node(node) { struct io_wqe *wqe; int alloc_node = node; @@ -1141,9 +1154,12 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) ret = PTR_ERR(wq->manager); complete(&wq->done); err: + cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); for_each_node(node) kfree(wq->wqes[node]); +err_wqes: kfree(wq->wqes); +err_wq: kfree(wq); return ERR_PTR(ret); } @@ -1160,6 +1176,8 @@ static void __io_wq_destroy(struct io_wq *wq) { int node; + cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); + set_bit(IO_WQ_BIT_EXIT, &wq->state); if (wq->manager) kthread_stop(wq->manager); @@ -1187,3 +1205,41 @@ struct task_struct *io_wq_get_task(struct io_wq *wq) { return wq->manager; } + +static bool io_wq_worker_affinity(struct io_worker *worker, void *data) +{ + struct task_struct *task = worker->task; + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(task, &rf); + do_set_cpus_allowed(task, cpumask_of_node(worker->wqe->node)); + task->flags |= PF_NO_SETAFFINITY; + task_rq_unlock(rq, task, &rf); + return false; +} + +static int io_wq_cpu_online(unsigned int cpu, struct hlist_node *node) +{ + struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node); + int i; + + rcu_read_lock(); + for_each_node(i) + io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, NULL); + rcu_read_unlock(); + return 0; +} + +static __init int io_wq_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "io-wq/online", + io_wq_cpu_online, NULL); + if (ret < 0) + return ret; + io_wq_online = ret; + return 0; +} +subsys_initcall(io_wq_init); diff --git a/fs/io-wq.h b/fs/io-wq.h index be21c500c925..cba36f03c355 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -17,6 +17,7 @@ enum { IO_WQ_WORK_MM = 128, IO_WQ_WORK_CREDS = 256, IO_WQ_WORK_BLKCG = 512, + IO_WQ_WORK_FSIZE = 1024, IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */ }; diff --git a/fs/io_uring.c b/fs/io_uring.c index 02dc81622081..b42dfa0243bf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -277,7 +277,7 @@ struct io_ring_ctx { unsigned sq_mask; unsigned sq_thread_idle; unsigned cached_sq_dropped; - atomic_t cached_cq_overflow; + unsigned cached_cq_overflow; unsigned long sq_check_overflow; struct list_head defer_list; @@ -585,6 +585,7 @@ enum { REQ_F_BUFFER_SELECTED_BIT, REQ_F_NO_FILE_TABLE_BIT, REQ_F_WORK_INITIALIZED_BIT, + REQ_F_LTIMEOUT_ACTIVE_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -614,7 +615,7 @@ enum { REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), /* must not punt to workers */ REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), - /* has linked timeout */ + /* has or had linked timeout */ REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), /* regular file */ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), @@ -628,6 +629,8 @@ enum { REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT), /* io_wq_work is initialized */ REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT), + /* linked timeout is active, i.e. prepared by link's head */ + REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT), }; struct async_poll { @@ -750,8 +753,6 @@ struct io_op_def { unsigned pollout : 1; /* op supports buffer selection */ unsigned buffer_select : 1; - /* needs rlimit(RLIMIT_FSIZE) assigned */ - unsigned needs_fsize : 1; /* must always have async data allocated */ unsigned needs_async_data : 1; /* size of async data needed, if any */ @@ -775,10 +776,10 @@ static const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .needs_fsize = 1, .needs_async_data = 1, .async_size = sizeof(struct io_async_rw), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | + IO_WQ_WORK_FSIZE, }, [IORING_OP_FSYNC] = { .needs_file = 1, @@ -789,16 +790,16 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollin = 1, .async_size = sizeof(struct io_async_rw), - .work_flags = IO_WQ_WORK_BLKCG, + .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM, }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .needs_fsize = 1, .async_size = sizeof(struct io_async_rw), - .work_flags = IO_WQ_WORK_BLKCG, + .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE | + IO_WQ_WORK_MM, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, @@ -856,8 +857,7 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_FALLOCATE] = { .needs_file = 1, - .needs_fsize = 1, - .work_flags = IO_WQ_WORK_BLKCG, + .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE, }, [IORING_OP_OPENAT] = { .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG | @@ -887,9 +887,9 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .needs_fsize = 1, .async_size = sizeof(struct io_async_rw), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | + IO_WQ_WORK_FSIZE, }, [IORING_OP_FADVISE] = { .needs_file = 1, @@ -1070,6 +1070,12 @@ static void io_init_identity(struct io_identity *id) refcount_set(&id->count, 1); } +static inline void __io_req_init_async(struct io_kiocb *req) +{ + memset(&req->work, 0, sizeof(req->work)); + req->flags |= REQ_F_WORK_INITIALIZED; +} + /* * Note: must call io_req_init_async() for the first time you * touch any members of io_wq_work. @@ -1081,8 +1087,7 @@ static inline void io_req_init_async(struct io_kiocb *req) if (req->flags & REQ_F_WORK_INITIALIZED) return; - memset(&req->work, 0, sizeof(req->work)); - req->flags |= REQ_F_WORK_INITIALIZED; + __io_req_init_async(req); /* Grab a ref if this isn't our static identity */ req->work.identity = tctx->identity; @@ -1174,7 +1179,7 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) struct io_ring_ctx *ctx = req->ctx; return seq != ctx->cached_cq_tail - + atomic_read(&ctx->cached_cq_overflow); + + READ_ONCE(ctx->cached_cq_overflow); } return false; @@ -1285,8 +1290,11 @@ static bool io_grab_identity(struct io_kiocb *req) struct io_identity *id = req->work.identity; struct io_ring_ctx *ctx = req->ctx; - if (def->needs_fsize && id->fsize != rlimit(RLIMIT_FSIZE)) - return false; + if (def->work_flags & IO_WQ_WORK_FSIZE) { + if (id->fsize != rlimit(RLIMIT_FSIZE)) + return false; + req->work.flags |= IO_WQ_WORK_FSIZE; + } if (!(req->work.flags & IO_WQ_WORK_FILES) && (def->work_flags & IO_WQ_WORK_FILES) && @@ -1619,8 +1627,9 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, WRITE_ONCE(cqe->res, req->result); WRITE_ONCE(cqe->flags, req->compl.cflags); } else { + ctx->cached_cq_overflow++; WRITE_ONCE(ctx->rings->cq_overflow, - atomic_inc_return(&ctx->cached_cq_overflow)); + ctx->cached_cq_overflow); } } @@ -1662,8 +1671,8 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) * then we cannot store the request for later flushing, we need * to drop it on the floor. */ - WRITE_ONCE(ctx->rings->cq_overflow, - atomic_inc_return(&ctx->cached_cq_overflow)); + ctx->cached_cq_overflow++; + WRITE_ONCE(ctx->rings->cq_overflow, ctx->cached_cq_overflow); } else { if (list_empty(&ctx->cq_overflow_list)) { set_bit(0, &ctx->sq_check_overflow); @@ -1865,6 +1874,12 @@ static bool __io_kill_linked_timeout(struct io_kiocb *req) link = list_first_entry(&req->link_list, struct io_kiocb, link_list); if (link->opcode != IORING_OP_LINK_TIMEOUT) return false; + /* + * Can happen if a linked timeout fired and link had been like + * req -> link t-out -> link t-out [-> ...] + */ + if (!(link->flags & REQ_F_LTIMEOUT_ACTIVE)) + return false; list_del_init(&link->link_list); wake_ev = io_link_cancel_timeout(link); @@ -1908,10 +1923,12 @@ static struct io_kiocb *io_req_link_next(struct io_kiocb *req) /* * Called if REQ_F_LINK_HEAD is set, and we fail the head request */ -static void __io_fail_links(struct io_kiocb *req) +static void io_fail_links(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; + spin_lock_irqsave(&ctx->completion_lock, flags); while (!list_empty(&req->link_list)) { struct io_kiocb *link = list_first_entry(&req->link_list, struct io_kiocb, link_list); @@ -1933,15 +1950,6 @@ static void __io_fail_links(struct io_kiocb *req) } io_commit_cqring(ctx); -} - -static void io_fail_links(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->completion_lock, flags); - __io_fail_links(req); spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); @@ -1976,7 +1984,8 @@ static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok) { struct task_struct *tsk = req->task; struct io_ring_ctx *ctx = req->ctx; - int ret, notify; + enum task_work_notify_mode notify; + int ret; if (tsk->flags & PF_EXITING) return -ESRCH; @@ -1987,7 +1996,7 @@ static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok) * processing task_work. There's no reliable way to tell if TWA_RESUME * will do the job. */ - notify = 0; + notify = TWA_NONE; if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok) notify = TWA_SIGNAL; @@ -2056,7 +2065,7 @@ static void io_req_task_queue(struct io_kiocb *req) init_task_work(&req->task_work, io_req_task_cancel); tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, 0); + task_work_add(tsk, &req->task_work, TWA_NONE); wake_up_process(tsk); } } @@ -2177,7 +2186,7 @@ static void io_free_req_deferred(struct io_kiocb *req) struct task_struct *tsk; tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, 0); + task_work_add(tsk, &req->task_work, TWA_NONE); wake_up_process(tsk); } } @@ -3108,9 +3117,10 @@ static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) * For files that don't have ->read_iter() and ->write_iter(), handle them * by looping over ->read() or ->write() manually. */ -static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, - struct iov_iter *iter) +static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) { + struct kiocb *kiocb = &req->rw.kiocb; + struct file *file = req->file; ssize_t ret = 0; /* @@ -3130,11 +3140,8 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, if (!iov_iter_is_bvec(iter)) { iovec = iov_iter_iovec(iter); } else { - /* fixed buffers import bvec */ - iovec.iov_base = kmap(iter->bvec->bv_page) - + iter->iov_offset; - iovec.iov_len = min(iter->count, - iter->bvec->bv_len - iter->iov_offset); + iovec.iov_base = u64_to_user_ptr(req->rw.addr); + iovec.iov_len = req->rw.len; } if (rw == READ) { @@ -3145,9 +3152,6 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, iovec.iov_len, io_kiocb_ppos(kiocb)); } - if (iov_iter_is_bvec(iter)) - kunmap(iter->bvec->bv_page); - if (nr < 0) { if (!ret) ret = nr; @@ -3156,6 +3160,8 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, ret += nr; if (nr != iovec.iov_len) break; + req->rw.len -= nr; + req->rw.addr += nr; iov_iter_advance(iter, nr); } @@ -3291,7 +3297,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, /* queue just for cancelation */ init_task_work(&req->task_work, io_req_task_cancel); tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, 0); + task_work_add(tsk, &req->task_work, TWA_NONE); wake_up_process(tsk); } return 1; @@ -3345,7 +3351,7 @@ static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) if (req->file->f_op->read_iter) return call_read_iter(req->file, &req->rw.kiocb, iter); else if (req->file->f_op->read) - return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter); + return loop_rw_iter(READ, req, iter); else return -EINVAL; } @@ -3536,7 +3542,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, if (req->file->f_op->write_iter) ret2 = call_write_iter(req->file, kiocb, iter); else if (req->file->f_op->write) - ret2 = loop_rw_iter(WRITE, req->file, kiocb, iter); + ret2 = loop_rw_iter(WRITE, req, iter); else ret2 = -EINVAL; @@ -4857,7 +4863,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, WRITE_ONCE(poll->canceled, true); tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, 0); + task_work_add(tsk, &req->task_work, TWA_NONE); wake_up_process(tsk); } return 1; @@ -4926,32 +4932,25 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) io_commit_cqring(ctx); } -static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt) +static void io_poll_task_func(struct callback_head *cb) { + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *nxt; if (io_poll_rewait(req, &req->poll)) { spin_unlock_irq(&ctx->completion_lock); - return; - } - - hash_del(&req->hash_node); - io_poll_complete(req, req->result, 0); - spin_unlock_irq(&ctx->completion_lock); - - *nxt = io_put_req_find_next(req); - io_cqring_ev_posted(ctx); -} + } else { + hash_del(&req->hash_node); + io_poll_complete(req, req->result, 0); + spin_unlock_irq(&ctx->completion_lock); -static void io_poll_task_func(struct callback_head *cb) -{ - struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); - struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *nxt = NULL; + nxt = io_put_req_find_next(req); + io_cqring_ev_posted(ctx); + if (nxt) + __io_req_task_submit(nxt); + } - io_poll_task_handler(req, &nxt); - if (nxt) - __io_req_task_submit(nxt); percpu_ref_put(&ctx->refs); } @@ -5105,6 +5104,7 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req, struct io_ring_ctx *ctx = req->ctx; bool cancel = false; + INIT_HLIST_NODE(&req->hash_node); io_init_poll_iocb(poll, mask, wake_func); poll->file = req->file; poll->wait.private = req; @@ -5166,7 +5166,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req) req->flags |= REQ_F_POLLED; req->apoll = apoll; - INIT_HLIST_NODE(&req->hash_node); mask = 0; if (def->pollin) @@ -5348,8 +5347,6 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return -EINVAL; if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) return -EINVAL; - if (!poll->file) - return -EBADF; events = READ_ONCE(sqe->poll32_events); #ifdef __BIG_ENDIAN @@ -5367,7 +5364,6 @@ static int io_poll_add(struct io_kiocb *req) struct io_poll_table ipt; __poll_t mask; - INIT_HLIST_NODE(&req->hash_node); ipt.pt._qproc = io_poll_queue_proc; mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events, @@ -6117,10 +6113,9 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) if (!list_empty(&req->link_list)) { prev = list_entry(req->link_list.prev, struct io_kiocb, link_list); - if (refcount_inc_not_zero(&prev->refs)) { + if (refcount_inc_not_zero(&prev->refs)) list_del_init(&req->link_list); - prev->flags &= ~REQ_F_LINK_TIMEOUT; - } else + else prev = NULL; } @@ -6177,6 +6172,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT) return NULL; + nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; req->flags |= REQ_F_LINK_TIMEOUT; return nxt; } @@ -6191,7 +6187,8 @@ static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs) again: linked_timeout = io_prep_linked_timeout(req); - if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.identity->creds && + if ((req->flags & REQ_F_WORK_INITIALIZED) && + (req->work.flags & IO_WQ_WORK_CREDS) && req->work.identity->creds != current_cred()) { if (old_creds) revert_creds(old_creds); @@ -6199,7 +6196,6 @@ again: old_creds = NULL; /* restored original creds */ else old_creds = override_creds(req->work.identity->creds); - req->work.flags |= IO_WQ_WORK_CREDS; } ret = io_issue_sqe(req, true, cs); @@ -6240,8 +6236,10 @@ punt: if (nxt) { req = nxt; - if (req->flags & REQ_F_FORCE_ASYNC) + if (req->flags & REQ_F_FORCE_ASYNC) { + linked_timeout = NULL; goto punt; + } goto again; } exit: @@ -6504,12 +6502,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (id) { struct io_identity *iod; - io_req_init_async(req); iod = idr_find(&ctx->personality_idr, id); if (unlikely(!iod)) return -EINVAL; refcount_inc(&iod->count); - io_put_identity(current->io_uring, req); + + __io_req_init_async(req); get_cred(iod->creds); req->work.identity = iod; req->work.flags |= IO_WQ_WORK_CREDS; @@ -8685,19 +8683,11 @@ static void io_uring_del_task_file(struct file *file) fput(file); } -static void __io_uring_attempt_task_drop(struct file *file) -{ - struct file *old = xa_load(¤t->io_uring->xa, (unsigned long)file); - - if (old == file) - io_uring_del_task_file(file); -} - /* * Drop task note for this file if we're the only ones that hold it after * pending fput() */ -static void io_uring_attempt_task_drop(struct file *file, bool exiting) +static void io_uring_attempt_task_drop(struct file *file) { if (!current->io_uring) return; @@ -8705,10 +8695,9 @@ static void io_uring_attempt_task_drop(struct file *file, bool exiting) * fput() is pending, will be 2 if the only other ref is our potential * task file note. If the task is exiting, drop regardless of count. */ - if (!exiting && atomic_long_read(&file->f_count) != 2) - return; - - __io_uring_attempt_task_drop(file); + if (fatal_signal_pending(current) || (current->flags & PF_EXITING) || + atomic_long_read(&file->f_count) == 2) + io_uring_del_task_file(file); } void __io_uring_files_cancel(struct files_struct *files) @@ -8766,16 +8755,7 @@ void __io_uring_task_cancel(void) static int io_uring_flush(struct file *file, void *data) { - struct io_ring_ctx *ctx = file->private_data; - - /* - * If the task is going away, cancel work it may have pending - */ - if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) - data = NULL; - - io_uring_cancel_task_requests(ctx, data); - io_uring_attempt_task_drop(file, !data); + io_uring_attempt_task_drop(file); return 0; } diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 78f5c96c76f3..ec90773527ee 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1038,8 +1038,7 @@ static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = 0; buf->f_files = ISOFS_SB(sb)->s_ninodes; buf->f_ffree = 0; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); buf->f_namelen = NAME_MAX; return 0; } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 7b09a9158e40..34f546404aa1 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -383,8 +383,7 @@ static int minix_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = sbi->s_ninodes; buf->f_ffree = minix_count_free_inodes(sb); buf->f_namelen = sbi->s_namelen; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); return 0; } diff --git a/fs/namei.c b/fs/namei.c index f1eb8ccd2be9..d4a6dd772303 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1626,7 +1626,8 @@ static const char *pick_link(struct nameidata *nd, struct path *link, return ERR_PTR(error); } - if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS)) + if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) || + unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW)) return ERR_PTR(-ELOOP); if (!(nd->flags & LOOKUP_RCU)) { diff --git a/fs/namespace.c b/fs/namespace.c index 294e05a13d17..cebaa3e81794 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1191,7 +1191,7 @@ static void mntput_no_expire(struct mount *mnt) struct task_struct *task = current; if (likely(!(task->flags & PF_KTHREAD))) { init_task_work(&mnt->mnt_rcu, __cleanup_mnt); - if (!task_work_add(task, &mnt->mnt_rcu, true)) + if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME)) return; } if (llist_add(&mnt->mnt_llist, &delayed_mntput_list)) @@ -3171,6 +3171,8 @@ int path_mount(const char *dev_name, struct path *path, mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; + if (flags & MS_NOSYMFOLLOW) + mnt_flags |= MNT_NOSYMFOLLOW; /* The default atime for remount is preservation */ if ((flags & MS_REMOUNT) && diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 2eee5fb1a882..4abd928b0bc8 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -651,8 +651,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = nmaxinodes; buf->f_ffree = nfreeinodes; buf->f_namelen = NILFS_NAME_LEN; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); return 0; } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 7dc3bc604f78..0d7e948cb29c 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -2643,8 +2643,7 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs) * the least significant 32-bits in f_fsid[0] and the most significant * 32-bits in f_fsid[1]. */ - sfs->f_fsid.val[0] = vol->serial_no & 0xffffffff; - sfs->f_fsid.val[1] = (vol->serial_no >> 32) & 0xffffffff; + sfs->f_fsid = u64_to_fsid(vol->serial_no); /* Maximum length of filenames. */ sfs->f_namelen = NTFS_MAX_NAME_LEN; return 0; diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index b76ec6b88ded..ce93ccca8639 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -282,8 +282,7 @@ static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = sbi->s_num_blocks; buf->f_files = sbi->s_num_blocks; buf->f_namelen = OMFS_NAMELEN; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); buf->f_bfree = buf->f_bavail = buf->f_ffree = omfs_count_free(s); @@ -363,12 +362,11 @@ static int omfs_get_imap(struct super_block *sb) bh = sb_bread(sb, block++); if (!bh) goto nomem_free; - *ptr = kmalloc(sb->s_blocksize, GFP_KERNEL); + *ptr = kmemdup(bh->b_data, sb->s_blocksize, GFP_KERNEL); if (!*ptr) { brelse(bh); goto nomem_free; } - memcpy(*ptr, bh->b_data, sb->s_blocksize); if (count < sb->s_blocksize) memset((void *)*ptr + count, 0xff, sb->s_blocksize - count); diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 3059a9394c2d..e59d4bb3a89e 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -70,6 +70,7 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) { MNT_NOATIME, ",noatime" }, { MNT_NODIRATIME, ",nodiratime" }, { MNT_RELATIME, ",relatime" }, + { MNT_NOSYMFOLLOW, ",nosymfollow" }, { 0, NULL } }; const struct proc_fs_opts *fs_infop; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index e8da1cde87b9..3fb7fc819b4f 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -137,8 +137,7 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bfree = qnx4_count_free_blocks(sb); buf->f_bavail = buf->f_bfree; buf->f_namelen = QNX4_NAME_MAX; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); return 0; } diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 755293c8c71a..61191f7bdf62 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -166,8 +166,7 @@ static int qnx6_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_ffree = fs32_to_cpu(sbi, sbi->sb->sb_free_inodes); buf->f_bavail = buf->f_bfree; buf->f_namelen = QNX6_LONG_NAME_MAX; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); return 0; } diff --git a/fs/read_write.c b/fs/read_write.c index a669fb049b84..75f764b43418 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1411,6 +1411,59 @@ static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in, } /* + * Performs necessary checks before doing a file copy + * + * Can adjust amount of bytes to copy via @req_count argument. + * Returns appropriate error code that caller should return or + * zero in case the copy should be allowed. + */ +static int generic_copy_file_checks(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t *req_count, unsigned int flags) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + uint64_t count = *req_count; + loff_t size_in; + int ret; + + ret = generic_file_rw_checks(file_in, file_out); + if (ret) + return ret; + + /* Don't touch certain kinds of inodes */ + if (IS_IMMUTABLE(inode_out)) + return -EPERM; + + if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) + return -ETXTBSY; + + /* Ensure offsets don't wrap. */ + if (pos_in + count < pos_in || pos_out + count < pos_out) + return -EOVERFLOW; + + /* Shorten the copy to EOF */ + size_in = i_size_read(inode_in); + if (pos_in >= size_in) + count = 0; + else + count = min(count, size_in - (uint64_t)pos_in); + + ret = generic_write_check_limits(file_out, pos_out, &count); + if (ret) + return ret; + + /* Don't allow overlapped copying within the same file. */ + if (inode_in == inode_out && + pos_out + count > pos_in && + pos_out < pos_in + count) + return -EINVAL; + + *req_count = count; + return 0; +} + +/* * copy_file_range() differs from regular file read and write in that it * specifically allows return partial success. When it does so is up to * the copy_file_range method. @@ -1542,475 +1595,92 @@ out2: return ret; } -static int remap_verify_area(struct file *file, loff_t pos, loff_t len, - bool write) -{ - struct inode *inode = file_inode(file); - - if (unlikely(pos < 0 || len < 0)) - return -EINVAL; - - if (unlikely((loff_t) (pos + len) < 0)) - return -EINVAL; - - if (unlikely(inode->i_flctx && mandatory_lock(inode))) { - loff_t end = len ? pos + len - 1 : OFFSET_MAX; - int retval; - - retval = locks_mandatory_area(inode, file, pos, end, - write ? F_WRLCK : F_RDLCK); - if (retval < 0) - return retval; - } - - return security_file_permission(file, write ? MAY_WRITE : MAY_READ); -} -/* - * Ensure that we don't remap a partial EOF block in the middle of something - * else. Assume that the offsets have already been checked for block - * alignment. - * - * For clone we only link a partial EOF block above or at the destination file's - * EOF. For deduplication we accept a partial EOF block only if it ends at the - * destination file's EOF (can not link it into the middle of a file). - * - * Shorten the request if possible. - */ -static int generic_remap_check_len(struct inode *inode_in, - struct inode *inode_out, - loff_t pos_out, - loff_t *len, - unsigned int remap_flags) -{ - u64 blkmask = i_blocksize(inode_in) - 1; - loff_t new_len = *len; - - if ((*len & blkmask) == 0) - return 0; - - if (pos_out + *len < i_size_read(inode_out)) - new_len &= ~blkmask; - - if (new_len == *len) - return 0; - - if (remap_flags & REMAP_FILE_CAN_SHORTEN) { - *len = new_len; - return 0; - } - - return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; -} - -/* Read a page's worth of file data into the page cache. */ -static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) -{ - struct page *page; - - page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL); - if (IS_ERR(page)) - return page; - if (!PageUptodate(page)) { - put_page(page); - return ERR_PTR(-EIO); - } - return page; -} - /* - * Lock two pages, ensuring that we lock in offset order if the pages are from - * the same file. + * Don't operate on ranges the page cache doesn't support, and don't exceed the + * LFS limits. If pos is under the limit it becomes a short access. If it + * exceeds the limit we return -EFBIG. */ -static void vfs_lock_two_pages(struct page *page1, struct page *page2) -{ - /* Always lock in order of increasing index. */ - if (page1->index > page2->index) - swap(page1, page2); - - lock_page(page1); - if (page1 != page2) - lock_page(page2); -} - -/* Unlock two pages, being careful not to unlock the same page twice. */ -static void vfs_unlock_two_pages(struct page *page1, struct page *page2) +int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count) { - unlock_page(page1); - if (page1 != page2) - unlock_page(page2); -} - -/* - * Compare extents of two files to see if they are the same. - * Caller must have locked both inodes to prevent write races. - */ -static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, - struct inode *dest, loff_t destoff, - loff_t len, bool *is_same) -{ - loff_t src_poff; - loff_t dest_poff; - void *src_addr; - void *dest_addr; - struct page *src_page; - struct page *dest_page; - loff_t cmp_len; - bool same; - int error; - - error = -EINVAL; - same = true; - while (len) { - src_poff = srcoff & (PAGE_SIZE - 1); - dest_poff = destoff & (PAGE_SIZE - 1); - cmp_len = min(PAGE_SIZE - src_poff, - PAGE_SIZE - dest_poff); - cmp_len = min(cmp_len, len); - if (cmp_len <= 0) - goto out_error; - - src_page = vfs_dedupe_get_page(src, srcoff); - if (IS_ERR(src_page)) { - error = PTR_ERR(src_page); - goto out_error; - } - dest_page = vfs_dedupe_get_page(dest, destoff); - if (IS_ERR(dest_page)) { - error = PTR_ERR(dest_page); - put_page(src_page); - goto out_error; - } - - vfs_lock_two_pages(src_page, dest_page); + struct inode *inode = file->f_mapping->host; + loff_t max_size = inode->i_sb->s_maxbytes; + loff_t limit = rlimit(RLIMIT_FSIZE); - /* - * Now that we've locked both pages, make sure they're still - * mapped to the file data we're interested in. If not, - * someone is invalidating pages on us and we lose. - */ - if (!PageUptodate(src_page) || !PageUptodate(dest_page) || - src_page->mapping != src->i_mapping || - dest_page->mapping != dest->i_mapping) { - same = false; - goto unlock; + if (limit != RLIM_INFINITY) { + if (pos >= limit) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; } + *count = min(*count, limit - pos); + } - src_addr = kmap_atomic(src_page); - dest_addr = kmap_atomic(dest_page); - - flush_dcache_page(src_page); - flush_dcache_page(dest_page); - - if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) - same = false; + if (!(file->f_flags & O_LARGEFILE)) + max_size = MAX_NON_LFS; - kunmap_atomic(dest_addr); - kunmap_atomic(src_addr); -unlock: - vfs_unlock_two_pages(src_page, dest_page); - put_page(dest_page); - put_page(src_page); + if (unlikely(pos >= max_size)) + return -EFBIG; - if (!same) - break; + *count = min(*count, max_size - pos); - srcoff += cmp_len; - destoff += cmp_len; - len -= cmp_len; - } - - *is_same = same; return 0; - -out_error: - return error; } /* - * Check that the two inodes are eligible for cloning, the ranges make - * sense, and then flush all dirty data. Caller must ensure that the - * inodes have been locked against any other modifications. + * Performs necessary checks before doing a write * - * If there's an error, then the usual negative error code is returned. - * Otherwise returns 0 with *len set to the request length. + * Can adjust writing position or amount of bytes to write. + * Returns appropriate error code that caller should return or + * zero in case that write should be allowed. */ -int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *len, unsigned int remap_flags) +ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - bool same_inode = (inode_in == inode_out); + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + loff_t count; int ret; - /* Don't touch certain kinds of inodes */ - if (IS_IMMUTABLE(inode_out)) - return -EPERM; - - if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) + if (IS_SWAPFILE(inode)) return -ETXTBSY; - /* Don't reflink dirs, pipes, sockets... */ - if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) - return -EISDIR; - if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) - return -EINVAL; - - /* Zero length dedupe exits immediately; reflink goes to EOF. */ - if (*len == 0) { - loff_t isize = i_size_read(inode_in); - - if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) - return 0; - if (pos_in > isize) - return -EINVAL; - *len = isize - pos_in; - if (*len == 0) - return 0; - } - - /* Check that we don't violate system file offset limits. */ - ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* Wait for the completion of any pending IOs on both files */ - inode_dio_wait(inode_in); - if (!same_inode) - inode_dio_wait(inode_out); - - ret = filemap_write_and_wait_range(inode_in->i_mapping, - pos_in, pos_in + *len - 1); - if (ret) - return ret; - - ret = filemap_write_and_wait_range(inode_out->i_mapping, - pos_out, pos_out + *len - 1); - if (ret) - return ret; - - /* - * Check that the extents are the same. - */ - if (remap_flags & REMAP_FILE_DEDUP) { - bool is_same = false; - - ret = vfs_dedupe_file_range_compare(inode_in, pos_in, - inode_out, pos_out, *len, &is_same); - if (ret) - return ret; - if (!is_same) - return -EBADE; - } - - ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* If can't alter the file contents, we're done. */ - if (!(remap_flags & REMAP_FILE_DEDUP)) - ret = file_modified(file_out); - - return ret; -} -EXPORT_SYMBOL(generic_remap_file_range_prep); - -loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t len, unsigned int remap_flags) -{ - loff_t ret; - - WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP); - - /* - * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on - * the same mount. Practically, they only need to be on the same file - * system. - */ - if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) - return -EXDEV; - - ret = generic_file_rw_checks(file_in, file_out); - if (ret < 0) - return ret; - - if (!file_in->f_op->remap_file_range) - return -EOPNOTSUPP; - - ret = remap_verify_area(file_in, pos_in, len, false); - if (ret) - return ret; - - ret = remap_verify_area(file_out, pos_out, len, true); - if (ret) - return ret; - - ret = file_in->f_op->remap_file_range(file_in, pos_in, - file_out, pos_out, len, remap_flags); - if (ret < 0) - return ret; - - fsnotify_access(file_in); - fsnotify_modify(file_out); - return ret; -} -EXPORT_SYMBOL(do_clone_file_range); - -loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t len, unsigned int remap_flags) -{ - loff_t ret; - - file_start_write(file_out); - ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len, - remap_flags); - file_end_write(file_out); - - return ret; -} -EXPORT_SYMBOL(vfs_clone_file_range); - -/* Check whether we are allowed to dedupe the destination file */ -static bool allow_file_dedupe(struct file *file) -{ - if (capable(CAP_SYS_ADMIN)) - return true; - if (file->f_mode & FMODE_WRITE) - return true; - if (uid_eq(current_fsuid(), file_inode(file)->i_uid)) - return true; - if (!inode_permission(file_inode(file), MAY_WRITE)) - return true; - return false; -} + if (!iov_iter_count(from)) + return 0; -loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, - struct file *dst_file, loff_t dst_pos, - loff_t len, unsigned int remap_flags) -{ - loff_t ret; + /* FIXME: this is for backwards compatibility with 2.4 */ + if (iocb->ki_flags & IOCB_APPEND) + iocb->ki_pos = i_size_read(inode); - WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP | - REMAP_FILE_CAN_SHORTEN)); + if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) + return -EINVAL; - ret = mnt_want_write_file(dst_file); + count = iov_iter_count(from); + ret = generic_write_check_limits(file, iocb->ki_pos, &count); if (ret) return ret; - ret = remap_verify_area(dst_file, dst_pos, len, true); - if (ret < 0) - goto out_drop_write; - - ret = -EPERM; - if (!allow_file_dedupe(dst_file)) - goto out_drop_write; - - ret = -EXDEV; - if (src_file->f_path.mnt != dst_file->f_path.mnt) - goto out_drop_write; - - ret = -EISDIR; - if (S_ISDIR(file_inode(dst_file)->i_mode)) - goto out_drop_write; - - ret = -EINVAL; - if (!dst_file->f_op->remap_file_range) - goto out_drop_write; - - if (len == 0) { - ret = 0; - goto out_drop_write; - } - - ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file, - dst_pos, len, remap_flags | REMAP_FILE_DEDUP); -out_drop_write: - mnt_drop_write_file(dst_file); - - return ret; + iov_iter_truncate(from, count); + return iov_iter_count(from); } -EXPORT_SYMBOL(vfs_dedupe_file_range_one); +EXPORT_SYMBOL(generic_write_checks); -int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) +/* + * Performs common checks before doing a file copy/clone + * from @file_in to @file_out. + */ +int generic_file_rw_checks(struct file *file_in, struct file *file_out) { - struct file_dedupe_range_info *info; - struct inode *src = file_inode(file); - u64 off; - u64 len; - int i; - int ret; - u16 count = same->dest_count; - loff_t deduped; - - if (!(file->f_mode & FMODE_READ)) - return -EINVAL; - - if (same->reserved1 || same->reserved2) - return -EINVAL; - - off = same->src_offset; - len = same->src_length; + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); - if (S_ISDIR(src->i_mode)) + /* Don't copy dirs, pipes, sockets... */ + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; - - if (!S_ISREG(src->i_mode)) - return -EINVAL; - - if (!file->f_op->remap_file_range) - return -EOPNOTSUPP; - - ret = remap_verify_area(file, off, len, false); - if (ret < 0) - return ret; - ret = 0; - - if (off + len > i_size_read(src)) + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; - /* Arbitrary 1G limit on a single dedupe request, can be raised. */ - len = min_t(u64, len, 1 << 30); - - /* pre-format output fields to sane values */ - for (i = 0; i < count; i++) { - same->info[i].bytes_deduped = 0ULL; - same->info[i].status = FILE_DEDUPE_RANGE_SAME; - } - - for (i = 0, info = same->info; i < count; i++, info++) { - struct fd dst_fd = fdget(info->dest_fd); - struct file *dst_file = dst_fd.file; - - if (!dst_file) { - info->status = -EBADF; - goto next_loop; - } - - if (info->reserved) { - info->status = -EINVAL; - goto next_fdput; - } - - deduped = vfs_dedupe_file_range_one(file, off, dst_file, - info->dest_offset, len, - REMAP_FILE_CAN_SHORTEN); - if (deduped == -EBADE) - info->status = FILE_DEDUPE_RANGE_DIFFERS; - else if (deduped < 0) - info->status = deduped; - else - info->bytes_deduped = len; + if (!(file_in->f_mode & FMODE_READ) || + !(file_out->f_mode & FMODE_WRITE) || + (file_out->f_flags & O_APPEND)) + return -EBADF; -next_fdput: - fdput(dst_fd); -next_loop: - if (fatal_signal_pending(current)) - break; - } - return ret; + return 0; } -EXPORT_SYMBOL(vfs_dedupe_file_range); diff --git a/fs/remap_range.c b/fs/remap_range.c new file mode 100644 index 000000000000..e6099beefa97 --- /dev/null +++ b/fs/remap_range.c @@ -0,0 +1,571 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/slab.h> +#include <linux/stat.h> +#include <linux/sched/xacct.h> +#include <linux/fcntl.h> +#include <linux/file.h> +#include <linux/uio.h> +#include <linux/fsnotify.h> +#include <linux/security.h> +#include <linux/export.h> +#include <linux/syscalls.h> +#include <linux/pagemap.h> +#include <linux/splice.h> +#include <linux/compat.h> +#include <linux/mount.h> +#include <linux/fs.h> +#include "internal.h" + +#include <linux/uaccess.h> +#include <asm/unistd.h> + +/* + * Performs necessary checks before doing a clone. + * + * Can adjust amount of bytes to clone via @req_count argument. + * Returns appropriate error code that caller should return or + * zero in case the clone should be allowed. + */ +static int generic_remap_checks(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *req_count, unsigned int remap_flags) +{ + struct inode *inode_in = file_in->f_mapping->host; + struct inode *inode_out = file_out->f_mapping->host; + uint64_t count = *req_count; + uint64_t bcount; + loff_t size_in, size_out; + loff_t bs = inode_out->i_sb->s_blocksize; + int ret; + + /* The start of both ranges must be aligned to an fs block. */ + if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) + return -EINVAL; + + /* Ensure offsets don't wrap. */ + if (pos_in + count < pos_in || pos_out + count < pos_out) + return -EINVAL; + + size_in = i_size_read(inode_in); + size_out = i_size_read(inode_out); + + /* Dedupe requires both ranges to be within EOF. */ + if ((remap_flags & REMAP_FILE_DEDUP) && + (pos_in >= size_in || pos_in + count > size_in || + pos_out >= size_out || pos_out + count > size_out)) + return -EINVAL; + + /* Ensure the infile range is within the infile. */ + if (pos_in >= size_in) + return -EINVAL; + count = min(count, size_in - (uint64_t)pos_in); + + ret = generic_write_check_limits(file_out, pos_out, &count); + if (ret) + return ret; + + /* + * If the user wanted us to link to the infile's EOF, round up to the + * next block boundary for this check. + * + * Otherwise, make sure the count is also block-aligned, having + * already confirmed the starting offsets' block alignment. + */ + if (pos_in + count == size_in) { + bcount = ALIGN(size_in, bs) - pos_in; + } else { + if (!IS_ALIGNED(count, bs)) + count = ALIGN_DOWN(count, bs); + bcount = count; + } + + /* Don't allow overlapped cloning within the same file. */ + if (inode_in == inode_out && + pos_out + bcount > pos_in && + pos_out < pos_in + bcount) + return -EINVAL; + + /* + * We shortened the request but the caller can't deal with that, so + * bounce the request back to userspace. + */ + if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) + return -EINVAL; + + *req_count = count; + return 0; +} + +static int remap_verify_area(struct file *file, loff_t pos, loff_t len, + bool write) +{ + struct inode *inode = file_inode(file); + + if (unlikely(pos < 0 || len < 0)) + return -EINVAL; + + if (unlikely((loff_t) (pos + len) < 0)) + return -EINVAL; + + if (unlikely(inode->i_flctx && mandatory_lock(inode))) { + loff_t end = len ? pos + len - 1 : OFFSET_MAX; + int retval; + + retval = locks_mandatory_area(inode, file, pos, end, + write ? F_WRLCK : F_RDLCK); + if (retval < 0) + return retval; + } + + return security_file_permission(file, write ? MAY_WRITE : MAY_READ); +} + +/* + * Ensure that we don't remap a partial EOF block in the middle of something + * else. Assume that the offsets have already been checked for block + * alignment. + * + * For clone we only link a partial EOF block above or at the destination file's + * EOF. For deduplication we accept a partial EOF block only if it ends at the + * destination file's EOF (can not link it into the middle of a file). + * + * Shorten the request if possible. + */ +static int generic_remap_check_len(struct inode *inode_in, + struct inode *inode_out, + loff_t pos_out, + loff_t *len, + unsigned int remap_flags) +{ + u64 blkmask = i_blocksize(inode_in) - 1; + loff_t new_len = *len; + + if ((*len & blkmask) == 0) + return 0; + + if (pos_out + *len < i_size_read(inode_out)) + new_len &= ~blkmask; + + if (new_len == *len) + return 0; + + if (remap_flags & REMAP_FILE_CAN_SHORTEN) { + *len = new_len; + return 0; + } + + return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; +} + +/* Read a page's worth of file data into the page cache. */ +static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) +{ + struct page *page; + + page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL); + if (IS_ERR(page)) + return page; + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + return page; +} + +/* + * Lock two pages, ensuring that we lock in offset order if the pages are from + * the same file. + */ +static void vfs_lock_two_pages(struct page *page1, struct page *page2) +{ + /* Always lock in order of increasing index. */ + if (page1->index > page2->index) + swap(page1, page2); + + lock_page(page1); + if (page1 != page2) + lock_page(page2); +} + +/* Unlock two pages, being careful not to unlock the same page twice. */ +static void vfs_unlock_two_pages(struct page *page1, struct page *page2) +{ + unlock_page(page1); + if (page1 != page2) + unlock_page(page2); +} + +/* + * Compare extents of two files to see if they are the same. + * Caller must have locked both inodes to prevent write races. + */ +static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, + struct inode *dest, loff_t destoff, + loff_t len, bool *is_same) +{ + loff_t src_poff; + loff_t dest_poff; + void *src_addr; + void *dest_addr; + struct page *src_page; + struct page *dest_page; + loff_t cmp_len; + bool same; + int error; + + error = -EINVAL; + same = true; + while (len) { + src_poff = srcoff & (PAGE_SIZE - 1); + dest_poff = destoff & (PAGE_SIZE - 1); + cmp_len = min(PAGE_SIZE - src_poff, + PAGE_SIZE - dest_poff); + cmp_len = min(cmp_len, len); + if (cmp_len <= 0) + goto out_error; + + src_page = vfs_dedupe_get_page(src, srcoff); + if (IS_ERR(src_page)) { + error = PTR_ERR(src_page); + goto out_error; + } + dest_page = vfs_dedupe_get_page(dest, destoff); + if (IS_ERR(dest_page)) { + error = PTR_ERR(dest_page); + put_page(src_page); + goto out_error; + } + + vfs_lock_two_pages(src_page, dest_page); + + /* + * Now that we've locked both pages, make sure they're still + * mapped to the file data we're interested in. If not, + * someone is invalidating pages on us and we lose. + */ + if (!PageUptodate(src_page) || !PageUptodate(dest_page) || + src_page->mapping != src->i_mapping || + dest_page->mapping != dest->i_mapping) { + same = false; + goto unlock; + } + + src_addr = kmap_atomic(src_page); + dest_addr = kmap_atomic(dest_page); + + flush_dcache_page(src_page); + flush_dcache_page(dest_page); + + if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) + same = false; + + kunmap_atomic(dest_addr); + kunmap_atomic(src_addr); +unlock: + vfs_unlock_two_pages(src_page, dest_page); + put_page(dest_page); + put_page(src_page); + + if (!same) + break; + + srcoff += cmp_len; + destoff += cmp_len; + len -= cmp_len; + } + + *is_same = same; + return 0; + +out_error: + return error; +} + +/* + * Check that the two inodes are eligible for cloning, the ranges make + * sense, and then flush all dirty data. Caller must ensure that the + * inodes have been locked against any other modifications. + * + * If there's an error, then the usual negative error code is returned. + * Otherwise returns 0 with *len set to the request length. + */ +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + bool same_inode = (inode_in == inode_out); + int ret; + + /* Don't touch certain kinds of inodes */ + if (IS_IMMUTABLE(inode_out)) + return -EPERM; + + if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) + return -ETXTBSY; + + /* Don't reflink dirs, pipes, sockets... */ + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) + return -EISDIR; + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + return -EINVAL; + + /* Zero length dedupe exits immediately; reflink goes to EOF. */ + if (*len == 0) { + loff_t isize = i_size_read(inode_in); + + if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) + return 0; + if (pos_in > isize) + return -EINVAL; + *len = isize - pos_in; + if (*len == 0) + return 0; + } + + /* Check that we don't violate system file offset limits. */ + ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, + remap_flags); + if (ret) + return ret; + + /* Wait for the completion of any pending IOs on both files */ + inode_dio_wait(inode_in); + if (!same_inode) + inode_dio_wait(inode_out); + + ret = filemap_write_and_wait_range(inode_in->i_mapping, + pos_in, pos_in + *len - 1); + if (ret) + return ret; + + ret = filemap_write_and_wait_range(inode_out->i_mapping, + pos_out, pos_out + *len - 1); + if (ret) + return ret; + + /* + * Check that the extents are the same. + */ + if (remap_flags & REMAP_FILE_DEDUP) { + bool is_same = false; + + ret = vfs_dedupe_file_range_compare(inode_in, pos_in, + inode_out, pos_out, *len, &is_same); + if (ret) + return ret; + if (!is_same) + return -EBADE; + } + + ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, + remap_flags); + if (ret) + return ret; + + /* If can't alter the file contents, we're done. */ + if (!(remap_flags & REMAP_FILE_DEDUP)) + ret = file_modified(file_out); + + return ret; +} +EXPORT_SYMBOL(generic_remap_file_range_prep); + +loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) +{ + loff_t ret; + + WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP); + + /* + * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on + * the same mount. Practically, they only need to be on the same file + * system. + */ + if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) + return -EXDEV; + + ret = generic_file_rw_checks(file_in, file_out); + if (ret < 0) + return ret; + + if (!file_in->f_op->remap_file_range) + return -EOPNOTSUPP; + + ret = remap_verify_area(file_in, pos_in, len, false); + if (ret) + return ret; + + ret = remap_verify_area(file_out, pos_out, len, true); + if (ret) + return ret; + + ret = file_in->f_op->remap_file_range(file_in, pos_in, + file_out, pos_out, len, remap_flags); + if (ret < 0) + return ret; + + fsnotify_access(file_in); + fsnotify_modify(file_out); + return ret; +} +EXPORT_SYMBOL(do_clone_file_range); + +loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) +{ + loff_t ret; + + file_start_write(file_out); + ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len, + remap_flags); + file_end_write(file_out); + + return ret; +} +EXPORT_SYMBOL(vfs_clone_file_range); + +/* Check whether we are allowed to dedupe the destination file */ +static bool allow_file_dedupe(struct file *file) +{ + if (capable(CAP_SYS_ADMIN)) + return true; + if (file->f_mode & FMODE_WRITE) + return true; + if (uid_eq(current_fsuid(), file_inode(file)->i_uid)) + return true; + if (!inode_permission(file_inode(file), MAY_WRITE)) + return true; + return false; +} + +loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, + struct file *dst_file, loff_t dst_pos, + loff_t len, unsigned int remap_flags) +{ + loff_t ret; + + WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP | + REMAP_FILE_CAN_SHORTEN)); + + ret = mnt_want_write_file(dst_file); + if (ret) + return ret; + + ret = remap_verify_area(dst_file, dst_pos, len, true); + if (ret < 0) + goto out_drop_write; + + ret = -EPERM; + if (!allow_file_dedupe(dst_file)) + goto out_drop_write; + + ret = -EXDEV; + if (src_file->f_path.mnt != dst_file->f_path.mnt) + goto out_drop_write; + + ret = -EISDIR; + if (S_ISDIR(file_inode(dst_file)->i_mode)) + goto out_drop_write; + + ret = -EINVAL; + if (!dst_file->f_op->remap_file_range) + goto out_drop_write; + + if (len == 0) { + ret = 0; + goto out_drop_write; + } + + ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file, + dst_pos, len, remap_flags | REMAP_FILE_DEDUP); +out_drop_write: + mnt_drop_write_file(dst_file); + + return ret; +} +EXPORT_SYMBOL(vfs_dedupe_file_range_one); + +int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) +{ + struct file_dedupe_range_info *info; + struct inode *src = file_inode(file); + u64 off; + u64 len; + int i; + int ret; + u16 count = same->dest_count; + loff_t deduped; + + if (!(file->f_mode & FMODE_READ)) + return -EINVAL; + + if (same->reserved1 || same->reserved2) + return -EINVAL; + + off = same->src_offset; + len = same->src_length; + + if (S_ISDIR(src->i_mode)) + return -EISDIR; + + if (!S_ISREG(src->i_mode)) + return -EINVAL; + + if (!file->f_op->remap_file_range) + return -EOPNOTSUPP; + + ret = remap_verify_area(file, off, len, false); + if (ret < 0) + return ret; + ret = 0; + + if (off + len > i_size_read(src)) + return -EINVAL; + + /* Arbitrary 1G limit on a single dedupe request, can be raised. */ + len = min_t(u64, len, 1 << 30); + + /* pre-format output fields to sane values */ + for (i = 0; i < count; i++) { + same->info[i].bytes_deduped = 0ULL; + same->info[i].status = FILE_DEDUPE_RANGE_SAME; + } + + for (i = 0, info = same->info; i < count; i++, info++) { + struct fd dst_fd = fdget(info->dest_fd); + struct file *dst_file = dst_fd.file; + + if (!dst_file) { + info->status = -EBADF; + goto next_loop; + } + + if (info->reserved) { + info->status = -EINVAL; + goto next_fdput; + } + + deduped = vfs_dedupe_file_range_one(file, off, dst_file, + info->dest_offset, len, + REMAP_FILE_CAN_SHORTEN); + if (deduped == -EBADE) + info->status = FILE_DEDUPE_RANGE_DIFFERS; + else if (deduped < 0) + info->status = deduped; + else + info->bytes_deduped = len; + +next_fdput: + fdput(dst_fd); +next_loop: + if (fatal_signal_pending(current)) + break; + } + return ret; +} +EXPORT_SYMBOL(vfs_dedupe_file_range); diff --git a/fs/romfs/super.c b/fs/romfs/super.c index b1b7d3f5752f..259f684d9236 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -416,8 +416,7 @@ static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bfree = buf->f_bavail = buf->f_ffree; buf->f_blocks = (romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); return 0; } diff --git a/fs/splice.c b/fs/splice.c index 599b740f1098..866d5c2367b2 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1005,9 +1005,8 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, /* * Determine where to splice to/from. */ -long do_splice(struct file *in, loff_t __user *off_in, - struct file *out, loff_t __user *off_out, - size_t len, unsigned int flags) +long do_splice(struct file *in, loff_t *off_in, struct file *out, + loff_t *off_out, size_t len, unsigned int flags) { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; @@ -1041,8 +1040,7 @@ long do_splice(struct file *in, loff_t __user *off_in, if (off_out) { if (!(out->f_mode & FMODE_PWRITE)) return -EINVAL; - if (copy_from_user(&offset, off_out, sizeof(loff_t))) - return -EFAULT; + offset = *off_out; } else { offset = out->f_pos; } @@ -1063,8 +1061,8 @@ long do_splice(struct file *in, loff_t __user *off_in, if (!off_out) out->f_pos = offset; - else if (copy_to_user(off_out, &offset, sizeof(loff_t))) - ret = -EFAULT; + else + *off_out = offset; return ret; } @@ -1075,8 +1073,7 @@ long do_splice(struct file *in, loff_t __user *off_in, if (off_in) { if (!(in->f_mode & FMODE_PREAD)) return -EINVAL; - if (copy_from_user(&offset, off_in, sizeof(loff_t))) - return -EFAULT; + offset = *off_in; } else { offset = in->f_pos; } @@ -1100,8 +1097,8 @@ long do_splice(struct file *in, loff_t __user *off_in, wakeup_pipe_readers(opipe); if (!off_in) in->f_pos = offset; - else if (copy_to_user(off_in, &offset, sizeof(loff_t))) - ret = -EFAULT; + else + *off_in = offset; return ret; } @@ -1109,6 +1106,46 @@ long do_splice(struct file *in, loff_t __user *off_in, return -EINVAL; } +static long __do_splice(struct file *in, loff_t __user *off_in, + struct file *out, loff_t __user *off_out, + size_t len, unsigned int flags) +{ + struct pipe_inode_info *ipipe; + struct pipe_inode_info *opipe; + loff_t offset, *__off_in = NULL, *__off_out = NULL; + long ret; + + ipipe = get_pipe_info(in, true); + opipe = get_pipe_info(out, true); + + if (ipipe && off_in) + return -ESPIPE; + if (opipe && off_out) + return -ESPIPE; + + if (off_out) { + if (copy_from_user(&offset, off_out, sizeof(loff_t))) + return -EFAULT; + __off_out = &offset; + } + if (off_in) { + if (copy_from_user(&offset, off_in, sizeof(loff_t))) + return -EFAULT; + __off_in = &offset; + } + + ret = do_splice(in, __off_in, out, __off_out, len, flags); + if (ret < 0) + return ret; + + if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t))) + return -EFAULT; + if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t))) + return -EFAULT; + + return ret; +} + static int iter_to_pipe(struct iov_iter *from, struct pipe_inode_info *pipe, unsigned flags) @@ -1303,8 +1340,8 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, if (in.file) { out = fdget(fd_out); if (out.file) { - error = do_splice(in.file, off_in, out.file, off_out, - len, flags); + error = __do_splice(in.file, off_in, out.file, off_out, + len, flags); fdput(out); } fdput(in); diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 0cc4ceec0562..d6c6593ec169 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -380,8 +380,7 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = msblk->inodes; buf->f_ffree = 0; buf->f_namelen = SQUASHFS_NAME_LEN; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); return 0; } diff --git a/fs/stat.c b/fs/stat.c index 44f8ad346db4..dacecdda2e79 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -56,7 +56,7 @@ EXPORT_SYMBOL(generic_fillattr); * @path: file to get attributes from * @stat: structure to return attributes in * @request_mask: STATX_xxx flags indicating what the caller wants - * @query_flags: Query mode (KSTAT_QUERY_FLAGS) + * @query_flags: Query mode (AT_STATX_SYNC_TYPE) * * Get attributes without calling security_inode_getattr. * @@ -71,7 +71,7 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, memset(stat, 0, sizeof(*stat)); stat->result_mask |= STATX_BASIC_STATS; - query_flags &= KSTAT_QUERY_FLAGS; + query_flags &= AT_STATX_SYNC_TYPE; /* allow the fs to override these if it really wants to */ /* SB_NOATIME means filesystem supplies dummy atime value */ @@ -97,7 +97,7 @@ EXPORT_SYMBOL(vfs_getattr_nosec); * @path: The file of interest * @stat: Where to return the statistics * @request_mask: STATX_xxx flags indicating what the caller wants - * @query_flags: Query mode (KSTAT_QUERY_FLAGS) + * @query_flags: Query mode (AT_STATX_SYNC_TYPE) * * Ask the filesystem for a file's attributes. The caller must indicate in * request_mask and query_flags to indicate what they want. @@ -126,53 +126,27 @@ int vfs_getattr(const struct path *path, struct kstat *stat, EXPORT_SYMBOL(vfs_getattr); /** - * vfs_statx_fd - Get the enhanced basic attributes by file descriptor + * vfs_fstat - Get the basic attributes by file descriptor * @fd: The file descriptor referring to the file of interest * @stat: The result structure to fill in. - * @request_mask: STATX_xxx flags indicating what the caller wants - * @query_flags: Query mode (KSTAT_QUERY_FLAGS) * * This function is a wrapper around vfs_getattr(). The main difference is * that it uses a file descriptor to determine the file location. * * 0 will be returned on success, and a -ve error code if unsuccessful. */ -int vfs_statx_fd(unsigned int fd, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +int vfs_fstat(int fd, struct kstat *stat) { struct fd f; - int error = -EBADF; - - if (query_flags & ~KSTAT_QUERY_FLAGS) - return -EINVAL; + int error; f = fdget_raw(fd); - if (f.file) { - error = vfs_getattr(&f.file->f_path, stat, - request_mask, query_flags); - fdput(f); - } + if (!f.file) + return -EBADF; + error = vfs_getattr(&f.file->f_path, stat, STATX_BASIC_STATS, 0); + fdput(f); return error; } -EXPORT_SYMBOL(vfs_statx_fd); - -static inline unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, - int flags) -{ - if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | - AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0) - return -EINVAL; - - *lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT; - if (flags & AT_SYMLINK_NOFOLLOW) - *lookup_flags &= ~LOOKUP_FOLLOW; - if (flags & AT_NO_AUTOMOUNT) - *lookup_flags &= ~LOOKUP_AUTOMOUNT; - if (flags & AT_EMPTY_PATH) - *lookup_flags |= LOOKUP_EMPTY; - - return 0; -} /** * vfs_statx - Get basic and extra attributes by filename @@ -189,15 +163,24 @@ static inline unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, * * 0 will be returned on success, and a -ve error code if unsuccessful. */ -int vfs_statx(int dfd, const char __user *filename, int flags, +static int vfs_statx(int dfd, const char __user *filename, int flags, struct kstat *stat, u32 request_mask) { struct path path; - int error = -EINVAL; - unsigned lookup_flags; + unsigned lookup_flags = 0; + int error; - if (vfs_stat_set_lookup_flags(&lookup_flags, flags)) + if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH | + AT_STATX_SYNC_TYPE)) return -EINVAL; + + if (!(flags & AT_SYMLINK_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + if (!(flags & AT_NO_AUTOMOUNT)) + lookup_flags |= LOOKUP_AUTOMOUNT; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (error) @@ -217,8 +200,13 @@ retry: out: return error; } -EXPORT_SYMBOL(vfs_statx); +int vfs_fstatat(int dfd, const char __user *filename, + struct kstat *stat, int flags) +{ + return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT, + stat, STATX_BASIC_STATS); +} #ifdef __ARCH_WANT_OLD_STAT diff --git a/fs/statfs.c b/fs/statfs.c index 2616424012ea..59f33752c131 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -29,6 +29,8 @@ static int flags_by_mnt(int mnt_flags) flags |= ST_NODIRATIME; if (mnt_flags & MNT_RELATIME) flags |= ST_RELATIME; + if (mnt_flags & MNT_NOSYMFOLLOW) + flags |= ST_NOSYMFOLLOW; return flags; } diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 02b1d9d0c182..be47263b8605 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -98,8 +98,7 @@ static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = sbi->s_ninodes; buf->f_ffree = sysv_count_free_inodes(sb); buf->f_namelen = SYSV_NAMELEN; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); return 0; } diff --git a/fs/udf/super.c b/fs/udf/super.c index faf2017ada11..5bef3a68395d 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -2414,8 +2414,7 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf) + buf->f_bfree; buf->f_ffree = buf->f_bfree; buf->f_namelen = UDF_NAME_LEN; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); return 0; } diff --git a/fs/ufs/super.c b/fs/ufs/super.c index e3b69fb280e8..983558b572c7 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1431,8 +1431,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) ? (buf->f_bfree - uspi->s_root_blocks) : 0; buf->f_files = uspi->s_ncg * uspi->s_ipg; buf->f_namelen = UFS_MAXNAMLEN; - buf->f_fsid.val[0] = (u32)id; - buf->f_fsid.val[1] = (u32)(id >> 32); + buf->f_fsid = u64_to_fsid(id); mutex_unlock(&UFS_SB(sb)->s_lock); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index f2a8a0e75e1f..7371a7f7c652 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -947,11 +947,11 @@ xfs_free_file_space( endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); /* We can only free complete realtime extents. */ - if (XFS_IS_REALTIME_INODE(ip)) { - xfs_extlen_t extsz = xfs_get_extsz_hint(ip); - - if ((startoffset_fsb | endoffset_fsb) & (extsz - 1)) - return -EINVAL; + if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) { + startoffset_fsb = roundup_64(startoffset_fsb, + mp->m_sb.sb_rextsize); + endoffset_fsb = rounddown_64(endoffset_fsb, + mp->m_sb.sb_rextsize); } /* @@ -1147,14 +1147,6 @@ xfs_insert_file_space( trace_xfs_insert_file_space(ip); - /* We can only insert complete realtime extents. */ - if (XFS_IS_REALTIME_INODE(ip)) { - xfs_extlen_t extsz = xfs_get_extsz_hint(ip); - - if ((stop_fsb | shift_fsb) & (extsz - 1)) - return -EINVAL; - } - error = xfs_bmap_can_insert_extents(ip, stop_fsb, shift_fsb); if (error) return error; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 3d1b95124744..5b0f93f73837 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -32,6 +32,39 @@ static const struct vm_operations_struct xfs_file_vm_ops; +/* + * Decide if the given file range is aligned to the size of the fundamental + * allocation unit for the file. + */ +static bool +xfs_is_falloc_aligned( + struct xfs_inode *ip, + loff_t pos, + long long int len) +{ + struct xfs_mount *mp = ip->i_mount; + uint64_t mask; + + if (XFS_IS_REALTIME_INODE(ip)) { + if (!is_power_of_2(mp->m_sb.sb_rextsize)) { + u64 rextbytes; + u32 mod; + + rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); + div_u64_rem(pos, rextbytes, &mod); + if (mod) + return false; + div_u64_rem(len, rextbytes, &mod); + return mod == 0; + } + mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1; + } else { + mask = mp->m_sb.sb_blocksize - 1; + } + + return !((pos | len) & mask); +} + int xfs_update_prealloc_flags( struct xfs_inode *ip, @@ -850,9 +883,7 @@ xfs_file_fallocate( if (error) goto out_unlock; } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { - unsigned int blksize_mask = i_blocksize(inode) - 1; - - if (offset & blksize_mask || len & blksize_mask) { + if (!xfs_is_falloc_aligned(ip, offset, len)) { error = -EINVAL; goto out_unlock; } @@ -872,10 +903,9 @@ xfs_file_fallocate( if (error) goto out_unlock; } else if (mode & FALLOC_FL_INSERT_RANGE) { - unsigned int blksize_mask = i_blocksize(inode) - 1; loff_t isize = i_size_read(inode); - if (offset & blksize_mask || len & blksize_mask) { + if (!xfs_is_falloc_aligned(ip, offset, len)) { error = -EINVAL; goto out_unlock; } diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index ad1009778d33..5b7a1e201559 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -175,6 +175,12 @@ static inline xfs_dev_t linux_to_xfs_dev_t(dev_t dev) #define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL) #define xfs_stack_trace() dump_stack() +static inline uint64_t rounddown_64(uint64_t x, uint32_t y) +{ + do_div(x, y); + return x * y; +} + static inline uint64_t roundup_64(uint64_t x, uint32_t y) { x += y - 1; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a8289adc1b29..87886b7f77da 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3446,6 +3446,14 @@ xlog_recover_finish( int error; error = xlog_recover_process_intents(log); if (error) { + /* + * Cancel all the unprocessed intent items now so that + * we don't leave them pinned in the AIL. This can + * cause the AIL to livelock on the pinned item if + * anyone tries to push the AIL (inode reclaim does + * this) before we get around to xfs_log_mount_cancel. + */ + xlog_recover_cancel_intents(log); xfs_alert(log->l_mp, "Failed to recover intents"); return error; } diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 4d9bd6bb63ca..3c392b1512ac 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -42,7 +42,7 @@ do { \ #define xfs_printk_once(func, dev, fmt, ...) \ ({ \ - static bool __section(.data.once) __print_once; \ + static bool __section(".data.once") __print_once; \ bool __ret_print_once = !__print_once; \ \ if (!__print_once) { \ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index d1b5f2d2a245..e3e229e52512 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -794,8 +794,7 @@ xfs_fs_statfs( statp->f_namelen = MAXNAMELEN - 1; id = huge_encode_dev(mp->m_ddev_targp->bt_dev); - statp->f_fsid.val[0] = (u32)id; - statp->f_fsid.val[1] = (u32)(id >> 32); + statp->f_fsid = u64_to_fsid(id); icount = percpu_counter_sum(&mp->m_icount); ifree = percpu_counter_sum(&mp->m_ifree); diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 64cc2a9c38c8..ff5930be096c 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1116,8 +1116,7 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf) fsid = le64_to_cpup((void *)sbi->s_uuid.b) ^ le64_to_cpup((void *)sbi->s_uuid.b + sizeof(u64)); - buf->f_fsid.val[0] = (u32)fsid; - buf->f_fsid.val[1] = (u32)(fsid >> 32); + buf->f_fsid = u64_to_fsid(fsid); return 0; } |