21 files changed, 342 insertions, 333 deletions
diff --git a/drivers/staging/lustre/lustre/llite/dcache.c b/drivers/staging/lustre/lustre/llite/dcache.c
index 65bf0c401b44..966f580e26fb 100644
--- a/drivers/staging/lustre/lustre/llite/dcache.c
+++ b/drivers/staging/lustre/lustre/llite/dcache.c
@@ -247,17 +247,14 @@ static int ll_revalidate_dentry(struct dentry *dentry,
 		return 1;
 
 	/*
-	 * if open&create is set, talk to MDS to make sure file is created if
-	 * necessary, because we can't do this in ->open() later since that's
-	 * called on an inode. return 0 here to let lookup to handle this.
+	 * VFS warns us that this is the second go around and previous
+	 * operation failed (most likely open|creat), so this time
+	 * we better talk to the server via the lookup path by name,
+	 * not by fid.
 	 */
-	if ((lookup_flags & (LOOKUP_OPEN | LOOKUP_CREATE)) ==
-	    (LOOKUP_OPEN | LOOKUP_CREATE))
+	if (lookup_flags & LOOKUP_REVAL)
 		return 0;
 
-	if (lookup_flags & (LOOKUP_PARENT | LOOKUP_OPEN | LOOKUP_CREATE))
-		return 1;
-
 	if (!dentry_may_statahead(dir, dentry))
 		return 1;
 
diff --git a/drivers/staging/lustre/lustre/llite/dir.c b/drivers/staging/lustre/lustre/llite/dir.c
index ea5d247a3f70..13b35922a4ca 100644
--- a/drivers/staging/lustre/lustre/llite/dir.c
+++ b/drivers/staging/lustre/lustre/llite/dir.c
@@ -432,7 +432,7 @@ static int ll_dir_setdirstripe(struct inode *parent, struct lmv_user_md *lump,
 
 	if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent)))
 		mode &= ~current_umask();
-	mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR;
+	mode = (mode & (0777 | S_ISVTX)) | S_IFDIR;
 	op_data = ll_prep_md_op_data(NULL, parent, NULL, dirname,
 				     strlen(dirname), mode, LUSTRE_OPC_MKDIR,
 				     lump);
@@ -521,12 +521,15 @@ int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
 	rc = md_setattr(sbi->ll_md_exp, op_data, lump, lum_size, &req);
 	ll_finish_md_op_data(op_data);
 	ptlrpc_req_finished(req);
-	if (rc) {
-		if (rc != -EPERM && rc != -EACCES)
-			CERROR("mdc_setattr fails: rc = %d\n", rc);
-	}
+	if (rc)
+		return rc;
 
-	/* In the following we use the fact that LOV_USER_MAGIC_V1 and
+#if OBD_OCD_VERSION(2, 13, 53, 0) > LUSTRE_VERSION_CODE
+	/*
+	 * 2.9 server has stored filesystem default stripe in ROOT xattr,
+	 * and it's stored into system config for backward compatibility.
+	 *
+	 * In the following we use the fact that LOV_USER_MAGIC_V1 and
 	 * LOV_USER_MAGIC_V3 have the same initial fields so we do not
 	 * need to make the distinction between the 2 versions
 	 */
@@ -567,6 +570,7 @@ int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
 end:
 		kfree(param);
 	}
+#endif
 	return rc;
 }
 
diff --git a/drivers/staging/lustre/lustre/llite/file.c b/drivers/staging/lustre/lustre/llite/file.c
index f634c11216e6..10adfcdd7035 100644
--- a/drivers/staging/lustre/lustre/llite/file.c
+++ b/drivers/staging/lustre/lustre/llite/file.c
@@ -122,26 +122,25 @@ static int ll_close_inode_openhandle(struct obd_export *md_exp,
 				     enum mds_op_bias bias,
 				     void *data)
 {
-	struct obd_export *exp = ll_i2mdexp(inode);
+	const struct ll_inode_info *lli = ll_i2info(inode);
 	struct md_op_data *op_data;
 	struct ptlrpc_request *req = NULL;
-	struct obd_device *obd = class_exp2obd(exp);
 	int rc;
 
-	if (!obd) {
-		/*
-		 * XXX: in case of LMV, is this correct to access
-		 * ->exp_handle?
-		 */
-		CERROR("Invalid MDC connection handle %#llx\n",
-		       ll_i2mdexp(inode)->exp_handle.h_cookie);
+	if (!class_exp2obd(md_exp)) {
+		CERROR("%s: invalid MDC connection handle closing " DFID "\n",
+		       ll_get_fsname(inode->i_sb, NULL, 0),
+		       PFID(&lli->lli_fid));
 		rc = 0;
 		goto out;
 	}
 
 	op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
+	/*
+	 * We leak openhandle and request here on error, but not much to be
+	 * done in OOM case since app won't retry close on error either.
+	 */
 	if (!op_data) {
-		/* XXX We leak openhandle and request here. */
 		rc = -ENOMEM;
 		goto out;
 	}
@@ -170,10 +169,9 @@ static int ll_close_inode_openhandle(struct obd_export *md_exp,
 	}
 
 	rc = md_close(md_exp, op_data, och->och_mod, &req);
-	if (rc) {
-		CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
-		       ll_i2mdexp(inode)->exp_obd->obd_name,
-		       PFID(ll_inode2fid(inode)), rc);
+	if (rc && rc != -EINTR) {
+		CERROR("%s: inode " DFID " mdc close failed: rc = %d\n",
+		       md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
 	}
 
 	if (op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP) &&
@@ -192,8 +190,7 @@ out:
 	och->och_fh.cookie = DEAD_HANDLE_MAGIC;
 	kfree(och);
 
-	if (req) /* This is close request */
-		ptlrpc_req_finished(req);
+	ptlrpc_req_finished(req);
 	return rc;
 }
 
@@ -420,6 +417,17 @@ out:
 	ptlrpc_req_finished(req);
 	ll_intent_drop_lock(itp);
 
+	/*
+	 * We did open by fid, but by the time we got to the server,
+	 * the object disappeared. If this is a create, we cannot really
+	 * tell the userspace that the file it was trying to create
+	 * does not exist. Instead let's return -ESTALE, and the VFS will
+	 * retry the create with LOOKUP_REVAL that we are going to catch
+	 * in ll_revalidate_dentry() and use lookup then.
+	 */
+	if (rc == -ENOENT && itp->it_op & IT_CREAT)
+		rc = -ESTALE;
+
 	return rc;
 }
 
@@ -1016,7 +1024,7 @@ static bool file_is_noatime(const struct file *file)
 	return false;
 }
 
-void ll_io_init(struct cl_io *io, const struct file *file, int write)
+static void ll_io_init(struct cl_io *io, const struct file *file, int write)
 {
 	struct inode *inode = file_inode(file);
 
@@ -1821,7 +1829,7 @@ free:
 	return rc;
 }
 
-static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
+int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
 {
 	struct md_op_data	*op_data;
 	int			 rc;
@@ -1883,7 +1891,7 @@ static int ll_hsm_import(struct inode *inode, struct file *file,
 		goto free_hss;
 	}
 
-	attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
+	attr->ia_mode = hui->hui_mode & 0777;
 	attr->ia_mode |= S_IFREG;
 	attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
 	attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
@@ -2618,18 +2626,18 @@ int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
 		       ll_get_fsname(parent->i_sb, NULL, 0), name,
 		       PFID(&op_data->op_fid3));
 		rc = -EINVAL;
-		goto out_free;
+		goto out_unlock;
 	}
 
 	rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
 	if (rc < 0)
-		goto out_free;
+		goto out_unlock;
 
 	if (rc == mdtidx) {
 		CDEBUG(D_INFO, "%s:"DFID" is already on MDT%d.\n", name,
 		       PFID(&op_data->op_fid3), mdtidx);
 		rc = 0;
-		goto out_free;
+		goto out_unlock;
 	}
 again:
 	if (S_ISREG(child_inode->i_mode)) {
@@ -2637,13 +2645,13 @@ again:
 		if (IS_ERR(och)) {
 			rc = PTR_ERR(och);
 			och = NULL;
-			goto out_free;
+			goto out_unlock;
 		}
 
 		rc = ll_data_version(child_inode, &data_version,
 				     LL_DV_WR_FLUSH);
 		if (rc)
-			goto out_free;
+			goto out_close;
 
 		op_data->op_handle = och->och_fh;
 		op_data->op_data = och->och_mod;
@@ -2656,40 +2664,45 @@ again:
 	op_data->op_cli_flags = CLI_MIGRATE;
 	rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
 		       namelen, name, namelen, &request);
-	if (!rc)
+	if (!rc) {
+		LASSERT(request);
 		ll_update_times(request, parent);
 
-	body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
-	if (!body) {
-		rc = -EPROTO;
-		goto out_free;
+		body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
+		LASSERT(body);
+
+		/*
+		 * If the server does release layout lock, then we cleanup
+		 * the client och here, otherwise release it in out_close:
+		 */
+		if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
+			obd_mod_put(och->och_mod);
+			md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
+						  och);
+			och->och_fh.cookie = DEAD_HANDLE_MAGIC;
+			kfree(och);
+			och = NULL;
+		}
 	}
 
-	/*
-	 * If the server does release layout lock, then we cleanup
-	 * the client och here, otherwise release it in out_free:
-	 */
-	if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
-		obd_mod_put(och->och_mod);
-		md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp, och);
-		och->och_fh.cookie = DEAD_HANDLE_MAGIC;
-		kfree(och);
-		och = NULL;
+	if (request) {
+		ptlrpc_req_finished(request);
+		request = NULL;
 	}
 
-	ptlrpc_req_finished(request);
 	/* Try again if the file layout has changed. */
 	if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
 		goto again;
-out_free:
-	if (child_inode) {
-		if (och) /* close the file */
-			ll_lease_close(och, child_inode, NULL);
-		clear_nlink(child_inode);
-		inode_unlock(child_inode);
-		iput(child_inode);
-	}
 
+out_close:
+	if (och) /* close the file */
+		ll_lease_close(och, child_inode, NULL);
+	if (!rc)
+		clear_nlink(child_inode);
+out_unlock:
+	inode_unlock(child_inode);
+	iput(child_inode);
+out_free:
 	ll_finish_md_op_data(op_data);
 	return rc;
 }
diff --git a/drivers/staging/lustre/lustre/llite/lcommon_cl.c b/drivers/staging/lustre/lustre/llite/lcommon_cl.c
index dd1cfd8f5213..f1036f477a51 100644
--- a/drivers/staging/lustre/lustre/llite/lcommon_cl.c
+++ b/drivers/staging/lustre/lustre/llite/lcommon_cl.c
@@ -94,6 +94,7 @@ int cl_setattr_ost(struct cl_object *obj, const struct iattr *attr,
 
 	io = vvp_env_thread_io(env);
 	io->ci_obj = obj;
+	io->ci_verify_layout = 1;
 
 	io->u.ci_setattr.sa_attr.lvb_atime = LTIME_S(attr->ia_atime);
 	io->u.ci_setattr.sa_attr.lvb_mtime = LTIME_S(attr->ia_mtime);
@@ -120,13 +121,7 @@ again:
 	cl_io_fini(env, io);
 	if (unlikely(io->ci_need_restart))
 		goto again;
-	/* HSM import case: file is released, cannot be restored
-	 * no need to fail except if restore registration failed
-	 * with -ENODATA
-	 */
-	if (result == -ENODATA && io->ci_restore_needed &&
-	    io->ci_result != -ENODATA)
-		result = 0;
+
 	cl_env_put(env, &refcheck);
 	return result;
 }
diff --git a/drivers/staging/lustre/lustre/llite/lcommon_misc.c b/drivers/staging/lustre/lustre/llite/lcommon_misc.c
index f48660ed350f..f0c132e2cf92 100644
--- a/drivers/staging/lustre/lustre/llite/lcommon_misc.c
+++ b/drivers/staging/lustre/lustre/llite/lcommon_misc.c
@@ -33,6 +33,7 @@
  * future).
  *
  */
+#define DEBUG_SUBSYSTEM S_LLITE
 #include "../include/obd_class.h"
 #include "../include/obd_support.h"
 #include "../include/obd.h"
@@ -132,7 +133,6 @@ int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock,
 
 	io = vvp_env_thread_io(env);
 	io->ci_obj = obj;
-	io->ci_ignore_layout = 1;
 
 	rc = cl_io_init(env, io, CIT_MISC, io->ci_obj);
 	if (rc != 0) {
diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h
index 065a9a7e120a..ecdfd0c29b7f 100644
--- a/drivers/staging/lustre/lustre/llite/llite_internal.h
+++ b/drivers/staging/lustre/lustre/llite/llite_internal.h
@@ -281,10 +281,8 @@ static inline struct ll_inode_info *ll_i2info(struct inode *inode)
 	return container_of(inode, struct ll_inode_info, lli_vfs_inode);
 }
 
-/* default to about 40meg of readahead on a given system.  That much tied
- * up in 512k readahead requests serviced at 40ms each is about 1GB/s.
- */
-#define SBI_DEFAULT_READAHEAD_MAX (40UL << (20 - PAGE_SHIFT))
+/* default to about 64M of readahead on a given system. */
+#define SBI_DEFAULT_READAHEAD_MAX	(64UL << (20 - PAGE_SHIFT))
 
 /* default to read-ahead full files smaller than 2MB on the second read */
 #define SBI_DEFAULT_READAHEAD_WHOLE_MAX (2UL << (20 - PAGE_SHIFT))
@@ -321,6 +319,9 @@ struct ll_ra_info {
 struct ra_io_arg {
 	unsigned long ria_start;  /* start offset of read-ahead*/
 	unsigned long ria_end;    /* end offset of read-ahead*/
+	unsigned long ria_reserved; /* reserved pages for read-ahead */
+	unsigned long ria_end_min;  /* minimum end to cover current read */
+	bool ria_eof;		    /* reach end of file */
 	/* If stride read pattern is detected, ria_stoff means where
 	 * stride read is started. Note: for normal read-ahead, the
 	 * value here is meaningless, and also it will not be accessed
@@ -505,6 +506,7 @@ struct ll_sb_info {
 						 */
 	/* root squash */
 	struct root_squash_info	  ll_squash;
+	struct path		 ll_mnt;
 
 	__kernel_fsid_t		  ll_fsid;
 	struct kobject		 ll_kobj; /* sysfs object */
@@ -551,6 +553,11 @@ struct ll_readahead_state {
 	 */
 	unsigned long   ras_window_start, ras_window_len;
 	/*
+	 * Optimal RPC size. It decides how many pages will be sent
+	 * for each read-ahead.
+	 */
+	unsigned long	ras_rpc_size;
+	/*
 	 * Where next read-ahead should start at. This lies within read-ahead
 	 * window. Read-ahead window is read in pieces rather than at once
 	 * because: 1. lustre limits total number of pages under read-ahead by
@@ -766,6 +773,7 @@ int ll_merge_attr(const struct lu_env *env, struct inode *inode);
 int ll_fid2path(struct inode *inode, void __user *arg);
 int ll_data_version(struct inode *inode, __u64 *data_version, int flags);
 int ll_hsm_release(struct inode *inode);
+int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss);
 
 /* llite/dcache.c */
 
diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c
index 25f5aed97f63..b229cbc7bb33 100644
--- a/drivers/staging/lustre/lustre/llite/llite_lib.c
+++ b/drivers/staging/lustre/lustre/llite/llite_lib.c
@@ -103,6 +103,7 @@ static struct ll_sb_info *ll_init_sbi(struct super_block *sb)
 	sbi->ll_flags |= LL_SBI_CHECKSUM;
 
 	sbi->ll_flags |= LL_SBI_LRU_RESIZE;
+	sbi->ll_flags |= LL_SBI_LAZYSTATFS;
 
 	for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) {
 		spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i].
@@ -303,6 +304,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt,
 	sb->s_magic = LL_SUPER_MAGIC;
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sbi->ll_namelen = osfs->os_namelen;
+	sbi->ll_mnt.mnt = current->fs->root.mnt;
 
 	if ((sbi->ll_flags & LL_SBI_USER_XATTR) &&
 	    !(data->ocd_connect_flags & OBD_CONNECT_XATTR)) {
@@ -1402,7 +1404,11 @@ static int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data)
 	 * cache is not cleared yet.
 	 */
 	op_data->op_attr.ia_valid &= ~(TIMES_SET_FLAGS | ATTR_SIZE);
+	if (S_ISREG(inode->i_mode))
+		inode_lock(inode);
 	rc = simple_setattr(dentry, &op_data->op_attr);
+	if (S_ISREG(inode->i_mode))
+		inode_unlock(inode);
 	op_data->op_attr.ia_valid = ia_valid;
 
 	rc = ll_update_inode(inode, &md);
@@ -1431,7 +1437,6 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import)
 	struct inode *inode = d_inode(dentry);
 	struct ll_inode_info *lli = ll_i2info(inode);
 	struct md_op_data *op_data = NULL;
-	bool file_is_released = false;
 	int rc = 0;
 
 	CDEBUG(D_VFSTRACE, "%s: setattr inode "DFID"(%p) from %llu to %llu, valid %x, hsm_import %d\n",
@@ -1486,76 +1491,35 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import)
 		       LTIME_S(attr->ia_mtime), LTIME_S(attr->ia_ctime),
 		       (s64)ktime_get_real_seconds());
 
-	/* We always do an MDS RPC, even if we're only changing the size;
-	 * only the MDS knows whether truncate() should fail with -ETXTBUSY
-	 */
-
-	op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
-	if (!op_data)
-		return -ENOMEM;
-
-	if (!S_ISDIR(inode->i_mode))
+	if (S_ISREG(inode->i_mode))
 		inode_unlock(inode);
 
-	/* truncate on a released file must failed with -ENODATA,
-	 * so size must not be set on MDS for released file
-	 * but other attributes must be set
+	/*
+	 * We always do an MDS RPC, even if we're only changing the size;
+	 * only the MDS knows whether truncate() should fail with -ETXTBUSY
 	 */
-	if (S_ISREG(inode->i_mode)) {
-		struct cl_layout cl = {
-			.cl_is_released = false,
-		};
-		struct lu_env *env;
-		int refcheck;
-		__u32 gen;
+	op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
+	if (!op_data) {
+		rc = -ENOMEM;
+		goto out;
+	}
 
-		rc = ll_layout_refresh(inode, &gen);
-		if (rc < 0)
-			goto out;
+	op_data->op_attr = *attr;
 
+	if (!hsm_import && attr->ia_valid & ATTR_SIZE) {
 		/*
-		 * XXX: the only place we need to know the layout type,
-		 * this will be removed by a later patch. -Jinshan
+		 * If we are changing file size, file content is
+		 * modified, flag it.
 		 */
-		env = cl_env_get(&refcheck);
-		if (IS_ERR(env)) {
-			rc = PTR_ERR(env);
-			goto out;
-		}
-
-		rc = cl_object_layout_get(env, lli->lli_clob, &cl);
-		cl_env_put(env, &refcheck);
-		if (rc < 0)
-			goto out;
-
-		file_is_released = cl.cl_is_released;
-
-		if (!hsm_import && attr->ia_valid & ATTR_SIZE) {
-			if (file_is_released) {
-				rc = ll_layout_restore(inode, 0, attr->ia_size);
-				if (rc < 0)
-					goto out;
-
-				file_is_released = false;
-				ll_layout_refresh(inode, &gen);
-			}
-
-			/*
-			 * If we are changing file size, file content is
-			 * modified, flag it.
-			 */
-			attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE;
-			op_data->op_bias |= MDS_DATA_MODIFIED;
-		}
+		attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE;
+		op_data->op_bias |= MDS_DATA_MODIFIED;
 	}
 
-	memcpy(&op_data->op_attr, attr, sizeof(*attr));
-
 	rc = ll_md_setattr(dentry, op_data);
 	if (rc)
 		goto out;
 
-	if (!S_ISREG(inode->i_mode) || file_is_released) {
+	if (!S_ISREG(inode->i_mode) || hsm_import) {
 		rc = 0;
 		goto out;
 	}
@@ -1572,11 +1536,40 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import)
 		 */
 		rc = cl_setattr_ost(ll_i2info(inode)->lli_clob, attr, 0);
 	}
+
+	/*
+	 * If the file was restored, it needs to set dirty flag.
+	 *
+	 * We've already sent MDS_DATA_MODIFIED flag in
+	 * ll_md_setattr() for truncate. However, the MDT refuses to
+	 * set the HS_DIRTY flag on released files, so we have to set
+	 * it again if the file has been restored. Please check how
+	 * LLIF_DATA_MODIFIED is set in vvp_io_setattr_fini().
+	 *
+	 * Please notice that if the file is not released, the previous
+	 * MDS_DATA_MODIFIED has taken effect and usually
+	 * LLIF_DATA_MODIFIED is not set(see vvp_io_setattr_fini()).
+	 * This way we can save an RPC for common open + trunc
+	 * operation.
+	 */
+	if (test_and_clear_bit(LLIF_DATA_MODIFIED, &lli->lli_flags)) {
+		struct hsm_state_set hss = {
+			.hss_valid = HSS_SETMASK,
+			.hss_setmask = HS_DIRTY,
+		};
+		int rc2;
+
+		rc2 = ll_hsm_state_set(inode, &hss);
+		if (rc2 < 0)
+			CERROR(DFID "HSM set dirty failed: rc2 = %d\n",
+			       PFID(ll_inode2fid(inode)), rc2);
+	}
+
 out:
 	if (op_data)
 		ll_finish_md_op_data(op_data);
 
-	if (!S_ISDIR(inode->i_mode)) {
+	if (S_ISREG(inode->i_mode)) {
 		inode_lock(inode);
 		if ((attr->ia_valid & ATTR_SIZE) && !hsm_import)
 			inode_dio_wait(inode);
@@ -1599,7 +1592,7 @@ int ll_setattr(struct dentry *de, struct iattr *attr)
 	if (((attr->ia_valid & (ATTR_MODE | ATTR_FORCE | ATTR_SIZE)) ==
 			       (ATTR_SIZE | ATTR_MODE)) &&
 	    (((mode & S_ISUID) && !(attr->ia_mode & S_ISUID)) ||
-	     (((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) &&
+	     (((mode & (S_ISGID | 0010)) == (S_ISGID | 0010)) &&
 	      !(attr->ia_mode & S_ISGID))))
 		attr->ia_valid |= ATTR_FORCE;
 
@@ -1610,7 +1603,7 @@ int ll_setattr(struct dentry *de, struct iattr *attr)
 		attr->ia_valid |= ATTR_KILL_SUID;
 
 	if ((attr->ia_valid & ATTR_MODE) &&
-	    ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) &&
+	    ((mode & (S_ISGID | 0010)) == (S_ISGID | 0010)) &&
 	    !(attr->ia_mode & S_ISGID) &&
 	    !(attr->ia_valid & ATTR_KILL_SGID))
 		attr->ia_valid |= ATTR_KILL_SGID;
@@ -1998,6 +1991,8 @@ void ll_umount_begin(struct super_block *sb)
 	struct ll_sb_info *sbi = ll_s2sbi(sb);
 	struct obd_device *obd;
 	struct obd_ioctl_data *ioc_data;
+	wait_queue_head_t waitq;
+	struct l_wait_info lwi;
 
 	CDEBUG(D_VFSTRACE, "VFS Op: superblock %p count %d active %d\n", sb,
 	       sb->s_count, atomic_read(&sb->s_active));
@@ -2030,9 +2025,14 @@ void ll_umount_begin(struct super_block *sb)
 	}
 
 	/* Really, we'd like to wait until there are no requests outstanding,
-	 * and then continue.  For now, we just invalidate the requests,
-	 * schedule() and sleep one second if needed, and hope.
+	 * and then continue. For now, we just periodically checking for vfs
+	 * to decrement mnt_cnt and hope to finish it within 10sec.
 	 */
+	init_waitqueue_head(&waitq);
+	lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(10),
+				   cfs_time_seconds(1), NULL, NULL);
+	l_wait_event(waitq, may_umount(sbi->ll_mnt.mnt), &lwi);
+
 	schedule();
 }
 
diff --git a/drivers/staging/lustre/lustre/llite/llite_mmap.c b/drivers/staging/lustre/lustre/llite/llite_mmap.c
index ee01f20d8b11..9afa6bec3e6f 100644
--- a/drivers/staging/lustre/lustre/llite/llite_mmap.c
+++ b/drivers/staging/lustre/lustre/llite/llite_mmap.c
@@ -390,15 +390,13 @@ static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		result = VM_FAULT_LOCKED;
 		break;
 	case -ENODATA:
+	case -EAGAIN:
 	case -EFAULT:
 		result = VM_FAULT_NOPAGE;
 		break;
 	case -ENOMEM:
 		result = VM_FAULT_OOM;
 		break;
-	case -EAGAIN:
-		result = VM_FAULT_RETRY;
-		break;
 	default:
 		result = VM_FAULT_SIGBUS;
 		break;
diff --git a/drivers/staging/lustre/lustre/llite/lproc_llite.c b/drivers/staging/lustre/lustre/llite/lproc_llite.c
index 03682c10fc9e..f3ee584157e0 100644
--- a/drivers/staging/lustre/lustre/llite/lproc_llite.c
+++ b/drivers/staging/lustre/lustre/llite/lproc_llite.c
@@ -924,27 +924,29 @@ static ssize_t ll_unstable_stats_seq_write(struct file *file,
 }
 LPROC_SEQ_FOPS(ll_unstable_stats);
 
-static ssize_t root_squash_show(struct kobject *kobj, struct attribute *attr,
-				char *buf)
+static int ll_root_squash_seq_show(struct seq_file *m, void *v)
 {
-	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
-					      ll_kobj);
+	struct super_block *sb = m->private;
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
 	struct root_squash_info *squash = &sbi->ll_squash;
 
-	return sprintf(buf, "%u:%u\n", squash->rsi_uid, squash->rsi_gid);
+	seq_printf(m, "%u:%u\n", squash->rsi_uid, squash->rsi_gid);
+	return 0;
 }
 
-static ssize_t root_squash_store(struct kobject *kobj, struct attribute *attr,
-				 const char *buffer, size_t count)
+static ssize_t ll_root_squash_seq_write(struct file *file,
+					const char __user *buffer,
+					size_t count, loff_t *off)
 {
-	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
-					      ll_kobj);
+	struct seq_file *m = file->private_data;
+	struct super_block *sb = m->private;
+	struct ll_sb_info *sbi = ll_s2sbi(sb);
 	struct root_squash_info *squash = &sbi->ll_squash;
 
 	return lprocfs_wr_root_squash(buffer, count, squash,
-				      ll_get_fsname(sbi->ll_sb, NULL, 0));
+				      ll_get_fsname(sb, NULL, 0));
 }
-LUSTRE_RW_ATTR(root_squash);
+LPROC_SEQ_FOPS(ll_root_squash);
 
 static int ll_nosquash_nids_seq_show(struct seq_file *m, void *v)
 {
@@ -997,6 +999,8 @@ static struct lprocfs_vars lprocfs_llite_obd_vars[] = {
 	{ "statahead_stats",  &ll_statahead_stats_fops, NULL, 0 },
 	{ "unstable_stats",   &ll_unstable_stats_fops, NULL },
 	{ "sbi_flags",	      &ll_sbi_flags_fops, NULL, 0 },
+	{ .name =       "root_squash",
+	  .fops =       &ll_root_squash_fops			},
 	{ .name =		"nosquash_nids",
 	  .fops =		&ll_nosquash_nids_fops		},
 	{ NULL }
@@ -1027,7 +1031,6 @@ static struct attribute *llite_attrs[] = {
 	&lustre_attr_max_easize.attr,
 	&lustre_attr_default_easize.attr,
 	&lustre_attr_xattr_cache.attr,
-	&lustre_attr_root_squash.attr,
 	NULL,
 };
 
diff --git a/drivers/staging/lustre/lustre/llite/namei.c b/drivers/staging/lustre/lustre/llite/namei.c
index a8f4e7fb0a46..fc176540bb95 100644
--- a/drivers/staging/lustre/lustre/llite/namei.c
+++ b/drivers/staging/lustre/lustre/llite/namei.c
@@ -994,11 +994,6 @@ static int ll_create_nd(struct inode *dir, struct dentry *dentry,
 	return rc;
 }
 
-/* ll_unlink() doesn't update the inode with the new link count.
- * Instead, ll_ddelete() and ll_d_iput() will update it based upon if there
- * is any lock existing. They will recycle dentries and inodes based upon locks
- * too. b=20433
- */
 static int ll_unlink(struct inode *dir, struct dentry *dchild)
 {
 	struct ptlrpc_request *request = NULL;
@@ -1041,7 +1036,7 @@ static int ll_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 
 	if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir)))
 		mode &= ~current_umask();
-	mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR;
+	mode = (mode & (0777 | S_ISVTX)) | S_IFDIR;
 
 	err = ll_new_node(dir, dentry, NULL, mode, 0, LUSTRE_OPC_MKDIR);
 	if (!err)
@@ -1089,7 +1084,7 @@ static int ll_symlink(struct inode *dir, struct dentry *dentry,
 	CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir="DFID"(%p),target=%.*s\n",
 	       dentry, PFID(ll_inode2fid(dir)), dir, 3000, oldname);
 
-	err = ll_new_node(dir, dentry, oldname, S_IFLNK | S_IRWXUGO,
+	err = ll_new_node(dir, dentry, oldname, S_IFLNK | 0777,
 			  0, LUSTRE_OPC_SYMLINK);
 
 	if (!err)
diff --git a/drivers/staging/lustre/lustre/llite/range_lock.c b/drivers/staging/lustre/lustre/llite/range_lock.c
index 94c818f1478b..14148a097476 100644
--- a/drivers/staging/lustre/lustre/llite/range_lock.c
+++ b/drivers/staging/lustre/lustre/llite/range_lock.c
@@ -61,17 +61,23 @@ void range_lock_tree_init(struct range_lock_tree *tree)
  * Pre:  Caller should have allocated the range lock node.
  * Post: The range lock node is meant to cover [start, end] region
  */
-void range_lock_init(struct range_lock *lock, __u64 start, __u64 end)
+int range_lock_init(struct range_lock *lock, __u64 start, __u64 end)
 {
+	int rc;
+
 	memset(&lock->rl_node, 0, sizeof(lock->rl_node));
 	if (end != LUSTRE_EOF)
 		end >>= PAGE_SHIFT;
-	interval_set(&lock->rl_node, start >> PAGE_SHIFT, end);
+	rc = interval_set(&lock->rl_node, start >> PAGE_SHIFT, end);
+	if (rc)
+		return rc;
+
 	INIT_LIST_HEAD(&lock->rl_next_lock);
 	lock->rl_task = NULL;
 	lock->rl_lock_count = 0;
 	lock->rl_blocking_ranges = 0;
 	lock->rl_sequence = 0;
+	return rc;
 }
 
 static inline struct range_lock *next_lock(struct range_lock *lock)
diff --git a/drivers/staging/lustre/lustre/llite/range_lock.h b/drivers/staging/lustre/lustre/llite/range_lock.h
index c6d04a6f99fd..779091ccec4e 100644
--- a/drivers/staging/lustre/lustre/llite/range_lock.h
+++ b/drivers/staging/lustre/lustre/llite/range_lock.h
@@ -76,7 +76,7 @@ struct range_lock_tree {
 };
 
 void range_lock_tree_init(struct range_lock_tree *tree);
-void range_lock_init(struct range_lock *lock, __u64 start, __u64 end);
+int range_lock_init(struct range_lock *lock, __u64 start, __u64 end);
 int  range_lock(struct range_lock_tree *tree, struct range_lock *lock);
 void range_unlock(struct range_lock_tree *tree, struct range_lock *lock);
 #endif
diff --git a/drivers/staging/lustre/lustre/llite/rw.c b/drivers/staging/lustre/lustre/llite/rw.c
index f10e092979fe..50d027e0cfab 100644
--- a/drivers/staging/lustre/lustre/llite/rw.c
+++ b/drivers/staging/lustre/lustre/llite/rw.c
@@ -92,25 +92,6 @@ static unsigned long ll_ra_count_get(struct ll_sb_info *sbi,
 		goto out;
 	}
 
-	/* If the non-strided (ria_pages == 0) readahead window
-	 * (ria_start + ret) has grown across an RPC boundary, then trim
-	 * readahead size by the amount beyond the RPC so it ends on an
-	 * RPC boundary. If the readahead window is already ending on
-	 * an RPC boundary (beyond_rpc == 0), or smaller than a full
-	 * RPC (beyond_rpc < ret) the readahead size is unchanged.
-	 * The (beyond_rpc != 0) check is skipped since the conditional
-	 * branch is more expensive than subtracting zero from the result.
-	 *
-	 * Strided read is left unaligned to avoid small fragments beyond
-	 * the RPC boundary from needing an extra read RPC.
-	 */
-	if (ria->ria_pages == 0) {
-		long beyond_rpc = (ria->ria_start + ret) % PTLRPC_MAX_BRW_PAGES;
-
-		if (/* beyond_rpc != 0 && */ beyond_rpc < ret)
-			ret -= beyond_rpc;
-	}
-
 	if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) {
 		atomic_sub(ret, &ra->ra_cur_pages);
 		ret = 0;
@@ -147,11 +128,12 @@ void ll_ra_stats_inc(struct inode *inode, enum ra_stat which)
 
 #define RAS_CDEBUG(ras) \
 	CDEBUG(D_READA,						      \
-	       "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu"    \
-	       "csr %lu sf %lu sp %lu sl %lu\n",			    \
+	       "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu rpc %lu "	     \
+	       "r %lu ri %lu csr %lu sf %lu sp %lu sl %lu\n",		     \
 	       ras->ras_last_readpage, ras->ras_consecutive_requests,	\
 	       ras->ras_consecutive_pages, ras->ras_window_start,	    \
 	       ras->ras_window_len, ras->ras_next_readahead,		 \
+	       ras->ras_rpc_size,					     \
 	       ras->ras_requests, ras->ras_request_index,		    \
 	       ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \
 	       ras->ras_stride_pages, ras->ras_stride_length)
@@ -261,20 +243,6 @@ out:
 	ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\
 	ria->ria_pages)
 
-/* Limit this to the blocksize instead of PTLRPC_BRW_MAX_SIZE, since we don't
- * know what the actual RPC size is.  If this needs to change, it makes more
- * sense to tune the i_blkbits value for the file based on the OSTs it is
- * striped over, rather than having a constant value for all files here.
- */
-
-/* RAS_INCREASE_STEP should be (1UL << (inode->i_blkbits - PAGE_SHIFT)).
- * Temporarily set RAS_INCREASE_STEP to 1MB. After 4MB RPC is enabled
- * by default, this should be adjusted corresponding with max_read_ahead_mb
- * and max_read_ahead_per_file_mb otherwise the readahead budget can be used
- * up quickly which will affect read performance significantly. See LU-2816
- */
-#define RAS_INCREASE_STEP(inode) (ONE_MB_BRW_SIZE >> PAGE_SHIFT)
-
 static inline int stride_io_mode(struct ll_readahead_state *ras)
 {
 	return ras->ras_consecutive_stride_requests > 1;
@@ -345,6 +313,17 @@ static int ria_page_count(struct ra_io_arg *ria)
 			       length);
 }
 
+static unsigned long ras_align(struct ll_readahead_state *ras,
+			       unsigned long index,
+			       unsigned long *remainder)
+{
+	unsigned long rem = index % ras->ras_rpc_size;
+
+	if (remainder)
+		*remainder = rem;
+	return index - rem;
+}
+
 /*Check whether the index is in the defined ra-window */
 static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria)
 {
@@ -358,42 +337,63 @@ static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria)
 		ria->ria_length < ria->ria_pages);
 }
 
-static int ll_read_ahead_pages(const struct lu_env *env,
-			       struct cl_io *io, struct cl_page_list *queue,
-			       struct ra_io_arg *ria,
-			       unsigned long *reserved_pages,
-			       pgoff_t *ra_end)
+static unsigned long
+ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io,
+		    struct cl_page_list *queue, struct ll_readahead_state *ras,
+		    struct ra_io_arg *ria)
 {
 	struct cl_read_ahead ra = { 0 };
-	int rc, count = 0;
+	unsigned long ra_end = 0;
 	bool stride_ria;
 	pgoff_t page_idx;
+	int rc;
 
 	LASSERT(ria);
 	RIA_DEBUG(ria);
 
 	stride_ria = ria->ria_length > ria->ria_pages && ria->ria_pages > 0;
 	for (page_idx = ria->ria_start;
-	     page_idx <= ria->ria_end && *reserved_pages > 0; page_idx++) {
+	     page_idx <= ria->ria_end && ria->ria_reserved > 0; page_idx++) {
 		if (ras_inside_ra_window(page_idx, ria)) {
 			if (!ra.cra_end || ra.cra_end < page_idx) {
+				unsigned long end;
+
 				cl_read_ahead_release(env, &ra);
 
 				rc = cl_io_read_ahead(env, io, page_idx, &ra);
 				if (rc < 0)
 					break;
 
+				CDEBUG(D_READA, "idx: %lu, ra: %lu, rpc: %lu\n",
+				       page_idx, ra.cra_end, ra.cra_rpc_size);
 				LASSERTF(ra.cra_end >= page_idx,
 					 "object: %p, indcies %lu / %lu\n",
 					 io->ci_obj, ra.cra_end, page_idx);
+				/*
+				 * update read ahead RPC size.
+				 * NB: it's racy but doesn't matter
+				 */
+				if (ras->ras_rpc_size > ra.cra_rpc_size &&
+				    ra.cra_rpc_size > 0)
+					ras->ras_rpc_size = ra.cra_rpc_size;
+				/* trim it to align with optimal RPC size */
+				end = ras_align(ras, ria->ria_end + 1, NULL);
+				if (end > 0 && !ria->ria_eof)
+					ria->ria_end = end - 1;
+				if (ria->ria_end < ria->ria_end_min)
+					ria->ria_end = ria->ria_end_min;
+				if (ria->ria_end > ra.cra_end)
+					ria->ria_end = ra.cra_end;
 			}
 
-			/* If the page is inside the read-ahead window*/
+			/* If the page is inside the read-ahead window */
 			rc = ll_read_ahead_page(env, io, queue, page_idx);
-			if (!rc) {
-				(*reserved_pages)--;
-				count++;
-			}
+			if (rc < 0)
+				break;
+
+			ra_end = page_idx;
+			if (!rc)
+				ria->ria_reserved--;
 		} else if (stride_ria) {
 			/* If it is not in the read-ahead window, and it is
 			 * read-ahead mode, then check whether it should skip
@@ -420,8 +420,7 @@ static int ll_read_ahead_pages(const struct lu_env *env,
 	}
 	cl_read_ahead_release(env, &ra);
 
-	*ra_end = page_idx;
-	return count;
+	return ra_end;
 }
 
 static int ll_readahead(const struct lu_env *env, struct cl_io *io,
@@ -431,7 +430,7 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 	struct vvp_io *vio = vvp_env_io(env);
 	struct ll_thread_info *lti = ll_env_info(env);
 	struct cl_attr *attr = vvp_env_thread_attr(env);
-	unsigned long len, mlen = 0, reserved;
+	unsigned long len, mlen = 0;
 	pgoff_t ra_end, start = 0, end = 0;
 	struct inode *inode;
 	struct ra_io_arg *ria = &lti->lti_ria;
@@ -478,29 +477,15 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 	    end < vio->vui_ra_start + vio->vui_ra_count - 1)
 		end = vio->vui_ra_start + vio->vui_ra_count - 1;
 
-	if (end != 0) {
-		unsigned long rpc_boundary;
-		/*
-		 * Align RA window to an optimal boundary.
-		 *
-		 * XXX This would be better to align to cl_max_pages_per_rpc
-		 * instead of PTLRPC_MAX_BRW_PAGES, because the RPC size may
-		 * be aligned to the RAID stripe size in the future and that
-		 * is more important than the RPC size.
-		 */
-		/* Note: we only trim the RPC, instead of extending the RPC
-		 * to the boundary, so to avoid reading too much pages during
-		 * random reading.
-		 */
-		rpc_boundary = (end + 1) & (~(PTLRPC_MAX_BRW_PAGES - 1));
-		if (rpc_boundary > 0)
-			rpc_boundary--;
-
-		if (rpc_boundary  > start)
-			end = rpc_boundary;
+	if (end) {
+		unsigned long end_index;
 
 		/* Truncate RA window to end of file */
-		end = min(end, (unsigned long)((kms - 1) >> PAGE_SHIFT));
+		end_index = (unsigned long)((kms - 1) >> PAGE_SHIFT);
+		if (end_index <= end) {
+			end = end_index;
+			ria->ria_eof = true;
+		}
 
 		ras->ras_next_readahead = max(end, end + 1);
 		RAS_CDEBUG(ras);
@@ -535,28 +520,31 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 	/* at least to extend the readahead window to cover current read */
 	if (!hit && vio->vui_ra_valid &&
 	    vio->vui_ra_start + vio->vui_ra_count > ria->ria_start) {
+		unsigned long remainder;
+
 		/* to the end of current read window. */
 		mlen = vio->vui_ra_start + vio->vui_ra_count - ria->ria_start;
 		/* trim to RPC boundary */
-		start = ria->ria_start & (PTLRPC_MAX_BRW_PAGES - 1);
-		mlen = min(mlen, PTLRPC_MAX_BRW_PAGES - start);
+		ras_align(ras, ria->ria_start, &remainder);
+		mlen = min(mlen, ras->ras_rpc_size - remainder);
+		ria->ria_end_min = ria->ria_start + mlen;
 	}
 
-	reserved = ll_ra_count_get(ll_i2sbi(inode), ria, len, mlen);
-	if (reserved < len)
+	ria->ria_reserved = ll_ra_count_get(ll_i2sbi(inode), ria, len, mlen);
+	if (ria->ria_reserved < len)
 		ll_ra_stats_inc(inode, RA_STAT_MAX_IN_FLIGHT);
 
 	CDEBUG(D_READA, "reserved pages %lu/%lu/%lu, ra_cur %d, ra_max %lu\n",
-	       reserved, len, mlen,
+	       ria->ria_reserved, len, mlen,
 	       atomic_read(&ll_i2sbi(inode)->ll_ra_info.ra_cur_pages),
 	       ll_i2sbi(inode)->ll_ra_info.ra_max_pages);
 
-	ret = ll_read_ahead_pages(env, io, queue, ria, &reserved, &ra_end);
+	ra_end = ll_read_ahead_pages(env, io, queue, ras, ria);
 
-	if (reserved != 0)
-		ll_ra_count_put(ll_i2sbi(inode), reserved);
+	if (ria->ria_reserved)
+		ll_ra_count_put(ll_i2sbi(inode), ria->ria_reserved);
 
-	if (ra_end == end + 1 && ra_end == (kms >> PAGE_SHIFT))
+	if (ra_end == end && ra_end == (kms >> PAGE_SHIFT))
 		ll_ra_stats_inc(inode, RA_STAT_EOF);
 
 	/* if we didn't get to the end of the region we reserved from
@@ -568,13 +556,13 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 	CDEBUG(D_READA, "ra_end = %lu end = %lu stride end = %lu pages = %d\n",
 	       ra_end, end, ria->ria_end, ret);
 
-	if (ra_end != end + 1) {
+	if (ra_end > 0 && ra_end != end) {
 		ll_ra_stats_inc(inode, RA_STAT_FAILED_REACH_END);
 		spin_lock(&ras->ras_lock);
-		if (ra_end < ras->ras_next_readahead &&
+		if (ra_end <= ras->ras_next_readahead &&
 		    index_in_window(ra_end, ras->ras_window_start, 0,
 				    ras->ras_window_len)) {
-			ras->ras_next_readahead = ra_end;
+			ras->ras_next_readahead = ra_end + 1;
 			RAS_CDEBUG(ras);
 		}
 		spin_unlock(&ras->ras_lock);
@@ -586,7 +574,7 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 static void ras_set_start(struct inode *inode, struct ll_readahead_state *ras,
 			  unsigned long index)
 {
-	ras->ras_window_start = index & (~(RAS_INCREASE_STEP(inode) - 1));
+	ras->ras_window_start = ras_align(ras, index, NULL);
 }
 
 /* called with the ras_lock held or from places where it doesn't matter */
@@ -615,6 +603,7 @@ static void ras_stride_reset(struct ll_readahead_state *ras)
 void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras)
 {
 	spin_lock_init(&ras->ras_lock);
+	ras->ras_rpc_size = PTLRPC_MAX_BRW_PAGES;
 	ras_reset(inode, ras, 0);
 	ras->ras_requests = 0;
 }
@@ -719,12 +708,15 @@ static void ras_increase_window(struct inode *inode,
 	 * but current clio architecture does not support retrieve such
 	 * information from lower layer. FIXME later
 	 */
-	if (stride_io_mode(ras))
-		ras_stride_increase_window(ras, ra, RAS_INCREASE_STEP(inode));
-	else
-		ras->ras_window_len = min(ras->ras_window_len +
-					  RAS_INCREASE_STEP(inode),
-					  ra->ra_max_pages_per_file);
+	if (stride_io_mode(ras)) {
+		ras_stride_increase_window(ras, ra, ras->ras_rpc_size);
+	} else {
+		unsigned long wlen;
+
+		wlen = min(ras->ras_window_len + ras->ras_rpc_size,
+			   ra->ra_max_pages_per_file);
+		ras->ras_window_len = ras_align(ras, wlen, NULL);
+	}
 }
 
 static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
@@ -737,6 +729,10 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
 
 	spin_lock(&ras->ras_lock);
 
+	if (!hit)
+		CDEBUG(D_READA, DFID " pages at %lu miss.\n",
+		       PFID(ll_inode2fid(inode)), index);
+
 	ll_ra_stats_inc_sbi(sbi, hit ? RA_STAT_HIT : RA_STAT_MISS);
 
 	/* reset the read-ahead window in two cases.  First when the app seeks
@@ -852,6 +848,8 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
 		 * instead of ras_window_start, which is RPC aligned
 		 */
 		ras->ras_next_readahead = max(index, ras->ras_next_readahead);
+		ras->ras_window_start = max(ras->ras_stride_offset,
+					    ras->ras_window_start);
 	} else {
 		if (ras->ras_next_readahead < ras->ras_window_start)
 			ras->ras_next_readahead = ras->ras_window_start;
@@ -881,7 +879,7 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
 		 */
 		ras->ras_next_readahead = max(index, ras->ras_next_readahead);
 		ras->ras_stride_offset = index;
-		ras->ras_window_len = RAS_INCREASE_STEP(inode);
+		ras->ras_window_start = max(index, ras->ras_window_start);
 	}
 
 	/* The initial ras_window_len is set to the request size.  To avoid
@@ -1098,38 +1096,39 @@ static int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
 	struct cl_2queue *queue  = &io->ci_queue;
 	struct ll_sb_info *sbi = ll_i2sbi(inode);
 	struct vvp_page *vpg;
+	bool uptodate;
 	int rc = 0;
 
 	vpg = cl2vvp_page(cl_object_page_slice(page->cp_obj, page));
+	uptodate = vpg->vpg_defer_uptodate;
+
 	if (sbi->ll_ra_info.ra_max_pages_per_file > 0 &&
 	    sbi->ll_ra_info.ra_max_pages > 0) {
 		struct vvp_io *vio = vvp_env_io(env);
 		enum ras_update_flags flags = 0;
 
-		if (vpg->vpg_defer_uptodate)
+		if (uptodate)
 			flags |= LL_RAS_HIT;
 		if (!vio->vui_ra_valid)
 			flags |= LL_RAS_MMAP;
 		ras_update(sbi, inode, ras, vvp_index(vpg), flags);
 	}
 
-	if (vpg->vpg_defer_uptodate) {
+	cl_2queue_init(queue);
+	if (uptodate) {
 		vpg->vpg_ra_used = 1;
 		cl_page_export(env, page, 1);
+		cl_page_disown(env, io, page);
+	} else {
+		cl_page_list_add(&queue->c2_qin, page);
 	}
 
-	cl_2queue_init(queue);
-	/*
-	 * Add page into the queue even when it is marked uptodate above.
-	 * this will unlock it automatically as part of cl_page_list_disown().
-	 */
-	cl_page_list_add(&queue->c2_qin, page);
 	if (sbi->ll_ra_info.ra_max_pages_per_file > 0 &&
 	    sbi->ll_ra_info.ra_max_pages > 0) {
 		int rc2;
 
 		rc2 = ll_readahead(env, io, &queue->c2_qin, ras,
-				   vpg->vpg_defer_uptodate);
+				   uptodate);
 		CDEBUG(D_READA, DFID "%d pages read ahead at %lu\n",
 		       PFID(ll_inode2fid(inode)), rc2, vvp_index(vpg));
 	}
diff --git a/drivers/staging/lustre/lustre/llite/rw26.c b/drivers/staging/lustre/lustre/llite/rw26.c
index 21e06e5b514e..d89e79599199 100644
--- a/drivers/staging/lustre/lustre/llite/rw26.c
+++ b/drivers/staging/lustre/lustre/llite/rw26.c
@@ -345,6 +345,10 @@ static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter)
 	ssize_t tot_bytes = 0, result = 0;
 	long size = MAX_DIO_SIZE;
 
+	/* Check EOF by ourselves */
+	if (iov_iter_rw(iter) == READ && file_offset >= i_size_read(inode))
+		return 0;
+
 	/* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */
 	if ((file_offset & ~PAGE_MASK) || (count & ~PAGE_MASK))
 		return -EINVAL;
diff --git a/drivers/staging/lustre/lustre/llite/statahead.c b/drivers/staging/lustre/lustre/llite/statahead.c
index f1ee17f9ec0d..fb7c315b33cb 100644
--- a/drivers/staging/lustre/lustre/llite/statahead.c
+++ b/drivers/staging/lustre/lustre/llite/statahead.c
@@ -79,6 +79,8 @@ struct sa_entry {
 	struct inode	   *se_inode;
 	/* entry name */
 	struct qstr	     se_qstr;
+	/* entry fid */
+	struct lu_fid		se_fid;
 };
 
 static unsigned int sai_generation;
@@ -169,7 +171,7 @@ static inline int is_omitted_entry(struct ll_statahead_info *sai, __u64 index)
 /* allocate sa_entry and hash it to allow scanner process to find it */
 static struct sa_entry *
 sa_alloc(struct dentry *parent, struct ll_statahead_info *sai, __u64 index,
-	 const char *name, int len)
+	 const char *name, int len, const struct lu_fid *fid)
 {
 	struct ll_inode_info *lli;
 	struct sa_entry   *entry;
@@ -194,6 +196,7 @@ sa_alloc(struct dentry *parent, struct ll_statahead_info *sai, __u64 index,
 	entry->se_qstr.hash = full_name_hash(parent, name, len);
 	entry->se_qstr.len = len;
 	entry->se_qstr.name = dname;
+	entry->se_fid = *fid;
 
 	lli = ll_i2info(sai->sai_dentry->d_inode);
 	spin_lock(&lli->lli_sa_lock);
@@ -566,24 +569,8 @@ static void sa_instantiate(struct ll_statahead_info *sai,
 	}
 
 	child = entry->se_inode;
-	if (!child) {
-		/*
-		 * lookup.
-		 */
-		LASSERT(fid_is_zero(&minfo->mi_data.op_fid2));
-
-		/* XXX: No fid in reply, this is probably cross-ref case.
-		 * SA can't handle it yet.
-		 */
-		if (body->mbo_valid & OBD_MD_MDS) {
-			rc = -EAGAIN;
-			goto out;
-		}
-	} else {
-		/*
-		 * revalidate.
-		 */
-		/* unlinked and re-created with the same name */
+	if (child) {
+		/* revalidate; unlinked and re-created with the same name */
 		if (unlikely(!lu_fid_eq(&minfo->mi_data.op_fid2, &body->mbo_fid1))) {
 			entry->se_inode = NULL;
 			iput(child);
@@ -720,50 +707,42 @@ static int ll_statahead_interpret(struct ptlrpc_request *req,
 }
 
 /* finish async stat RPC arguments */
-static void sa_fini_data(struct md_enqueue_info *minfo,
-			 struct ldlm_enqueue_info *einfo)
+static void sa_fini_data(struct md_enqueue_info *minfo)
 {
-	LASSERT(minfo && einfo);
 	iput(minfo->mi_dir);
 	kfree(minfo);
-	kfree(einfo);
 }
 
 /**
  * prepare arguments for async stat RPC.
  */
-static int sa_prep_data(struct inode *dir, struct inode *child,
-			struct sa_entry *entry, struct md_enqueue_info **pmi,
-			struct ldlm_enqueue_info **pei)
+static struct md_enqueue_info *
+sa_prep_data(struct inode *dir, struct inode *child, struct sa_entry *entry)
 {
-	const struct qstr      *qstr = &entry->se_qstr;
 	struct md_enqueue_info   *minfo;
 	struct ldlm_enqueue_info *einfo;
 	struct md_op_data	*op_data;
 
-	einfo = kzalloc(sizeof(*einfo), GFP_NOFS);
-	if (!einfo)
-		return -ENOMEM;
-
 	minfo = kzalloc(sizeof(*minfo), GFP_NOFS);
-	if (!minfo) {
-		kfree(einfo);
-		return -ENOMEM;
-	}
+	if (!minfo)
+		return ERR_PTR(-ENOMEM);
 
-	op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, qstr->name,
-				     qstr->len, 0, LUSTRE_OPC_ANY, NULL);
+	op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, NULL, 0, 0,
+				     LUSTRE_OPC_ANY, NULL);
 	if (IS_ERR(op_data)) {
-		kfree(einfo);
 		kfree(minfo);
-		return PTR_ERR(op_data);
+		return (struct md_enqueue_info *)op_data;
 	}
 
+	if (!child)
+		op_data->op_fid2 = entry->se_fid;
+
 	minfo->mi_it.it_op = IT_GETATTR;
 	minfo->mi_dir = igrab(dir);
 	minfo->mi_cb = ll_statahead_interpret;
 	minfo->mi_cbdata = entry;
 
+	einfo = &minfo->mi_einfo;
 	einfo->ei_type   = LDLM_IBITS;
 	einfo->ei_mode   = it_to_lock_mode(&minfo->mi_it);
 	einfo->ei_cb_bl  = ll_md_blocking_ast;
@@ -771,26 +750,22 @@ static int sa_prep_data(struct inode *dir, struct inode *child,
 	einfo->ei_cb_gl  = NULL;
 	einfo->ei_cbdata = NULL;
 
-	*pmi = minfo;
-	*pei = einfo;
-
-	return 0;
+	return minfo;
 }
 
 /* async stat for file not found in dcache */
 static int sa_lookup(struct inode *dir, struct sa_entry *entry)
 {
 	struct md_enqueue_info   *minfo;
-	struct ldlm_enqueue_info *einfo;
 	int		       rc;
 
-	rc = sa_prep_data(dir, NULL, entry, &minfo, &einfo);
-	if (rc)
-		return rc;
+	minfo = sa_prep_data(dir, NULL, entry);
+	if (IS_ERR(minfo))
+		return PTR_ERR(minfo);
 
-	rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo);
+	rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo);
 	if (rc)
-		sa_fini_data(minfo, einfo);
+		sa_fini_data(minfo);
 
 	return rc;
 }
@@ -809,7 +784,6 @@ static int sa_revalidate(struct inode *dir, struct sa_entry *entry,
 	struct lookup_intent      it = { .it_op = IT_GETATTR,
 					 .it_lock_handle = 0 };
 	struct md_enqueue_info   *minfo;
-	struct ldlm_enqueue_info *einfo;
 	int rc;
 
 	if (unlikely(!inode))
@@ -827,25 +801,26 @@ static int sa_revalidate(struct inode *dir, struct sa_entry *entry,
 		return 1;
 	}
 
-	rc = sa_prep_data(dir, inode, entry, &minfo, &einfo);
-	if (rc) {
+	minfo = sa_prep_data(dir, inode, entry);
+	if (IS_ERR(minfo)) {
 		entry->se_inode = NULL;
 		iput(inode);
-		return rc;
+		return PTR_ERR(minfo);
 	}
 
-	rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo);
+	rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo);
 	if (rc) {
 		entry->se_inode = NULL;
 		iput(inode);
-		sa_fini_data(minfo, einfo);
+		sa_fini_data(minfo);
 	}
 
 	return rc;
 }
 
 /* async stat for file with @name */
-static void sa_statahead(struct dentry *parent, const char *name, int len)
+static void sa_statahead(struct dentry *parent, const char *name, int len,
+			 const struct lu_fid *fid)
 {
 	struct inode	     *dir    = d_inode(parent);
 	struct ll_inode_info     *lli    = ll_i2info(dir);
@@ -854,7 +829,7 @@ static void sa_statahead(struct dentry *parent, const char *name, int len)
 	struct sa_entry *entry;
 	int		       rc;
 
-	entry = sa_alloc(parent, sai, sai->sai_index, name, len);
+	entry = sa_alloc(parent, sai, sai->sai_index, name, len, fid);
 	if (IS_ERR(entry))
 		return;
 
@@ -1043,6 +1018,7 @@ static int ll_statahead_thread(void *arg)
 		for (ent = lu_dirent_start(dp);
 		     ent && thread_is_running(sa_thread) && !sa_low_hit(sai);
 		     ent = lu_dirent_next(ent)) {
+			struct lu_fid fid;
 			__u64 hash;
 			int namelen;
 			char *name;
@@ -1088,6 +1064,8 @@ static int ll_statahead_thread(void *arg)
 			if (unlikely(++first == 1))
 				continue;
 
+			fid_le_to_cpu(&fid, &ent->lde_fid);
+
 			/* wait for spare statahead window */
 			do {
 				l_wait_event(sa_thread->t_ctl_waitq,
@@ -1117,7 +1095,7 @@ static int ll_statahead_thread(void *arg)
 			} while (sa_sent_full(sai) &&
 				 thread_is_running(sa_thread));
 
-			sa_statahead(parent, name, namelen);
+			sa_statahead(parent, name, namelen, &fid);
 		}
 
 		pos = le64_to_cpu(dp->ldp_hash_end);
diff --git a/drivers/staging/lustre/lustre/llite/super25.c b/drivers/staging/lustre/lustre/llite/super25.c
index 106cd00910a7..4759802e062d 100644
--- a/drivers/staging/lustre/lustre/llite/super25.c
+++ b/drivers/staging/lustre/lustre/llite/super25.c
@@ -88,7 +88,7 @@ static int __init lustre_init(void)
 	struct timespec64 ts;
 	int i, rc, seed[2];
 
-	CLASSERT(sizeof(LUSTRE_VOLATILE_HDR) == LUSTRE_VOLATILE_HDR_LEN + 1);
+	BUILD_BUG_ON(sizeof(LUSTRE_VOLATILE_HDR) != LUSTRE_VOLATILE_HDR_LEN + 1);
 
 	/* print an address of _any_ initialized kernel symbol from this
 	 * module, to allow debugging with gdb that doesn't support data
diff --git a/drivers/staging/lustre/lustre/llite/vvp_dev.c b/drivers/staging/lustre/lustre/llite/vvp_dev.c
index 12c129f7e4ad..3669ea77ee93 100644
--- a/drivers/staging/lustre/lustre/llite/vvp_dev.c
+++ b/drivers/staging/lustre/lustre/llite/vvp_dev.c
@@ -391,7 +391,7 @@ struct vvp_pgcache_id {
 
 static void vvp_pgcache_id_unpack(loff_t pos, struct vvp_pgcache_id *id)
 {
-	CLASSERT(sizeof(pos) == sizeof(__u64));
+	BUILD_BUG_ON(sizeof(pos) != sizeof(__u64));
 
 	id->vpi_index  = pos & 0xffffffff;
 	id->vpi_depth  = (pos >> PGC_DEPTH_SHIFT) & 0xf;
diff --git a/drivers/staging/lustre/lustre/llite/vvp_internal.h b/drivers/staging/lustre/lustre/llite/vvp_internal.h
index c60d0414ac25..f40fd7f115d1 100644
--- a/drivers/staging/lustre/lustre/llite/vvp_internal.h
+++ b/drivers/staging/lustre/lustre/llite/vvp_internal.h
@@ -301,8 +301,6 @@ static inline struct vvp_lock *cl2vvp_lock(const struct cl_lock_slice *slice)
 # define CLOBINVRNT(env, clob, expr)					\
 	((void)sizeof(env), (void)sizeof(clob), (void)sizeof(!!(expr)))
 
-int lov_read_and_clear_async_rc(struct cl_object *clob);
-
 int vvp_io_init(const struct lu_env *env, struct cl_object *obj,
 		struct cl_io *io);
 int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io);
diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c
index 697cbfbe9374..3e9cf710501b 100644
--- a/drivers/staging/lustre/lustre/llite/vvp_io.c
+++ b/drivers/staging/lustre/lustre/llite/vvp_io.c
@@ -288,7 +288,7 @@ static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
 	       io->ci_ignore_layout, io->ci_verify_layout,
 	       vio->vui_layout_gen, io->ci_restore_needed);
 
-	if (io->ci_restore_needed == 1) {
+	if (io->ci_restore_needed) {
 		int	rc;
 
 		/* file was detected release, we need to restore it
@@ -657,7 +657,15 @@ static void vvp_io_setattr_end(const struct lu_env *env,
 static void vvp_io_setattr_fini(const struct lu_env *env,
 				const struct cl_io_slice *ios)
 {
+	bool restore_needed = ios->cis_io->ci_restore_needed;
+	struct inode *inode = vvp_object_inode(ios->cis_obj);
+
 	vvp_io_fini(env, ios);
+
+	if (restore_needed && !ios->cis_io->ci_restore_needed) {
+		/* restore finished, set data modified flag for HSM */
+		set_bit(LLIF_DATA_MODIFIED, &(ll_i2info(inode))->lli_flags);
+	}
 }
 
 static int vvp_io_read_start(const struct lu_env *env,
@@ -1340,13 +1348,6 @@ int vvp_io_init(const struct lu_env *env, struct cl_object *obj,
 			io->ci_lockreq = CILR_MANDATORY;
 	}
 
-	/* ignore layout change for generic CIT_MISC but not for glimpse.
-	 * io context for glimpse must set ci_verify_layout to true,
-	 * see cl_glimpse_size0() for details.
-	 */
-	if (io->ci_type == CIT_MISC && !io->ci_verify_layout)
-		io->ci_ignore_layout = 1;
-
 	/* Enqueue layout lock and get layout version. We need to do this
 	 * even for operations requiring to open file, such as read and write,
 	 * because it might not grant layout lock in IT_OPEN.
diff --git a/drivers/staging/lustre/lustre/llite/vvp_page.c b/drivers/staging/lustre/lustre/llite/vvp_page.c
index 23d66308ff20..687c0c79d621 100644
--- a/drivers/staging/lustre/lustre/llite/vvp_page.c
+++ b/drivers/staging/lustre/lustre/llite/vvp_page.c
@@ -227,7 +227,8 @@ static int vvp_page_prep_write(const struct lu_env *env,
  * This takes inode as a separate argument, because inode on which error is to
  * be set can be different from \a vmpage inode in case of direct-io.
  */
-static void vvp_vmpage_error(struct inode *inode, struct page *vmpage, int ioret)
+static void vvp_vmpage_error(struct inode *inode, struct page *vmpage,
+			     int ioret)
 {
 	struct vvp_object *obj = cl_inode2vvp(inode);
 
diff --git a/drivers/staging/lustre/lustre/llite/xattr.c b/drivers/staging/lustre/lustre/llite/xattr.c
index 7a848ebc57c1..421cc04ecf1e 100644
--- a/drivers/staging/lustre/lustre/llite/xattr.c
+++ b/drivers/staging/lustre/lustre/llite/xattr.c
@@ -132,6 +132,15 @@ ll_xattr_set_common(const struct xattr_handler *handler,
 	    (!strcmp(name, "ima") || !strcmp(name, "evm")))
 		return -EOPNOTSUPP;
 
+	/*
+	 * In user.* namespace, only regular files and directories can have
+	 * extended attributes.
+	 */
+	if (handler->flags == XATTR_USER_T) {
+		if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+			return -EPERM;
+	}
+
 	sprintf(fullname, "%s%s\n", handler->prefix, name);
 	rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
 			 valid, fullname, pv, size, 0, flags,