From 38cb87ee47fb825f6c9d645c019f75b3905c0ab2 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 22 Aug 2018 13:52:21 +0200 Subject: cfg80211: make wmm_rule part of the reg_rule structure Make wmm_rule be part of the reg_rule structure. This simplifies the code a lot at the cost of having bigger memory usage. However in most cases we have only few reg_rule's and when we do have many like in iwlwifi we do not save memory as it allocates a separate wmm_rule for each channel anyway. This also fixes a bug reported in various places where somewhere the pointers were corrupted and we ended up doing a null-dereference. Fixes: 230ebaa189af ("cfg80211: read wmm rules from regulatory database") Signed-off-by: Stanislaw Gruszka [rephrase commit message slightly] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++-- include/net/regulatory.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 1beb3ead0385..7229c186d199 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4763,8 +4763,8 @@ const char *reg_initiator_name(enum nl80211_reg_initiator initiator); * * Return: 0 on success. -ENODATA. */ -int reg_query_regdb_wmm(char *alpha2, int freq, u32 *ptr, - struct ieee80211_wmm_rule *rule); +int reg_query_regdb_wmm(char *alpha2, int freq, + struct ieee80211_reg_rule *rule); /* * callbacks for asynchronous cfg80211 methods, notification diff --git a/include/net/regulatory.h b/include/net/regulatory.h index 60f8cc86a447..3469750df0f4 100644 --- a/include/net/regulatory.h +++ b/include/net/regulatory.h @@ -217,15 +217,15 @@ struct ieee80211_wmm_rule { struct ieee80211_reg_rule { struct ieee80211_freq_range freq_range; struct ieee80211_power_rule power_rule; - struct ieee80211_wmm_rule *wmm_rule; + struct ieee80211_wmm_rule wmm_rule; u32 flags; u32 dfs_cac_ms; + bool has_wmm; }; struct ieee80211_regdomain { struct rcu_head rcu_head; u32 n_reg_rules; - u32 n_wmm_rules; char alpha2[3]; enum nl80211_dfs_regions dfs_region; struct ieee80211_reg_rule reg_rules[]; -- cgit v1.3-6-gb490 From 0434ccdcf883e53ec7156a6843943e940dc1feb8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 24 Aug 2018 08:43:36 +0200 Subject: netfilter: nf_tables: rework ct timeout set support Using a private template is problematic: 1. We can't assign both a zone and a timeout policy (zone assigns a conntrack template, so we hit problem 1) 2. Using a template needs to take care of ct refcount, else we'll eventually free the private template due to ->use underflow. This patch reworks template policy to instead work with existing conntrack. As long as such conntrack has not yet been placed into the hash table (unconfirmed) we can still add the timeout extension. The only caveat is that we now need to update/correct ct->timeout to reflect the initial/new state, otherwise the conntrack entry retains the default 'new' timeout. Side effect of this change is that setting the policy must now occur from chains that are evaluated *after* the conntrack lookup has taken place. No released kernel contains the timeout policy feature yet, so this change should be ok. Changes since v2: - don't handle 'ct is confirmed case' - after previous patch, no need to special-case tcp/dccp/sctp timeout anymore Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_timeout.h | 2 +- net/netfilter/nft_ct.c | 59 ++++++++++++++-------------- 2 files changed, 30 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index d5f62cc6c2ae..3394d75e1c80 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -30,7 +30,7 @@ struct nf_conn_timeout { }; static inline unsigned int * -nf_ct_timeout_data(struct nf_conn_timeout *t) +nf_ct_timeout_data(const struct nf_conn_timeout *t) { struct nf_ct_timeout *timeout; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 26a8baebd072..5dd87748afa8 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -799,7 +799,7 @@ err: } struct nft_ct_timeout_obj { - struct nf_conn *tmpl; + struct nf_ct_timeout *timeout; u8 l4proto; }; @@ -809,26 +809,42 @@ static void nft_ct_timeout_obj_eval(struct nft_object *obj, { const struct nft_ct_timeout_obj *priv = nft_obj_data(obj); struct nf_conn *ct = (struct nf_conn *)skb_nfct(pkt->skb); - struct sk_buff *skb = pkt->skb; + struct nf_conn_timeout *timeout; + const unsigned int *values; + + if (priv->l4proto != pkt->tprot) + return; - if (ct || - priv->l4proto != pkt->tprot) + if (!ct || nf_ct_is_template(ct) || nf_ct_is_confirmed(ct)) return; - nf_ct_set(skb, priv->tmpl, IP_CT_NEW); + timeout = nf_ct_timeout_find(ct); + if (!timeout) { + timeout = nf_ct_timeout_ext_add(ct, priv->timeout, GFP_ATOMIC); + if (!timeout) { + regs->verdict.code = NF_DROP; + return; + } + } + + rcu_assign_pointer(timeout->timeout, priv->timeout); + + /* adjust the timeout as per 'new' state. ct is unconfirmed, + * so the current timestamp must not be added. + */ + values = nf_ct_timeout_data(timeout); + if (values) + nf_ct_refresh(ct, pkt->skb, values[0]); } static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { - const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; struct nft_ct_timeout_obj *priv = nft_obj_data(obj); const struct nf_conntrack_l4proto *l4proto; - struct nf_conn_timeout *timeout_ext; struct nf_ct_timeout *timeout; int l3num = ctx->family; - struct nf_conn *tmpl; __u8 l4num; int ret; @@ -863,28 +879,14 @@ static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx, timeout->l3num = l3num; timeout->l4proto = l4proto; - tmpl = nf_ct_tmpl_alloc(ctx->net, zone, GFP_ATOMIC); - if (!tmpl) { - ret = -ENOMEM; - goto err_free_timeout; - } - - timeout_ext = nf_ct_timeout_ext_add(tmpl, timeout, GFP_ATOMIC); - if (!timeout_ext) { - ret = -ENOMEM; - goto err_free_tmpl; - } ret = nf_ct_netns_get(ctx->net, ctx->family); if (ret < 0) - goto err_free_tmpl; - - priv->tmpl = tmpl; + goto err_free_timeout; + priv->timeout = timeout; return 0; -err_free_tmpl: - nf_ct_tmpl_free(tmpl); err_free_timeout: kfree(timeout); err_proto_put: @@ -896,22 +898,19 @@ static void nft_ct_timeout_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { struct nft_ct_timeout_obj *priv = nft_obj_data(obj); - struct nf_conn_timeout *t = nf_ct_timeout_find(priv->tmpl); - struct nf_ct_timeout *timeout; + struct nf_ct_timeout *timeout = priv->timeout; - timeout = rcu_dereference_raw(t->timeout); nf_ct_untimeout(ctx->net, timeout); nf_ct_l4proto_put(timeout->l4proto); nf_ct_netns_put(ctx->net, ctx->family); - nf_ct_tmpl_free(priv->tmpl); + kfree(priv->timeout); } static int nft_ct_timeout_obj_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { const struct nft_ct_timeout_obj *priv = nft_obj_data(obj); - const struct nf_conn_timeout *t = nf_ct_timeout_find(priv->tmpl); - const struct nf_ct_timeout *timeout = rcu_dereference_raw(t->timeout); + const struct nf_ct_timeout *timeout = priv->timeout; struct nlattr *nest_params; int ret; -- cgit v1.3-6-gb490 From 45cd0faae3715e305bc46e23b34c5ed4d185ceb8 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 27 Aug 2018 15:56:02 +0300 Subject: vfs: add the fadvise() file operation This is going to be used by overlayfs and possibly useful for other filesystems. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- Documentation/filesystems/vfs.txt | 3 ++ include/linux/fs.h | 5 +++ mm/fadvise.c | 78 ++++++++++++++++++++++----------------- 3 files changed, 53 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index ec2142c8dbd3..a6c6a8af48a2 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -885,6 +885,7 @@ struct file_operations { ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, u64); int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, u64); + int (*fadvise)(struct file *, loff_t, loff_t, int); }; Again, all methods are called without any locks being held, unless @@ -965,6 +966,8 @@ otherwise noted. dedupe_file_range: called by the ioctl(2) system call for FIDEDUPERANGE command. + fadvise: possibly called by the fadvise64() system call. + Note that the file operations are implemented by the specific filesystem in which the inode resides. When opening a device node (character or block special) most filesystems will call special diff --git a/include/linux/fs.h b/include/linux/fs.h index 33322702c910..6c0b4a1c22ff 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1763,6 +1763,7 @@ struct file_operations { u64); int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, u64); + int (*fadvise)(struct file *, loff_t, loff_t, int); } __randomize_layout; struct inode_operations { @@ -3459,4 +3460,8 @@ static inline bool dir_relax_shared(struct inode *inode) extern bool path_noexec(const struct path *path); extern void inode_nohighmem(struct inode *inode); +/* mm/fadvise.c */ +extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, + int advice); + #endif /* _LINUX_FS_H */ diff --git a/mm/fadvise.c b/mm/fadvise.c index 2d8376e3c640..2f59bac1cb77 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -27,9 +27,9 @@ * deactivate the pages and clear PG_Referenced. */ -int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) +static int generic_fadvise(struct file *file, loff_t offset, loff_t len, + int advice) { - struct fd f = fdget(fd); struct inode *inode; struct address_space *mapping; struct backing_dev_info *bdi; @@ -37,22 +37,14 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) pgoff_t start_index; pgoff_t end_index; unsigned long nrpages; - int ret = 0; - if (!f.file) - return -EBADF; + inode = file_inode(file); + if (S_ISFIFO(inode->i_mode)) + return -ESPIPE; - inode = file_inode(f.file); - if (S_ISFIFO(inode->i_mode)) { - ret = -ESPIPE; - goto out; - } - - mapping = f.file->f_mapping; - if (!mapping || len < 0) { - ret = -EINVAL; - goto out; - } + mapping = file->f_mapping; + if (!mapping || len < 0) + return -EINVAL; bdi = inode_to_bdi(mapping->host); @@ -67,9 +59,9 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) /* no bad return value, but ignore advice */ break; default: - ret = -EINVAL; + return -EINVAL; } - goto out; + return 0; } /* @@ -85,21 +77,21 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) switch (advice) { case POSIX_FADV_NORMAL: - f.file->f_ra.ra_pages = bdi->ra_pages; - spin_lock(&f.file->f_lock); - f.file->f_mode &= ~FMODE_RANDOM; - spin_unlock(&f.file->f_lock); + file->f_ra.ra_pages = bdi->ra_pages; + spin_lock(&file->f_lock); + file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_RANDOM: - spin_lock(&f.file->f_lock); - f.file->f_mode |= FMODE_RANDOM; - spin_unlock(&f.file->f_lock); + spin_lock(&file->f_lock); + file->f_mode |= FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_SEQUENTIAL: - f.file->f_ra.ra_pages = bdi->ra_pages * 2; - spin_lock(&f.file->f_lock); - f.file->f_mode &= ~FMODE_RANDOM; - spin_unlock(&f.file->f_lock); + file->f_ra.ra_pages = bdi->ra_pages * 2; + spin_lock(&file->f_lock); + file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_WILLNEED: /* First and last PARTIAL page! */ @@ -115,8 +107,7 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) * Ignore return value because fadvise() shall return * success even if filesystem can't retrieve a hint, */ - force_page_cache_readahead(mapping, f.file, start_index, - nrpages); + force_page_cache_readahead(mapping, file, start_index, nrpages); break; case POSIX_FADV_NOREUSE: break; @@ -183,9 +174,30 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) } break; default: - ret = -EINVAL; + return -EINVAL; } -out: + return 0; +} + +int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice) +{ + if (file->f_op->fadvise) + return file->f_op->fadvise(file, offset, len, advice); + + return generic_fadvise(file, offset, len, advice); +} +EXPORT_SYMBOL(vfs_fadvise); + +int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) +{ + struct fd f = fdget(fd); + int ret; + + if (!f.file) + return -EBADF; + + ret = vfs_fadvise(f.file, offset, len, advice); + fdput(f); return ret; } -- cgit v1.3-6-gb490 From 6b06546206868f723f2061d703a3c3c378dcbf4c Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Fri, 31 Aug 2018 16:22:42 -0400 Subject: Revert "blk-throttle: fix race between blkcg_bio_issue_check() and cgroup_rmdir()" This reverts commit 4c6994806f708559c2812b73501406e21ae5dcd0. Destroying blkgs is tricky because of the nature of the relationship. A blkg should go away when either a blkcg or a request_queue goes away. However, blkg's pin the blkcg to ensure they remain valid. To break this cycle, when a blkcg is offlined, blkgs put back their css ref. This eventually lets css_free() get called which frees the blkcg. The above commit (4c6994806f70) breaks this order of events by trying to destroy blkgs in css_free(). As the blkgs still hold references to the blkcg, css_free() is never called. The race between blkcg_bio_issue_check() and cgroup_rmdir() will be addressed in the following patch by delaying destruction of a blkg until all writeback associated with the blkcg has been finished. Fixes: 4c6994806f70 ("blk-throttle: fix race between blkcg_bio_issue_check() and cgroup_rmdir()") Reviewed-by: Josef Bacik Signed-off-by: Dennis Zhou Cc: Jiufei Xue Cc: Joseph Qi Cc: Tejun Heo Cc: Jens Axboe Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 78 ++++++++++------------------------------------ include/linux/blk-cgroup.h | 1 - 2 files changed, 16 insertions(+), 63 deletions(-) (limited to 'include') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 694595b29b8f..2998e4f095d1 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -310,28 +310,11 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, } } -static void blkg_pd_offline(struct blkcg_gq *blkg) -{ - int i; - - lockdep_assert_held(blkg->q->queue_lock); - lockdep_assert_held(&blkg->blkcg->lock); - - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - - if (blkg->pd[i] && !blkg->pd[i]->offline && - pol->pd_offline_fn) { - pol->pd_offline_fn(blkg->pd[i]); - blkg->pd[i]->offline = true; - } - } -} - static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; struct blkcg_gq *parent = blkg->parent; + int i; lockdep_assert_held(blkg->q->queue_lock); lockdep_assert_held(&blkcg->lock); @@ -340,6 +323,13 @@ static void blkg_destroy(struct blkcg_gq *blkg) WARN_ON_ONCE(list_empty(&blkg->q_node)); WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (blkg->pd[i] && pol->pd_offline_fn) + pol->pd_offline_fn(blkg->pd[i]); + } + if (parent) { blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes); blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios); @@ -382,7 +372,6 @@ static void blkg_destroy_all(struct request_queue *q) struct blkcg *blkcg = blkg->blkcg; spin_lock(&blkcg->lock); - blkg_pd_offline(blkg); blkg_destroy(blkg); spin_unlock(&blkcg->lock); } @@ -1058,54 +1047,21 @@ static struct cftype blkcg_legacy_files[] = { * @css: css of interest * * This function is called when @css is about to go away and responsible - * for offlining all blkgs pd and killing all wbs associated with @css. - * blkgs pd offline should be done while holding both q and blkcg locks. - * As blkcg lock is nested inside q lock, this function performs reverse - * double lock dancing. + * for shooting down all blkgs associated with @css. blkgs should be + * removed while holding both q and blkcg locks. As blkcg lock is nested + * inside q lock, this function performs reverse double lock dancing. * * This is the blkcg counterpart of ioc_release_fn(). */ static void blkcg_css_offline(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); - struct blkcg_gq *blkg; spin_lock_irq(&blkcg->lock); - hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { - struct request_queue *q = blkg->q; - - if (spin_trylock(q->queue_lock)) { - blkg_pd_offline(blkg); - spin_unlock(q->queue_lock); - } else { - spin_unlock_irq(&blkcg->lock); - cpu_relax(); - spin_lock_irq(&blkcg->lock); - } - } - - spin_unlock_irq(&blkcg->lock); - - wb_blkcg_offline(blkcg); -} - -/** - * blkcg_destroy_all_blkgs - destroy all blkgs associated with a blkcg - * @blkcg: blkcg of interest - * - * This function is called when blkcg css is about to free and responsible for - * destroying all blkgs associated with @blkcg. - * blkgs should be removed while holding both q and blkcg locks. As blkcg lock - * is nested inside q lock, this function performs reverse double lock dancing. - */ -static void blkcg_destroy_all_blkgs(struct blkcg *blkcg) -{ - spin_lock_irq(&blkcg->lock); while (!hlist_empty(&blkcg->blkg_list)) { struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first, - struct blkcg_gq, - blkcg_node); + struct blkcg_gq, blkcg_node); struct request_queue *q = blkg->q; if (spin_trylock(q->queue_lock)) { @@ -1117,7 +1073,10 @@ static void blkcg_destroy_all_blkgs(struct blkcg *blkcg) spin_lock_irq(&blkcg->lock); } } + spin_unlock_irq(&blkcg->lock); + + wb_blkcg_offline(blkcg); } static void blkcg_css_free(struct cgroup_subsys_state *css) @@ -1125,8 +1084,6 @@ static void blkcg_css_free(struct cgroup_subsys_state *css) struct blkcg *blkcg = css_to_blkcg(css); int i; - blkcg_destroy_all_blkgs(blkcg); - mutex_lock(&blkcg_pol_mutex); list_del(&blkcg->all_blkcgs_node); @@ -1480,11 +1437,8 @@ void blkcg_deactivate_policy(struct request_queue *q, list_for_each_entry(blkg, &q->blkg_list, q_node) { if (blkg->pd[pol->plid]) { - if (!blkg->pd[pol->plid]->offline && - pol->pd_offline_fn) { + if (pol->pd_offline_fn) pol->pd_offline_fn(blkg->pd[pol->plid]); - blkg->pd[pol->plid]->offline = true; - } pol->pd_free_fn(blkg->pd[pol->plid]); blkg->pd[pol->plid] = NULL; } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 34aec30e06c7..1615cdd4c797 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -89,7 +89,6 @@ struct blkg_policy_data { /* the blkg and policy id this per-policy data belongs to */ struct blkcg_gq *blkg; int plid; - bool offline; }; /* -- cgit v1.3-6-gb490 From 59b57717fff8b562825d9d25e0180ad7e8048ca9 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Fri, 31 Aug 2018 16:22:43 -0400 Subject: blkcg: delay blkg destruction until after writeback has finished Currently, blkcg destruction relies on a sequence of events: 1. Destruction starts. blkcg_css_offline() is called and blkgs release their reference to the blkcg. This immediately destroys the cgwbs (writeback). 2. With blkgs giving up their reference, the blkcg ref count should become zero and eventually call blkcg_css_free() which finally frees the blkcg. Jiufei Xue reported that there is a race between blkcg_bio_issue_check() and cgroup_rmdir(). To remedy this, blkg destruction becomes contingent on the completion of all writeback associated with the blkcg. A count of the number of cgwbs is maintained and once that goes to zero, blkg destruction can follow. This should prevent premature blkg destruction related to writeback. The new process for blkcg cleanup is as follows: 1. Destruction starts. blkcg_css_offline() is called which offlines writeback. Blkg destruction is delayed on the cgwb_refcnt count to avoid punting potentially large amounts of outstanding writeback to root while maintaining any ongoing policies. Here, the base cgwb_refcnt is put back. 2. When the cgwb_refcnt becomes zero, blkcg_destroy_blkgs() is called and handles destruction of blkgs. This is where the css reference held by each blkg is released. 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called. This finally frees the blkg. It seems in the past blk-throttle didn't do the most understandable things with taking data from a blkg while associating with current. So, the simplification and unification of what blk-throttle is doing caused this. Fixes: 08e18eab0c579 ("block: add bi_blkg to the bio for cgroups") Reviewed-by: Josef Bacik Signed-off-by: Dennis Zhou Cc: Jiufei Xue Cc: Joseph Qi Cc: Tejun Heo Cc: Josef Bacik Cc: Jens Axboe Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 53 +++++++++++++++++++++++++++++++++++++++------- include/linux/blk-cgroup.h | 44 ++++++++++++++++++++++++++++++++++++++ mm/backing-dev.c | 5 +++++ 3 files changed, 94 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 2998e4f095d1..c19f9078da1e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1042,21 +1042,59 @@ static struct cftype blkcg_legacy_files[] = { { } /* terminate */ }; +/* + * blkcg destruction is a three-stage process. + * + * 1. Destruction starts. The blkcg_css_offline() callback is invoked + * which offlines writeback. Here we tie the next stage of blkg destruction + * to the completion of writeback associated with the blkcg. This lets us + * avoid punting potentially large amounts of outstanding writeback to root + * while maintaining any ongoing policies. The next stage is triggered when + * the nr_cgwbs count goes to zero. + * + * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called + * and handles the destruction of blkgs. Here the css reference held by + * the blkg is put back eventually allowing blkcg_css_free() to be called. + * This work may occur in cgwb_release_workfn() on the cgwb_release + * workqueue. Any submitted ios that fail to get the blkg ref will be + * punted to the root_blkg. + * + * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called. + * This finally frees the blkcg. + */ + /** * blkcg_css_offline - cgroup css_offline callback * @css: css of interest * - * This function is called when @css is about to go away and responsible - * for shooting down all blkgs associated with @css. blkgs should be - * removed while holding both q and blkcg locks. As blkcg lock is nested - * inside q lock, this function performs reverse double lock dancing. - * - * This is the blkcg counterpart of ioc_release_fn(). + * This function is called when @css is about to go away. Here the cgwbs are + * offlined first and only once writeback associated with the blkcg has + * finished do we start step 2 (see above). */ static void blkcg_css_offline(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); + /* this prevents anyone from attaching or migrating to this blkcg */ + wb_blkcg_offline(blkcg); + + /* put the base cgwb reference allowing step 2 to be triggered */ + blkcg_cgwb_put(blkcg); +} + +/** + * blkcg_destroy_blkgs - responsible for shooting down blkgs + * @blkcg: blkcg of interest + * + * blkgs should be removed while holding both q and blkcg locks. As blkcg lock + * is nested inside q lock, this function performs reverse double lock dancing. + * Destroying the blkgs releases the reference held on the blkcg's css allowing + * blkcg_css_free to eventually be called. + * + * This is the blkcg counterpart of ioc_release_fn(). + */ +void blkcg_destroy_blkgs(struct blkcg *blkcg) +{ spin_lock_irq(&blkcg->lock); while (!hlist_empty(&blkcg->blkg_list)) { @@ -1075,8 +1113,6 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css) } spin_unlock_irq(&blkcg->lock); - - wb_blkcg_offline(blkcg); } static void blkcg_css_free(struct cgroup_subsys_state *css) @@ -1146,6 +1182,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) INIT_HLIST_HEAD(&blkcg->blkg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&blkcg->cgwb_list); + refcount_set(&blkcg->cgwb_refcnt, 1); #endif list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 1615cdd4c797..6d766a19f2bb 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -56,6 +56,7 @@ struct blkcg { struct list_head all_blkcgs_node; #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; + refcount_t cgwb_refcnt; #endif }; @@ -386,6 +387,49 @@ static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) return cpd ? cpd->blkcg : NULL; } +extern void blkcg_destroy_blkgs(struct blkcg *blkcg); + +#ifdef CONFIG_CGROUP_WRITEBACK + +/** + * blkcg_cgwb_get - get a reference for blkcg->cgwb_list + * @blkcg: blkcg of interest + * + * This is used to track the number of active wb's related to a blkcg. + */ +static inline void blkcg_cgwb_get(struct blkcg *blkcg) +{ + refcount_inc(&blkcg->cgwb_refcnt); +} + +/** + * blkcg_cgwb_put - put a reference for @blkcg->cgwb_list + * @blkcg: blkcg of interest + * + * This is used to track the number of active wb's related to a blkcg. + * When this count goes to zero, all active wb has finished so the + * blkcg can continue destruction by calling blkcg_destroy_blkgs(). + * This work may occur in cgwb_release_workfn() on the cgwb_release + * workqueue. + */ +static inline void blkcg_cgwb_put(struct blkcg *blkcg) +{ + if (refcount_dec_and_test(&blkcg->cgwb_refcnt)) + blkcg_destroy_blkgs(blkcg); +} + +#else + +static inline void blkcg_cgwb_get(struct blkcg *blkcg) { } + +static inline void blkcg_cgwb_put(struct blkcg *blkcg) +{ + /* wb isn't being accounted, so trigger destruction right away */ + blkcg_destroy_blkgs(blkcg); +} + +#endif + /** * blkg_path - format cgroup path of blkg * @blkg: blkg of interest diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f5981e9d6ae2..8a8bb8796c6c 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -491,6 +491,7 @@ static void cgwb_release_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(work, struct bdi_writeback, release_work); + struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css); mutex_lock(&wb->bdi->cgwb_release_mutex); wb_shutdown(wb); @@ -499,6 +500,9 @@ static void cgwb_release_workfn(struct work_struct *work) css_put(wb->blkcg_css); mutex_unlock(&wb->bdi->cgwb_release_mutex); + /* triggers blkg destruction if cgwb_refcnt becomes zero */ + blkcg_cgwb_put(blkcg); + fprop_local_destroy_percpu(&wb->memcg_completions); percpu_ref_exit(&wb->refcnt); wb_exit(wb); @@ -597,6 +601,7 @@ static int cgwb_create(struct backing_dev_info *bdi, list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); list_add(&wb->memcg_node, memcg_cgwb_list); list_add(&wb->blkcg_node, blkcg_cgwb_list); + blkcg_cgwb_get(blkcg); css_get(memcg_css); css_get(blkcg_css); } -- cgit v1.3-6-gb490 From 59a03fea131d671a57b8ed3dc446264c61d4b75f Mon Sep 17 00:00:00 2001 From: Vinson Lee Date: Sat, 1 Sep 2018 21:20:27 +0000 Subject: uapi: Fix linux/rds.h userspace compilation errors. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include linux/in6.h for struct in6_addr. /usr/include/linux/rds.h:156:18: error: field ‘laddr’ has incomplete type struct in6_addr laddr; ^~~~~ /usr/include/linux/rds.h:157:18: error: field ‘faddr’ has incomplete type struct in6_addr faddr; ^~~~~ /usr/include/linux/rds.h:178:18: error: field ‘laddr’ has incomplete type struct in6_addr laddr; ^~~~~ /usr/include/linux/rds.h:179:18: error: field ‘faddr’ has incomplete type struct in6_addr faddr; ^~~~~ /usr/include/linux/rds.h:198:18: error: field ‘bound_addr’ has incomplete type struct in6_addr bound_addr; ^~~~~~~~~~ /usr/include/linux/rds.h:199:18: error: field ‘connected_addr’ has incomplete type struct in6_addr connected_addr; ^~~~~~~~~~~~~~ /usr/include/linux/rds.h:219:18: error: field ‘local_addr’ has incomplete type struct in6_addr local_addr; ^~~~~~~~~~ /usr/include/linux/rds.h:221:18: error: field ‘peer_addr’ has incomplete type struct in6_addr peer_addr; ^~~~~~~~~ /usr/include/linux/rds.h:245:18: error: field ‘src_addr’ has incomplete type struct in6_addr src_addr; ^~~~~~~~ /usr/include/linux/rds.h:246:18: error: field ‘dst_addr’ has incomplete type struct in6_addr dst_addr; ^~~~~~~~ Fixes: b7ff8b1036f0 ("rds: Extend RDS API for IPv6 support") Signed-off-by: Vinson Lee Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- include/uapi/linux/rds.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h index dc520e1a4123..8b73cb603c5f 100644 --- a/include/uapi/linux/rds.h +++ b/include/uapi/linux/rds.h @@ -37,6 +37,7 @@ #include #include /* For __kernel_sockaddr_storage. */ +#include /* For struct in6_addr. */ #define RDS_IB_ABI_VERSION 0x301 -- cgit v1.3-6-gb490 From c43c5e9f524ec914e7e202f9c5ab91779629ccc6 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 3 Sep 2018 10:15:33 +0200 Subject: timekeeping: Fix declaration of read_persistent_wall_and_boot_offset() It is read_persistent_wall_and_boot_offset() and not read_persistent_clock_and_boot_offset() Fixes: 3eca993740b8eb40f51 ("timekeeping: Replace read_boot_clock64() with read_persistent_wall_and_boot_offset()") Signed-off-by: Christian Borntraeger Signed-off-by: Thomas Gleixner Cc: Pavel Tatashin Link: https://lkml.kernel.org/r/20180903081533.34366-1-borntraeger@de.ibm.com --- include/linux/timekeeping.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 5d738804e3d6..a5a3cfc3c2fa 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -258,8 +258,8 @@ extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); extern int persistent_clock_is_local; extern void read_persistent_clock64(struct timespec64 *ts); -void read_persistent_clock_and_boot_offset(struct timespec64 *wall_clock, - struct timespec64 *boot_offset); +void read_persistent_wall_and_boot_offset(struct timespec64 *wall_clock, + struct timespec64 *boot_offset); extern int update_persistent_clock64(struct timespec64 now); /* -- cgit v1.3-6-gb490 From 9fd0e09a4e86499639653243edfcb417a05c5c46 Mon Sep 17 00:00:00 2001 From: Anthony Wong Date: Fri, 31 Aug 2018 20:06:42 +0800 Subject: r8169: add support for NCube 8168 network card This card identifies itself as: Ethernet controller [0200]: NCube Device [10ff:8168] (rev 06) Subsystem: TP-LINK Technologies Co., Ltd. Device [7470:3468] Adding a new entry to rtl8169_pci_tbl makes the card work. Link: http://launchpad.net/bugs/1788730 Signed-off-by: Anthony Wong Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169.c | 1 + include/linux/pci_ids.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index ac306797590e..b08d51bf7a20 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -218,6 +218,7 @@ static const struct pci_device_id rtl8169_pci_tbl[] = { { PCI_DEVICE(PCI_VENDOR_ID_REALTEK, 0x8161), 0, 0, RTL_CFG_1 }, { PCI_DEVICE(PCI_VENDOR_ID_REALTEK, 0x8167), 0, 0, RTL_CFG_0 }, { PCI_DEVICE(PCI_VENDOR_ID_REALTEK, 0x8168), 0, 0, RTL_CFG_1 }, + { PCI_DEVICE(PCI_VENDOR_ID_NCUBE, 0x8168), 0, 0, RTL_CFG_1 }, { PCI_DEVICE(PCI_VENDOR_ID_REALTEK, 0x8169), 0, 0, RTL_CFG_0 }, { PCI_VENDOR_ID_DLINK, 0x4300, PCI_VENDOR_ID_DLINK, 0x4b10, 0, 0, RTL_CFG_1 }, diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 99d366cb0e9f..d157983b84cf 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -3084,4 +3084,6 @@ #define PCI_VENDOR_ID_OCZ 0x1b85 +#define PCI_VENDOR_ID_NCUBE 0x10ff + #endif /* _LINUX_PCI_IDS_H */ -- cgit v1.3-6-gb490 From c48300c92ad9f029f4dcbcf5d71ad880e3acf2fa Mon Sep 17 00:00:00 2001 From: Gleb Fotengauer-Malinovskiy Date: Mon, 3 Sep 2018 20:59:13 +0300 Subject: vhost: fix VHOST_GET_BACKEND_FEATURES ioctl request definition The _IOC_READ flag fits this ioctl request more because this request actually only writes to, but doesn't read from userspace. See NOTEs in include/uapi/asm-generic/ioctl.h for more information. Fixes: 429711aec282 ("vhost: switch to use new message format") Signed-off-by: Gleb Fotengauer-Malinovskiy Acked-by: Jason Wang Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- include/uapi/linux/vhost.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index b1e22c40c4b6..84c3de89696a 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -176,7 +176,7 @@ struct vhost_memory { #define VHOST_BACKEND_F_IOTLB_MSG_V2 0x1 #define VHOST_SET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x25, __u64) -#define VHOST_GET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x26, __u64) +#define VHOST_GET_BACKEND_FEATURES _IOR(VHOST_VIRTIO, 0x26, __u64) /* VHOST_NET specific defines */ -- cgit v1.3-6-gb490 From 0d6c3011409135ea84e2a231b013a22017ff999a Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 4 Sep 2018 15:31:14 +0200 Subject: HID: core: fix grouping by application commit f07b3c1da92d ("HID: generic: create one input report per application type") was effectively the same as MULTI_INPUT: hidinput->report was never set, so hidinput_match_application() always returned null. Fix that by testing against the real application. Note that this breaks some old eGalax touchscreens that expect MULTI_INPUT instead of HID_QUIRK_INPUT_PER_APP. Enable this quirk for backward compatibility on all non-Win8 touchscreens. link: https://bugzilla.kernel.org/show_bug.cgi?id=200847 link: https://bugzilla.kernel.org/show_bug.cgi?id=200849 link: https://bugs.archlinux.org/task/59699 link: https://github.com/NixOS/nixpkgs/issues/45165 Cc: stable@vger.kernel.org # v4.18+ Signed-off-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/hid-input.c | 4 ++-- drivers/hid/hid-multitouch.c | 3 +++ include/linux/hid.h | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index ac201817a2dd..a481eaf39e88 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -1582,6 +1582,7 @@ static struct hid_input *hidinput_allocate(struct hid_device *hid, input_dev->dev.parent = &hid->dev; hidinput->input = input_dev; + hidinput->application = application; list_add_tail(&hidinput->list, &hid->inputs); INIT_LIST_HEAD(&hidinput->reports); @@ -1677,8 +1678,7 @@ static struct hid_input *hidinput_match_application(struct hid_report *report) struct hid_input *hidinput; list_for_each_entry(hidinput, &hid->inputs, list) { - if (hidinput->report && - hidinput->report->application == report->application) + if (hidinput->application == report->application) return hidinput; } diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index 88da991ef256..da954f3f4da7 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -1697,6 +1697,9 @@ static int mt_probe(struct hid_device *hdev, const struct hid_device_id *id) */ hdev->quirks |= HID_QUIRK_INPUT_PER_APP; + if (id->group != HID_GROUP_MULTITOUCH_WIN_8) + hdev->quirks |= HID_QUIRK_MULTI_INPUT; + timer_setup(&td->release_timer, mt_expired_timeout, 0); ret = hid_parse(hdev); diff --git a/include/linux/hid.h b/include/linux/hid.h index 834e6461a690..d44a78362942 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -526,6 +526,7 @@ struct hid_input { const char *name; bool registered; struct list_head reports; /* the list of reports */ + unsigned int application; /* application usage for this input */ }; enum hid_type { -- cgit v1.3-6-gb490 From 8a2336e549d385bb0b46880435b411df8d8200e8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 4 Sep 2018 15:46:13 -0700 Subject: uapi/linux/keyctl.h: don't use C++ reserved keyword as a struct member name Since this header is in "include/uapi/linux/", apparently people want to use it in userspace programs -- even in C++ ones. However, the header uses a C++ reserved keyword ("private"), so change that to "dh_private" instead to allow the header file to be used in C++ userspace. Fixes https://bugzilla.kernel.org/show_bug.cgi?id=191051 Link: http://lkml.kernel.org/r/0db6c314-1ef4-9bfa-1baa-7214dd2ee061@infradead.org Fixes: ddbb41148724 ("KEYS: Add KEYCTL_DH_COMPUTE command") Signed-off-by: Randy Dunlap Reviewed-by: Andrew Morton Cc: David Howells Cc: James Morris Cc: "Serge E. Hallyn" Cc: Mat Martineau Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/keyctl.h | 2 +- security/keys/dh.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h index 7b8c9e19bad1..910cc4334b21 100644 --- a/include/uapi/linux/keyctl.h +++ b/include/uapi/linux/keyctl.h @@ -65,7 +65,7 @@ /* keyctl structures */ struct keyctl_dh_params { - __s32 private; + __s32 dh_private; __s32 prime; __s32 base; }; diff --git a/security/keys/dh.c b/security/keys/dh.c index 711e89d8c415..3b602a1e27fa 100644 --- a/security/keys/dh.c +++ b/security/keys/dh.c @@ -300,7 +300,7 @@ long __keyctl_dh_compute(struct keyctl_dh_params __user *params, } dh_inputs.g_size = dlen; - dlen = dh_data_from_key(pcopy.private, &dh_inputs.key); + dlen = dh_data_from_key(pcopy.dh_private, &dh_inputs.key); if (dlen < 0) { ret = dlen; goto out2; -- cgit v1.3-6-gb490 From d23df2dc56325c72b51670b1fb400ddd23dc17cd Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 3 Sep 2018 12:51:59 -0700 Subject: linux/mod_devicetable.h: fix kernel-doc missing notation for typec_device_id Fix kernel-doc warning for missing struct member description: ../include/linux/mod_devicetable.h:763: warning: Function parameter or member 'driver_data' not described in 'typec_device_id' Fixes: 8a37d87d72f0c ("usb: typec: Bus type for alternate modes") Signed-off-by: Randy Dunlap Cc: Heikki Krogerus Reviewed-by: Heikki Krogerus Signed-off-by: Greg Kroah-Hartman --- include/linux/mod_devicetable.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 1298a7daa57d..01797cb4587e 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -754,6 +754,7 @@ struct tb_service_id { * struct typec_device_id - USB Type-C alternate mode identifiers * @svid: Standard or Vendor ID * @mode: Mode index + * @driver_data: Driver specific data */ struct typec_device_id { __u16 svid; -- cgit v1.3-6-gb490 From 865e63b04e9b2a658d7f26bd13a71dcd964a9118 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 4 Sep 2018 16:26:11 -0400 Subject: tracing: Add back in rcu_irq_enter/exit_irqson() for rcuidle tracepoints Borislav reported the following splat: ============================= WARNING: suspicious RCU usage 4.19.0-rc1+ #1 Not tainted ----------------------------- ./include/linux/rcupdate.h:631 rcu_read_lock() used illegally while idle! other info that might help us debug this: RCU used illegally from idle CPU! rcu_scheduler_active = 2, debug_locks = 1 RCU used illegally from extended quiescent state! 1 lock held by swapper/0/0: #0: 000000004557ee0e (rcu_read_lock){....}, at: perf_event_output_forward+0x0/0x130 stack backtrace: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.19.0-rc1+ #1 Hardware name: LENOVO 2320CTO/2320CTO, BIOS G2ET86WW (2.06 ) 11/13/2012 Call Trace: dump_stack+0x85/0xcb perf_event_output_forward+0xf6/0x130 __perf_event_overflow+0x52/0xe0 perf_swevent_overflow+0x91/0xb0 perf_tp_event+0x11a/0x350 ? find_held_lock+0x2d/0x90 ? __lock_acquire+0x2ce/0x1350 ? __lock_acquire+0x2ce/0x1350 ? retint_kernel+0x2d/0x2d ? find_held_lock+0x2d/0x90 ? tick_nohz_get_sleep_length+0x83/0xb0 ? perf_trace_cpu+0xbb/0xd0 ? perf_trace_buf_alloc+0x5a/0xa0 perf_trace_cpu+0xbb/0xd0 cpuidle_enter_state+0x185/0x340 do_idle+0x1eb/0x260 cpu_startup_entry+0x5f/0x70 start_kernel+0x49b/0x4a6 secondary_startup_64+0xa4/0xb0 This is due to the tracepoints moving to SRCU usage which does not require RCU to be "watching". But perf uses these tracepoints with RCU and expects it to be. Hence, we still need to add in the rcu_irq_enter/exit_irqson() calls for "rcuidle" tracepoints. This is a temporary fix until we have SRCU working in NMI context, and then perf can be converted to use that instead of normal RCU. Link: http://lkml.kernel.org/r/20180904162611.6a120068@gandalf.local.home Cc: x86-ml Cc: Peter Zijlstra Reported-by: Borislav Petkov Tested-by: Borislav Petkov Reviewed-by: "Paul E. McKenney" Fixes: e6753f23d961d ("tracepoint: Make rcuidle tracepoint callers use SRCU") Signed-off-by: Steven Rostedt (VMware) --- include/linux/tracepoint.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 7f2e16e76ac4..041f7e56a289 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -158,8 +158,10 @@ extern void syscall_unregfunc(void); * For rcuidle callers, use srcu since sched-rcu \ * doesn't work from the idle path. \ */ \ - if (rcuidle) \ + if (rcuidle) { \ idx = srcu_read_lock_notrace(&tracepoint_srcu); \ + rcu_irq_enter_irqson(); \ + } \ \ it_func_ptr = rcu_dereference_raw((tp)->funcs); \ \ @@ -171,8 +173,10 @@ extern void syscall_unregfunc(void); } while ((++it_func_ptr)->func); \ } \ \ - if (rcuidle) \ + if (rcuidle) { \ + rcu_irq_exit_irqson(); \ srcu_read_unlock_notrace(&tracepoint_srcu, idx);\ + } \ \ preempt_enable_notrace(); \ } while (0) -- cgit v1.3-6-gb490 From 76d5581c870454be5f1f1a106c57985902e7ea20 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Sun, 5 Aug 2018 09:19:33 +0300 Subject: net/mlx5: Fix use-after-free in self-healing flow When the mlx5 health mechanism detects a problem while the driver is in the middle of init_one or remove_one, the driver needs to prevent the health mechanism from scheduling future work; if future work is scheduled, there is a problem with use-after-free: the system WQ tries to run the work item (which has been freed) at the scheduled future time. Prevent this by disabling work item scheduling in the health mechanism when the driver is in the middle of init_one() or remove_one(). Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters") Signed-off-by: Jack Morgenstein Reviewed-by: Feras Daoud Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/health.c | 10 +++++++++- drivers/net/ethernet/mellanox/mlx5/core/main.c | 6 +++--- include/linux/mlx5/driver.h | 2 +- 3 files changed, 13 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index d39b0b7011b2..9f39aeca863f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -331,9 +331,17 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev) add_timer(&health->timer); } -void mlx5_stop_health_poll(struct mlx5_core_dev *dev) +void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health) { struct mlx5_core_health *health = &dev->priv.health; + unsigned long flags; + + if (disable_health) { + spin_lock_irqsave(&health->wq_lock, flags); + set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); + set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); + spin_unlock_irqrestore(&health->wq_lock, flags); + } del_timer_sync(&health->timer); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index cf3e4a659052..739aad0a0b35 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1286,7 +1286,7 @@ err_cleanup_once: mlx5_cleanup_once(dev); err_stop_poll: - mlx5_stop_health_poll(dev); + mlx5_stop_health_poll(dev, boot); if (mlx5_cmd_teardown_hca(dev)) { dev_err(&dev->pdev->dev, "tear_down_hca failed, skip cleanup\n"); goto out_err; @@ -1346,7 +1346,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, mlx5_free_irq_vectors(dev); if (cleanup) mlx5_cleanup_once(dev); - mlx5_stop_health_poll(dev); + mlx5_stop_health_poll(dev, cleanup); err = mlx5_cmd_teardown_hca(dev); if (err) { dev_err(&dev->pdev->dev, "tear_down_hca failed, skip cleanup\n"); @@ -1608,7 +1608,7 @@ static int mlx5_try_fast_unload(struct mlx5_core_dev *dev) * with the HCA, so the health polll is no longer needed. */ mlx5_drain_health_wq(dev); - mlx5_stop_health_poll(dev); + mlx5_stop_health_poll(dev, false); ret = mlx5_cmd_force_teardown_hca(dev); if (ret) { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 7a452716de4b..aa65f58c6610 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1052,7 +1052,7 @@ int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn); void mlx5_health_cleanup(struct mlx5_core_dev *dev); int mlx5_health_init(struct mlx5_core_dev *dev); void mlx5_start_health_poll(struct mlx5_core_dev *dev); -void mlx5_stop_health_poll(struct mlx5_core_dev *dev); +void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health); void mlx5_drain_health_wq(struct mlx5_core_dev *dev); void mlx5_trigger_health_work(struct mlx5_core_dev *dev); void mlx5_drain_health_recovery(struct mlx5_core_dev *dev); -- cgit v1.3-6-gb490 From 8d71e818506718e8d7032ce824b5c74a17d4f7a5 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 21 Aug 2018 16:04:41 +0300 Subject: net/mlx5: Use u16 for Work Queue buffer fragment size Minimal stride size is 16. Hence, the number of strides in a fragment (of PAGE_SIZE) is <= PAGE_SIZE / 16 <= 4K. u16 is sufficient to represent this. Fixes: 388ca8be0037 ("IB/mlx5: Implement fragmented completion queue (CQ)") Signed-off-by: Tariq Toukan Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/wq.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/wq.h | 2 +- include/linux/mlx5/driver.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.c b/drivers/net/ethernet/mellanox/mlx5/core/wq.c index c8c315eb5128..d838af9539b1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/wq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.c @@ -39,9 +39,9 @@ u32 mlx5_wq_cyc_get_size(struct mlx5_wq_cyc *wq) return (u32)wq->fbc.sz_m1 + 1; } -u32 mlx5_wq_cyc_get_frag_size(struct mlx5_wq_cyc *wq) +u16 mlx5_wq_cyc_get_frag_size(struct mlx5_wq_cyc *wq) { - return (u32)wq->fbc.frag_sz_m1 + 1; + return wq->fbc.frag_sz_m1 + 1; } u32 mlx5_cqwq_get_size(struct mlx5_cqwq *wq) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.h b/drivers/net/ethernet/mellanox/mlx5/core/wq.h index 2bd4c3184eba..3a1a170bb2d7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/wq.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.h @@ -80,7 +80,7 @@ int mlx5_wq_cyc_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, void *wqc, struct mlx5_wq_cyc *wq, struct mlx5_wq_ctrl *wq_ctrl); u32 mlx5_wq_cyc_get_size(struct mlx5_wq_cyc *wq); -u32 mlx5_wq_cyc_get_frag_size(struct mlx5_wq_cyc *wq); +u16 mlx5_wq_cyc_get_frag_size(struct mlx5_wq_cyc *wq); int mlx5_wq_qp_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, void *qpc, struct mlx5_wq_qp *wq, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index aa65f58c6610..3a1258fd8ac3 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -362,7 +362,7 @@ struct mlx5_frag_buf { struct mlx5_frag_buf_ctrl { struct mlx5_frag_buf frag_buf; u32 sz_m1; - u32 frag_sz_m1; + u16 frag_sz_m1; u32 strides_offset; u8 log_sz; u8 log_stride; -- cgit v1.3-6-gb490 From a09036221092989b88c55d24d1f12ceb1d7d361f Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 21 Aug 2018 16:07:58 +0300 Subject: net/mlx5: Use u16 for Work Queue buffer strides offset Minimal stride size is 16. Hence, the number of strides in a fragment (of PAGE_SIZE) is <= PAGE_SIZE / 16 <= 4K. u16 is sufficient to represent this. Fixes: d7037ad73daa ("net/mlx5: Fix QP fragmented buffer allocation") Signed-off-by: Tariq Toukan Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/wq.c | 2 +- include/linux/mlx5/driver.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.c b/drivers/net/ethernet/mellanox/mlx5/core/wq.c index d838af9539b1..68e7f8df2a6d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/wq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.c @@ -138,7 +138,7 @@ int mlx5_wq_qp_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, void *qpc, struct mlx5_wq_qp *wq, struct mlx5_wq_ctrl *wq_ctrl) { - u32 sq_strides_offset; + u16 sq_strides_offset; u32 rq_pg_remainder; int err; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 3a1258fd8ac3..66d94b4557cf 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -363,7 +363,7 @@ struct mlx5_frag_buf_ctrl { struct mlx5_frag_buf frag_buf; u32 sz_m1; u16 frag_sz_m1; - u32 strides_offset; + u16 strides_offset; u8 log_sz; u8 log_stride; u8 log_frag_strides; @@ -995,7 +995,7 @@ static inline u32 mlx5_base_mkey(const u32 key) } static inline void mlx5_fill_fbc_offset(u8 log_stride, u8 log_sz, - u32 strides_offset, + u16 strides_offset, struct mlx5_frag_buf_ctrl *fbc) { fbc->log_stride = log_stride; -- cgit v1.3-6-gb490 From 09121255c784fd36ad6237a4e239c634b0209de0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Aug 2018 14:13:13 +0200 Subject: perf/UAPI: Clearly mark __PERF_SAMPLE_CALLCHAIN_EARLY as internal use Vince noted that commit: 6cbc304f2f36 ("perf/x86/intel: Fix unwind errors from PEBS entries (mk-II)") 'leaked' __PERF_SAMPLE_CALLCHAIN_EARLY into the UAPI namespace. And while sys_perf_event_open() will error out if you try to use it, it is exposed. Clearly mark it for internal use only to avoid any confusion. Requested-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index eeb787b1c53c..f35eb72739c0 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -144,7 +144,7 @@ enum perf_event_sample_format { PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */ - __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, + __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; /* -- cgit v1.3-6-gb490 From 01c5f85aebaaddfd7e6051fb2ec80c1d4b463554 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Sep 2018 10:59:53 -0600 Subject: blk-cgroup: increase number of supported policies After merging the iolatency policy, we potentially now have 4 policies being registered, but only support 3. This causes one of them to fail loading. Takashi reports that BFQ no longer works for him, because it fails to load due to policy registration failure. Bump to 5 policies, and also add a warning for when we have exceeded the global amount. If we have to touch this again, we should switch to a dynamic scheme instead. Reported-by: Takashi Iwai Reviewed-by: Jeff Moyer Tested-by: Takashi Iwai Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 4 +++- include/linux/blkdev.h | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c19f9078da1e..c630e02836a8 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1510,8 +1510,10 @@ int blkcg_policy_register(struct blkcg_policy *pol) for (i = 0; i < BLKCG_MAX_POLS; i++) if (!blkcg_policy[i]) break; - if (i >= BLKCG_MAX_POLS) + if (i >= BLKCG_MAX_POLS) { + pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n"); goto err_unlock; + } /* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */ if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d6869e0e2b64..6980014357d4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -54,7 +54,7 @@ struct blk_stat_callback; * Maximum number of blkcg policies allowed to be registered concurrently. * Defined here to simplify include dependency. */ -#define BLKCG_MAX_POLS 3 +#define BLKCG_MAX_POLS 5 typedef void (rq_end_io_fn)(struct request *, blk_status_t); -- cgit v1.3-6-gb490 From bfc456060d0cbcf6902a436d358b60cb1534668c Mon Sep 17 00:00:00 2001 From: Dennis Dalessandro Date: Fri, 31 Aug 2018 10:34:14 -0700 Subject: IB/hfi1,PCI: Allow bus reset while probing Calling into the new API to reset the secondary bus results in a deadlock. This occurs because the device/bus is already locked at probe time. Reverting back to the old behavior while the API is improved. Link: https://bugzilla.kernel.org/show_bug.cgi?id=200985 Fixes: c6a44ba950d1 ("PCI: Rename pci_try_reset_bus() to pci_reset_bus()") Fixes: 409888e0966e ("IB/hfi1: Use pci_try_reset_bus() for initiating PCI Secondary Bus Reset") Signed-off-by: Dennis Dalessandro Signed-off-by: Bjorn Helgaas Reviewed-by: Michael J. Ruhl Cc: Sinan Kaya --- drivers/infiniband/hw/hfi1/pcie.c | 11 ++++------- drivers/pci/pci.c | 1 + include/linux/pci.h | 3 +++ 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c index eec83757d55f..6c967dde58e7 100644 --- a/drivers/infiniband/hw/hfi1/pcie.c +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -893,14 +893,11 @@ static int trigger_sbr(struct hfi1_devdata *dd) } /* - * A secondary bus reset (SBR) issues a hot reset to our device. - * The following routine does a 1s wait after the reset is dropped - * per PCI Trhfa (recovery time). PCIe 3.0 section 6.6.1 - - * Conventional Reset, paragraph 3, line 35 also says that a 1s - * delay after a reset is required. Per spec requirements, - * the link is either working or not after that point. + * This is an end around to do an SBR during probe time. A new API needs + * to be implemented to have cleaner interface but this fixes the + * current brokenness */ - return pci_reset_bus(dev); + return pci_bridge_secondary_bus_reset(dev->bus->self); } /* diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 30b260332a10..1835f3a7aa8d 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -4547,6 +4547,7 @@ int pci_bridge_secondary_bus_reset(struct pci_dev *dev) return pci_dev_wait(dev, "bus reset", PCIE_RESET_READY_POLL_MS); } +EXPORT_SYMBOL_GPL(pci_bridge_secondary_bus_reset); static int pci_parent_bus_reset(struct pci_dev *dev, int probe) { diff --git a/include/linux/pci.h b/include/linux/pci.h index e72ca8dd6241..6925828f9f25 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1235,6 +1235,9 @@ void pci_bus_remove_resources(struct pci_bus *bus); int devm_request_pci_bus_resources(struct device *dev, struct list_head *resources); +/* Temporary until new and working PCI SBR API in place */ +int pci_bridge_secondary_bus_reset(struct pci_dev *dev); + #define pci_bus_for_each_resource(bus, res, i) \ for (i = 0; \ (res = pci_bus_resource_n(bus, i)) || i < PCI_BRIDGE_RESOURCE_NUM; \ -- cgit v1.3-6-gb490 From 7a9cdebdcc17e426fb5287e4a82db1dfe86339b2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 12 Sep 2018 23:57:48 -1000 Subject: mm: get rid of vmacache_flush_all() entirely Jann Horn points out that the vmacache_flush_all() function is not only potentially expensive, it's buggy too. It also happens to be entirely unnecessary, because the sequence number overflow case can be avoided by simply making the sequence number be 64-bit. That doesn't even grow the data structures in question, because the other adjacent fields are already 64-bit. So simplify the whole thing by just making the sequence number overflow case go away entirely, which gets rid of all the complications and makes the code faster too. Win-win. [ Oleg Nesterov points out that the VMACACHE_FULL_FLUSHES statistics also just goes away entirely with this ] Reported-by: Jann Horn Suggested-by: Will Deacon Acked-by: Davidlohr Bueso Cc: Oleg Nesterov Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 +- include/linux/mm_types_task.h | 2 +- include/linux/vm_event_item.h | 1 - include/linux/vmacache.h | 5 ----- mm/debug.c | 4 ++-- mm/vmacache.c | 38 -------------------------------------- 6 files changed, 4 insertions(+), 48 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index cd2bc939efd0..5ed8f6292a53 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -341,7 +341,7 @@ struct mm_struct { struct { struct vm_area_struct *mmap; /* list of VMAs */ struct rb_root mm_rb; - u32 vmacache_seqnum; /* per-thread vmacache */ + u64 vmacache_seqnum; /* per-thread vmacache */ #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, unsigned long addr, unsigned long len, diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index 5fe87687664c..d7016dcb245e 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -32,7 +32,7 @@ #define VMACACHE_MASK (VMACACHE_SIZE - 1) struct vmacache { - u32 seqnum; + u64 seqnum; struct vm_area_struct *vmas[VMACACHE_SIZE]; }; diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 5c7f010676a7..47a3441cf4c4 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -105,7 +105,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_DEBUG_VM_VMACACHE VMACACHE_FIND_CALLS, VMACACHE_FIND_HITS, - VMACACHE_FULL_FLUSHES, #endif #ifdef CONFIG_SWAP SWAP_RA, diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h index 3e9a963edd6a..6fce268a4588 100644 --- a/include/linux/vmacache.h +++ b/include/linux/vmacache.h @@ -10,7 +10,6 @@ static inline void vmacache_flush(struct task_struct *tsk) memset(tsk->vmacache.vmas, 0, sizeof(tsk->vmacache.vmas)); } -extern void vmacache_flush_all(struct mm_struct *mm); extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma); extern struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr); @@ -24,10 +23,6 @@ extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, static inline void vmacache_invalidate(struct mm_struct *mm) { mm->vmacache_seqnum++; - - /* deal with overflows */ - if (unlikely(mm->vmacache_seqnum == 0)) - vmacache_flush_all(mm); } #endif /* __LINUX_VMACACHE_H */ diff --git a/mm/debug.c b/mm/debug.c index 38c926520c97..bd10aad8539a 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -114,7 +114,7 @@ EXPORT_SYMBOL(dump_vma); void dump_mm(const struct mm_struct *mm) { - pr_emerg("mm %px mmap %px seqnum %d task_size %lu\n" + pr_emerg("mm %px mmap %px seqnum %llu task_size %lu\n" #ifdef CONFIG_MMU "get_unmapped_area %px\n" #endif @@ -142,7 +142,7 @@ void dump_mm(const struct mm_struct *mm) "tlb_flush_pending %d\n" "def_flags: %#lx(%pGv)\n", - mm, mm->mmap, mm->vmacache_seqnum, mm->task_size, + mm, mm->mmap, (long long) mm->vmacache_seqnum, mm->task_size, #ifdef CONFIG_MMU mm->get_unmapped_area, #endif diff --git a/mm/vmacache.c b/mm/vmacache.c index ea517bef7dc5..cdc32a3b02fa 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -19,44 +19,6 @@ #endif #define VMACACHE_HASH(addr) ((addr >> VMACACHE_SHIFT) & VMACACHE_MASK) -/* - * Flush vma caches for threads that share a given mm. - * - * The operation is safe because the caller holds the mmap_sem - * exclusively and other threads accessing the vma cache will - * have mmap_sem held at least for read, so no extra locking - * is required to maintain the vma cache. - */ -void vmacache_flush_all(struct mm_struct *mm) -{ - struct task_struct *g, *p; - - count_vm_vmacache_event(VMACACHE_FULL_FLUSHES); - - /* - * Single threaded tasks need not iterate the entire - * list of process. We can avoid the flushing as well - * since the mm's seqnum was increased and don't have - * to worry about other threads' seqnum. Current's - * flush will occur upon the next lookup. - */ - if (atomic_read(&mm->mm_users) == 1) - return; - - rcu_read_lock(); - for_each_process_thread(g, p) { - /* - * Only flush the vmacache pointers as the - * mm seqnum is already set and curr's will - * be set upon invalidation when the next - * lookup is done. - */ - if (mm == p->mm) - vmacache_flush(p); - } - rcu_read_unlock(); -} - /* * This task may be accessing a foreign mm via (for example) * get_user_pages()->find_vma(). The vmacache is task-local and this -- cgit v1.3-6-gb490 From 500dd232449e7c07500e713dc6970aa713f8e4f1 Mon Sep 17 00:00:00 2001 From: Andrew Murray Date: Thu, 13 Sep 2018 13:48:27 +0100 Subject: asm-generic: io: Fix ioport_map() for !CONFIG_GENERIC_IOMAP && CONFIG_INDIRECT_PIO The !CONFIG_GENERIC_IOMAP version of ioport_map uses MMIO_UPPER_LIMIT to prevent users from making I/O accesses outside the expected I/O range - however it erroneously treats MMIO_UPPER_LIMIT as a mask which is contradictory to its other users. The introduction of CONFIG_INDIRECT_PIO, which subtracts an arbitrary amount from IO_SPACE_LIMIT to form MMIO_UPPER_LIMIT, results in ioport_map mangling the given port rather than capping it. We address this by aligning more closely with the CONFIG_GENERIC_IOMAP implementation of ioport_map by using the comparison operator and returning NULL where the port exceeds MMIO_UPPER_LIMIT. Though note that we preserve the existing behavior of masking with IO_SPACE_LIMIT such that we don't break existing buggy drivers that somehow rely on this masking. Fixes: 5745392e0c2b ("PCI: Apply the new generic I/O management on PCI IO hosts") Reported-by: Will Deacon Reviewed-by: Arnd Bergmann Signed-off-by: Andrew Murray Signed-off-by: Will Deacon --- include/asm-generic/io.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index 66d1d45fa2e1..d356f802945a 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -1026,7 +1026,8 @@ static inline void __iomem *ioremap_wt(phys_addr_t offset, size_t size) #define ioport_map ioport_map static inline void __iomem *ioport_map(unsigned long port, unsigned int nr) { - return PCI_IOBASE + (port & MMIO_UPPER_LIMIT); + port &= IO_SPACE_LIMIT; + return (port > MMIO_UPPER_LIMIT) ? NULL : PCI_IOBASE + port; } #endif -- cgit v1.3-6-gb490 From 197ecb3802c04499d8ff4f8cb28f6efa008067db Mon Sep 17 00:00:00 2001 From: Marek Marczykowski-Górecki Date: Fri, 7 Sep 2018 18:49:08 +0200 Subject: xen/balloon: add runtime control for scrubbing ballooned out pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Scrubbing pages on initial balloon down can take some time, especially in nested virtualization case (nested EPT is slow). When HVM/PVH guest is started with memory= significantly lower than maxmem=, all the extra pages will be scrubbed before returning to Xen. But since most of them weren't used at all at that point, Xen needs to populate them first (from populate-on-demand pool). In nested virt case (Xen inside KVM) this slows down the guest boot by 15-30s with just 1.5GB needed to be returned to Xen. Add runtime parameter to enable/disable it, to allow initially disabling scrubbing, then enable it back during boot (for example in initramfs). Such usage relies on assumption that a) most pages ballooned out during initial boot weren't used at all, and b) even if they were, very few secrets are in the guest at that time (before any serious userspace kicks in). Convert CONFIG_XEN_SCRUB_PAGES to CONFIG_XEN_SCRUB_PAGES_DEFAULT (also enabled by default), controlling default value for the new runtime switch. Signed-off-by: Marek Marczykowski-Górecki Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- Documentation/ABI/stable/sysfs-devices-system-xen_memory | 9 +++++++++ Documentation/admin-guide/kernel-parameters.txt | 6 ++++++ drivers/xen/Kconfig | 10 +++++++--- drivers/xen/mem-reservation.c | 4 ++++ drivers/xen/xen-balloon.c | 3 +++ include/xen/mem-reservation.h | 7 ++++--- 6 files changed, 33 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/Documentation/ABI/stable/sysfs-devices-system-xen_memory b/Documentation/ABI/stable/sysfs-devices-system-xen_memory index caa311d59ac1..6d83f95a8a8e 100644 --- a/Documentation/ABI/stable/sysfs-devices-system-xen_memory +++ b/Documentation/ABI/stable/sysfs-devices-system-xen_memory @@ -75,3 +75,12 @@ Contact: Konrad Rzeszutek Wilk Description: Amount (in KiB) of low (or normal) memory in the balloon. + +What: /sys/devices/system/xen_memory/xen_memory0/scrub_pages +Date: September 2018 +KernelVersion: 4.20 +Contact: xen-devel@lists.xenproject.org +Description: + Control scrubbing pages before returning them to Xen for others domains + use. Can be set with xen_scrub_pages cmdline + parameter. Default value controlled with CONFIG_XEN_SCRUB_PAGES_DEFAULT. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 64a3bf54b974..92eb1f42240d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5000,6 +5000,12 @@ Disables the PV optimizations forcing the HVM guest to run as generic HVM guest with no PV drivers. + xen_scrub_pages= [XEN] + Boolean option to control scrubbing pages before giving them back + to Xen, for use by other domains. Can be also changed at runtime + with /sys/devices/system/xen_memory/xen_memory0/scrub_pages. + Default value controlled with CONFIG_XEN_SCRUB_PAGES_DEFAULT. + xirc2ps_cs= [NET,PCMCIA] Format: ,,,,,[,[,[,]]] diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index b459edfacff3..90d387b50ab7 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -79,15 +79,19 @@ config XEN_BALLOON_MEMORY_HOTPLUG_LIMIT This value is used to allocate enough space in internal tables needed for physical memory administration. -config XEN_SCRUB_PAGES - bool "Scrub pages before returning them to system" +config XEN_SCRUB_PAGES_DEFAULT + bool "Scrub pages before returning them to system by default" depends on XEN_BALLOON default y help Scrub pages before returning them to the system for reuse by other domains. This makes sure that any confidential data is not accidentally visible to other domains. Is it more - secure, but slightly less efficient. + secure, but slightly less efficient. This can be controlled with + xen_scrub_pages=0 parameter and + /sys/devices/system/xen_memory/xen_memory0/scrub_pages. + This option only sets the default value. + If in doubt, say yes. config XEN_DEV_EVTCHN diff --git a/drivers/xen/mem-reservation.c b/drivers/xen/mem-reservation.c index 084799c6180e..3782cf070338 100644 --- a/drivers/xen/mem-reservation.c +++ b/drivers/xen/mem-reservation.c @@ -14,6 +14,10 @@ #include #include +#include + +bool __read_mostly xen_scrub_pages = IS_ENABLED(CONFIG_XEN_SCRUB_PAGES_DEFAULT); +core_param(xen_scrub_pages, xen_scrub_pages, bool, 0); /* * Use one extent per PAGE_SIZE to avoid to break down the page into diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c index 294f35ce9e46..63c1494a8d73 100644 --- a/drivers/xen/xen-balloon.c +++ b/drivers/xen/xen-balloon.c @@ -44,6 +44,7 @@ #include #include #include +#include #define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) @@ -137,6 +138,7 @@ static DEVICE_ULONG_ATTR(schedule_delay, 0444, balloon_stats.schedule_delay); static DEVICE_ULONG_ATTR(max_schedule_delay, 0644, balloon_stats.max_schedule_delay); static DEVICE_ULONG_ATTR(retry_count, 0444, balloon_stats.retry_count); static DEVICE_ULONG_ATTR(max_retry_count, 0644, balloon_stats.max_retry_count); +static DEVICE_BOOL_ATTR(scrub_pages, 0644, xen_scrub_pages); static ssize_t show_target_kb(struct device *dev, struct device_attribute *attr, char *buf) @@ -203,6 +205,7 @@ static struct attribute *balloon_attrs[] = { &dev_attr_max_schedule_delay.attr.attr, &dev_attr_retry_count.attr.attr, &dev_attr_max_retry_count.attr.attr, + &dev_attr_scrub_pages.attr.attr, NULL }; diff --git a/include/xen/mem-reservation.h b/include/xen/mem-reservation.h index 80b52b4945e9..a2ab516fcd2c 100644 --- a/include/xen/mem-reservation.h +++ b/include/xen/mem-reservation.h @@ -17,11 +17,12 @@ #include +extern bool xen_scrub_pages; + static inline void xenmem_reservation_scrub_page(struct page *page) { -#ifdef CONFIG_XEN_SCRUB_PAGES - clear_highpage(page); -#endif + if (xen_scrub_pages) + clear_highpage(page); } #ifdef CONFIG_XEN_HAVE_PVMMU -- cgit v1.3-6-gb490