aboutsummaryrefslogtreecommitdiffstats
path: root/ipc
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2019-05-14 20:08:51 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2019-05-14 20:08:51 -0700
commit: 1064d857738187c764c0bd76040f424397f857c7 (patch)
tree: 13d16c0aed50b64c20b8fe235b15172f3c997f15 /ipc
parent: Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost (diff)
parent: mm: memcontrol: fix NUMA round-robin reclaim at intermediate level (diff)
download: linux-dev-1064d857738187c764c0bd76040f424397f857c7.tar.xz
download: linux-dev-1064d857738187c764c0bd76040f424397f857c7.zip
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:

 - a couple of hotfixes
 - almost all of the rest of MM
 - lib/ updates
 - binfmt_elf updates
 - autofs updates
 - quite a lot of misc fixes and updates
 - reiserfs, fatfs
 - signals
 - exec
 - cpumask
 - rapidio
 - sysctl
 - pids
 - eventfd
 - gcov
 - panic
 - pps
 - gdb script updates
 - ipc updates

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (126 commits)
  mm: memcontrol: fix NUMA round-robin reclaim at intermediate level
  mm: memcontrol: fix recursive statistics correctness & scalabilty
  mm: memcontrol: move stat/event counting functions out-of-line
  mm: memcontrol: make cgroup stats and events query API explicitly local
  drivers/virt/fsl_hypervisor.c: prevent integer overflow in ioctl
  drivers/virt/fsl_hypervisor.c: dereferencing error pointers in ioctl
  mm, memcg: rename ambiguously named memory.stat counters and functions
  arch: remove <asm/sizes.h> and <asm-generic/sizes.h>
  treewide: replace #include <asm/sizes.h> with #include <linux/sizes.h>
  fs/block_dev.c: Remove duplicate header
  fs/cachefiles/namei.c: remove duplicate header
  include/linux/sched/signal.h: replace `tsk' with `task'
  fs/coda/psdev.c: remove duplicate header
  ipc: do cyclic id allocation for the ipc object.
  ipc: conserve sequence numbers in ipcmni_extend mode
  ipc: allow boot time extension of IPCMNI from 32k to 16M
  ipc/mqueue: optimize msg_get()
  ipc/mqueue: remove redundant wq task assignment
  ipc: prevent lockup on alloc_msg and free_msg
  scripts/gdb: print cached rate in lx-clk-summary
  ...
Diffstat (limited to 'ipc')
-rw-r--r-- ipc/ipc_sysctl.c 14
-rw-r--r-- ipc/mqueue.c 72
-rw-r--r-- ipc/msgutil.c 6
-rw-r--r-- ipc/util.c 48
-rw-r--r-- ipc/util.h 47
5 files changed, 139 insertions, 48 deletions
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 49f9bf4ffc7f..bfaae457810c 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -120,7 +120,9 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
static int zero;
static int one = 1;
static int int_max = INT_MAX;
-static int ipc_mni = IPCMNI;
+int ipc_mni = IPCMNI;
+int ipc_mni_shift = IPCMNI_SHIFT;
+int ipc_min_cycle = RADIX_TREE_MAP_SIZE;
static struct ctl_table ipc_kern_table[] = {
{
@@ -246,3 +248,13 @@ static int __init ipc_sysctl_init(void)
}
device_initcall(ipc_sysctl_init);
+
+static int __init ipc_mni_extend(char *str)
+{
+ ipc_mni = IPCMNI_EXTEND;
+ ipc_mni_shift = IPCMNI_EXTEND_SHIFT;
+ ipc_min_cycle = IPCMNI_EXTEND_MIN_CYCLE;
+ pr_info("IPCMNI extended to %d.\n", ipc_mni);
+ return 0;
+}
+early_param("ipcmni_extend", ipc_mni_extend);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index ba44164ea1f9..216cad1ff0d0 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -76,6 +76,7 @@ struct mqueue_inode_info {
wait_queue_head_t wait_q;
struct rb_root msg_tree;
+ struct rb_node *msg_tree_rightmost;
struct posix_msg_tree_node *node_cache;
struct mq_attr attr;
@@ -131,6 +132,7 @@ static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info)
{
struct rb_node **p, *parent = NULL;
struct posix_msg_tree_node *leaf;
+ bool rightmost = true;
p = &info->msg_tree.rb_node;
while (*p) {
@@ -139,9 +141,10 @@ static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info)
if (likely(leaf->priority == msg->m_type))
goto insert_msg;
- else if (msg->m_type < leaf->priority)
+ else if (msg->m_type < leaf->priority) {
p = &(*p)->rb_left;
- else
+ rightmost = false;
+ } else
p = &(*p)->rb_right;
}
if (info->node_cache) {
@@ -154,6 +157,10 @@ static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info)
INIT_LIST_HEAD(&leaf->msg_list);
}
leaf->priority = msg->m_type;
+
+ if (rightmost)
+ info->msg_tree_rightmost = &leaf->rb_node;
+
rb_link_node(&leaf->rb_node, parent, p);
rb_insert_color(&leaf->rb_node, &info->msg_tree);
insert_msg:
@@ -163,23 +170,35 @@ insert_msg:
return 0;
}
+static inline void msg_tree_erase(struct posix_msg_tree_node *leaf,
+ struct mqueue_inode_info *info)
+{
+ struct rb_node *node = &leaf->rb_node;
+
+ if (info->msg_tree_rightmost == node)
+ info->msg_tree_rightmost = rb_prev(node);
+
+ rb_erase(node, &info->msg_tree);
+ if (info->node_cache) {
+ kfree(leaf);
+ } else {
+ info->node_cache = leaf;
+ }
+}
+
static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
{
- struct rb_node **p, *parent = NULL;
+ struct rb_node *parent = NULL;
struct posix_msg_tree_node *leaf;
struct msg_msg *msg;
try_again:
- p = &info->msg_tree.rb_node;
- while (*p) {
- parent = *p;
- /*
- * During insert, low priorities go to the left and high to the
- * right. On receive, we want the highest priorities first, so
- * walk all the way to the right.
- */
- p = &(*p)->rb_right;
- }
+ /*
+ * During insert, low priorities go to the left and high to the
+ * right. On receive, we want the highest priorities first, so
+ * take the cached rightmost node instead of walking the tree.
+ */
+ parent = info->msg_tree_rightmost;
if (!parent) {
if (info->attr.mq_curmsgs) {
pr_warn_once("Inconsistency in POSIX message queue, "
@@ -194,24 +213,14 @@ try_again:
pr_warn_once("Inconsistency in POSIX message queue, "
"empty leaf node but we haven't implemented "
"lazy leaf delete!\n");
- rb_erase(&leaf->rb_node, &info->msg_tree);
- if (info->node_cache) {
- kfree(leaf);
- } else {
- info->node_cache = leaf;
- }
+ msg_tree_erase(leaf, info);
goto try_again;
} else {
msg = list_first_entry(&leaf->msg_list,
struct msg_msg, m_list);
list_del(&msg->m_list);
if (list_empty(&leaf->msg_list)) {
- rb_erase(&leaf->rb_node, &info->msg_tree);
- if (info->node_cache) {
- kfree(leaf);
- } else {
- info->node_cache = leaf;
- }
+ msg_tree_erase(leaf, info);
}
}
info->attr.mq_curmsgs--;
@@ -254,6 +263,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
info->qsize = 0;
info->user = NULL; /* set when all is ok */
info->msg_tree = RB_ROOT;
+ info->msg_tree_rightmost = NULL;
info->node_cache = NULL;
memset(&info->attr, 0, sizeof(info->attr));
info->attr.mq_maxmsg = min(ipc_ns->mq_msg_max,
@@ -430,7 +440,8 @@ static void mqueue_evict_inode(struct inode *inode)
struct user_struct *user;
unsigned long mq_bytes, mq_treesize;
struct ipc_namespace *ipc_ns;
- struct msg_msg *msg;
+ struct msg_msg *msg, *nmsg;
+ LIST_HEAD(tmp_msg);
clear_inode(inode);
@@ -441,10 +452,15 @@ static void mqueue_evict_inode(struct inode *inode)
info = MQUEUE_I(inode);
spin_lock(&info->lock);
while ((msg = msg_get(info)) != NULL)
- free_msg(msg);
+ list_add_tail(&msg->m_list, &tmp_msg);
kfree(info->node_cache);
spin_unlock(&info->lock);
+ list_for_each_entry_safe(msg, nmsg, &tmp_msg, m_list) {
+ list_del(&msg->m_list);
+ free_msg(msg);
+ }
+
/* Total amount of bytes accounted for the mqueue */
mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) +
min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) *
@@ -605,8 +621,6 @@ static void wq_add(struct mqueue_inode_info *info, int sr,
{
struct ext_wait_queue *walk;
- ewp->task = current;
-
list_for_each_entry(walk, &info->e_wait_q[sr].list, list) {
if (walk->task->prio <= current->prio) {
list_add_tail(&ewp->list, &walk->list);
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
index 84598025a6ad..e65593742e2b 100644
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -18,6 +18,7 @@
#include <linux/utsname.h>
#include <linux/proc_ns.h>
#include <linux/uaccess.h>
+#include <linux/sched.h>
#include "util.h"
@@ -64,6 +65,9 @@ static struct msg_msg *alloc_msg(size_t len)
pseg = &msg->next;
while (len > 0) {
struct msg_msgseg *seg;
+
+ cond_resched();
+
alen = min(len, DATALEN_SEG);
seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL_ACCOUNT);
if (seg == NULL)
@@ -176,6 +180,8 @@ void free_msg(struct msg_msg *msg)
kfree(msg);
while (seg != NULL) {
struct msg_msgseg *tmp = seg->next;
+
+ cond_resched();
kfree(seg);
seg = tmp;
}
diff --git a/ipc/util.c b/ipc/util.c
index 095274a871f8..d126d156efc6 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -109,7 +109,7 @@ static const struct rhashtable_params ipc_kht_params = {
* @ids: ipc identifier set
*
* Set up the sequence range to use for the ipc identifier range (limited
- * below IPCMNI) then initialise the keys hashtable and ids idr.
+ * below ipc_mni) then initialise the keys hashtable and ids idr.
*/
void ipc_init_ids(struct ipc_ids *ids)
{
@@ -119,6 +119,7 @@ void ipc_init_ids(struct ipc_ids *ids)
rhashtable_init(&ids->key_ht, &ipc_kht_params);
idr_init(&ids->ipcs_idr);
ids->max_idx = -1;
+ ids->last_idx = -1;
#ifdef CONFIG_CHECKPOINT_RESTORE
ids->next_id = -1;
#endif
@@ -192,6 +193,10 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key)
*
* The caller must own kern_ipc_perm.lock.of the new object.
* On error, the function returns a (negative) error code.
+ *
+ * To conserve sequence number space, especially with extended ipc_mni,
+ * the sequence number is incremented only when the newly allocated index
+ * is less than or equal to the previously allocated one.
*/
static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new)
{
@@ -215,17 +220,42 @@ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new)
*/
if (next_id < 0) { /* !CHECKPOINT_RESTORE or next_id is unset */
- new->seq = ids->seq++;
- if (ids->seq > IPCID_SEQ_MAX)
- ids->seq = 0;
- idx = idr_alloc(&ids->ipcs_idr, new, 0, 0, GFP_NOWAIT);
+ int max_idx;
+
+ max_idx = max(ids->in_use*3/2, ipc_min_cycle);
+ max_idx = min(max_idx, ipc_mni);
+
+ /* allocate the idx, with a NULL struct kern_ipc_perm */
+ idx = idr_alloc_cyclic(&ids->ipcs_idr, NULL, 0, max_idx,
+ GFP_NOWAIT);
+
+ if (idx >= 0) {
+ /*
+ * idx got allocated successfully.
+ * Now calculate the sequence number and set the
+ * pointer for real.
+ */
+ if (idx <= ids->last_idx) {
+ ids->seq++;
+ if (ids->seq >= ipcid_seq_max())
+ ids->seq = 0;
+ }
+ ids->last_idx = idx;
+
+ new->seq = ids->seq;
+ /* no need for smp_wmb(), this is done
+ * inside idr_replace, as part of
+ * rcu_assign_pointer
+ */
+ idr_replace(&ids->ipcs_idr, new, idx);
+ }
} else {
new->seq = ipcid_to_seqx(next_id);
idx = idr_alloc(&ids->ipcs_idr, new, ipcid_to_idx(next_id),
0, GFP_NOWAIT);
}
if (idx >= 0)
- new->id = SEQ_MULTIPLIER * new->seq + idx;
+ new->id = (new->seq << ipcmni_seq_shift()) + idx;
return idx;
}
@@ -253,8 +283,8 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit)
/* 1) Initialize the refcount so that ipc_rcu_putref works */
refcount_set(&new->refcount, 1);
- if (limit > IPCMNI)
- limit = IPCMNI;
+ if (limit > ipc_mni)
+ limit = ipc_mni;
if (ids->in_use >= limit)
return -ENOSPC;
@@ -737,7 +767,7 @@ static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos,
if (total >= ids->in_use)
return NULL;
- for (; pos < IPCMNI; pos++) {
+ for (; pos < ipc_mni; pos++) {
ipc = idr_find(&ids->ipcs_idr, pos);
if (ipc != NULL) {
*new_pos = pos + 1;
diff --git a/ipc/util.h b/ipc/util.h
index e272be622ae7..0fcf8e719b76 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -15,8 +15,37 @@
#include <linux/err.h>
#include <linux/ipc_namespace.h>
-#define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */
-#define SEQ_MULTIPLIER (IPCMNI)
+/*
+ * The IPC ID contains 2 separate numbers - index and sequence number.
+ * By default,
+ * bits 0-14: index (32k, 15 bits)
+ * bits 15-30: sequence number (64k, 16 bits)
+ *
+ * When IPCMNI extension mode is turned on, the composition changes:
+ * bits 0-23: index (16M, 24 bits)
+ * bits 24-30: sequence number (128, 7 bits)
+ */
+#define IPCMNI_SHIFT 15
+#define IPCMNI_EXTEND_SHIFT 24
+#define IPCMNI_EXTEND_MIN_CYCLE (RADIX_TREE_MAP_SIZE * RADIX_TREE_MAP_SIZE)
+#define IPCMNI (1 << IPCMNI_SHIFT)
+#define IPCMNI_EXTEND (1 << IPCMNI_EXTEND_SHIFT)
+
+#ifdef CONFIG_SYSVIPC_SYSCTL
+extern int ipc_mni;
+extern int ipc_mni_shift;
+extern int ipc_min_cycle;
+
+#define ipcmni_seq_shift() ipc_mni_shift
+#define IPCMNI_IDX_MASK ((1 << ipc_mni_shift) - 1)
+
+#else /* CONFIG_SYSVIPC_SYSCTL */
+
+#define ipc_mni IPCMNI
+#define ipc_min_cycle ((int)RADIX_TREE_MAP_SIZE)
+#define ipcmni_seq_shift() IPCMNI_SHIFT
+#define IPCMNI_IDX_MASK ((1 << IPCMNI_SHIFT) - 1)
+#endif /* CONFIG_SYSVIPC_SYSCTL */
void sem_init(void);
void msg_init(void);
@@ -96,9 +125,9 @@ struct pid_namespace *ipc_seq_pid_ns(struct seq_file *);
#define IPC_MSG_IDS 1
#define IPC_SHM_IDS 2
-#define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER)
-#define ipcid_to_seqx(id) ((id) / SEQ_MULTIPLIER)
-#define IPCID_SEQ_MAX min_t(int, INT_MAX/SEQ_MULTIPLIER, USHRT_MAX)
+#define ipcid_to_idx(id) ((id) & IPCMNI_IDX_MASK)
+#define ipcid_to_seqx(id) ((id) >> ipcmni_seq_shift())
+#define ipcid_seq_max() (INT_MAX >> ipcmni_seq_shift())
/* must be called with ids->rwsem acquired for writing */
int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int);
@@ -123,8 +152,8 @@ static inline int ipc_get_maxidx(struct ipc_ids *ids)
if (ids->in_use == 0)
return -1;
- if (ids->in_use == IPCMNI)
- return IPCMNI - 1;
+ if (ids->in_use == ipc_mni)
+ return ipc_mni - 1;
return ids->max_idx;
}
@@ -216,10 +245,10 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
static inline int sem_check_semmni(struct ipc_namespace *ns) {
/*
- * Check semmni range [0, IPCMNI]
+ * Check semmni range [0, ipc_mni]
* semmni is the last element of sem_ctls[4] array
*/
- return ((ns->sem_ctls[3] < 0) || (ns->sem_ctls[3] > IPCMNI))
+ return ((ns->sem_ctls[3] < 0) || (ns->sem_ctls[3] > ipc_mni))
? -ERANGE : 0;
}