From 0b173bc4daa8f8ec03a85abf5e47b23502ff80af Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:46 -0700 Subject: mm: kill vma flag VM_CAN_NONLINEAR Move actual pte filling for non-linear file mappings into the new special vma operation: ->remap_pages(). Filesystems must implement this method to get non-linear mapping support, if it uses filemap_fault() then generic_file_remap_pages() can be used. Now device drivers can implement this method and obtain nonlinear vma support. Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf #arch/tile Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_file.c | 1 + fs/btrfs/file.c | 2 +- fs/ceph/addr.c | 2 +- fs/cifs/file.c | 1 + fs/ext4/file.c | 2 +- fs/fuse/file.c | 1 + fs/gfs2/file.c | 2 +- fs/nfs/file.c | 1 + fs/nilfs2/file.c | 2 +- fs/ocfs2/mmap.c | 2 +- fs/ubifs/file.c | 1 + fs/xfs/xfs_file.c | 2 +- 12 files changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index dd6f7ee1e312..c2483e97beee 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -738,6 +738,7 @@ v9fs_cached_file_write(struct file *filp, const char __user * data, static const struct vm_operations_struct v9fs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = v9fs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5caf285c6e4d..f6b40e86121b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1599,6 +1599,7 @@ out: static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = btrfs_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) @@ -1610,7 +1611,6 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) file_accessed(filp); vma->vm_ops = &btrfs_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 22b6e4583faa..6690269f5dde 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1224,6 +1224,7 @@ out: static struct vm_operations_struct ceph_vmops = { .fault = filemap_fault, .page_mkwrite = ceph_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; int ceph_mmap(struct file *file, struct vm_area_struct *vma) @@ -1234,6 +1235,5 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma) return -ENOEXEC; file_accessed(file); vma->vm_ops = &ceph_vmops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 7d7bbdc4c8e7..edb25b4bbb95 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3003,6 +3003,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static struct vm_operations_struct cifs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = cifs_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ca6f07afe601..bf3966bccd34 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -207,6 +207,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = ext4_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) @@ -217,7 +218,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) return -ENOEXEC; file_accessed(file); vma->vm_ops = &ext4_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index aba15f1b7ad2..78d2837bc940 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1379,6 +1379,7 @@ static const struct vm_operations_struct fuse_file_vm_ops = { .close = fuse_vma_close, .fault = filemap_fault, .page_mkwrite = fuse_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 30e21997a1a1..0def0504afc1 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -492,6 +492,7 @@ out: static const struct vm_operations_struct gfs2_vm_ops = { .fault = filemap_fault, .page_mkwrite = gfs2_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; /** @@ -526,7 +527,6 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) return error; } vma->vm_ops = &gfs2_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 6a7fcab7ecb3..f692be97676d 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -578,6 +578,7 @@ out: static const struct vm_operations_struct nfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = nfs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int nfs_need_sync_write(struct file *filp, struct inode *inode) diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 5b387a4c293e..16f35f7423c5 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -135,13 +135,13 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static const struct vm_operations_struct nilfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = nilfs_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &nilfs_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index d150372fd81d..47a87dda54ce 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -173,6 +173,7 @@ out: static const struct vm_operations_struct ocfs2_file_vm_ops = { .fault = ocfs2_fault, .page_mkwrite = ocfs2_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) @@ -188,7 +189,6 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level); out: vma->vm_ops = &ocfs2_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index ff48c5a85309..5bc77817f382 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1536,6 +1536,7 @@ out_unlock: static const struct vm_operations_struct ubifs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = ubifs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 1eaeb8be3aae..aa473fa640a2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -940,7 +940,6 @@ xfs_file_mmap( struct vm_area_struct *vma) { vma->vm_ops = &xfs_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; file_accessed(filp); return 0; @@ -1443,4 +1442,5 @@ const struct file_operations xfs_dir_file_operations = { static const struct vm_operations_struct xfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = xfs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; -- cgit v1.2.3-59-g8ed1b From 0103bd16fb90bc741c7a03fd1ea4e8a505abad23 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:28:59 -0700 Subject: mm: prepare VM_DONTDUMP for using in drivers Rename VM_NODUMP into VM_DONTDUMP: this name matches other negative flags: VM_DONTEXPAND, VM_DONTCOPY. Currently this flag used only for sys_madvise. The next patch will use it for replacing the outdated flag VM_RESERVED. Also forbid madvise(MADV_DODUMP) for special kernel mappings VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 2 +- include/linux/mm.h | 2 +- mm/madvise.c | 8 ++++++-- 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 28a64e769527..2b72d26e2e4b 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1123,7 +1123,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, if (always_dump_vma(vma)) goto whole; - if (vma->vm_flags & VM_NODUMP) + if (vma->vm_flags & VM_DONTDUMP) return 0; /* Hugetlb memory check */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 25ef49c1f2bd..dc08d558e058 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -102,7 +102,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ -#define VM_NODUMP 0x04000000 /* Do not include in the core dump */ +#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ diff --git a/mm/madvise.c b/mm/madvise.c index 14d260fa0d17..03dfa5c7adb3 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -69,10 +69,14 @@ static long madvise_behavior(struct vm_area_struct * vma, new_flags &= ~VM_DONTCOPY; break; case MADV_DONTDUMP: - new_flags |= VM_NODUMP; + new_flags |= VM_DONTDUMP; break; case MADV_DODUMP: - new_flags &= ~VM_NODUMP; + if (new_flags & VM_SPECIAL) { + error = -EINVAL; + goto out; + } + new_flags &= ~VM_DONTDUMP; break; case MADV_MERGEABLE: case MADV_UNMERGEABLE: -- cgit v1.2.3-59-g8ed1b From 314e51b9851b4f4e8ab302243ff5a6fc6147f379 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:29:02 -0700 Subject: mm: kill vma flag VM_RESERVED and mm->reserved_vm counter A long time ago, in v2.4, VM_RESERVED kept swapout process off VMA, currently it lost original meaning but still has some effects: | effect | alternative flags -+------------------------+--------------------------------------------- 1| account as reserved_vm | VM_IO 2| skip in core dump | VM_IO, VM_DONTDUMP 3| do not merge or expand | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP 4| do not mlock | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP This patch removes reserved_vm counter from mm_struct. Seems like nobody cares about it, it does not exported into userspace directly, it only reduces total_vm showed in proc. Thus VM_RESERVED can be replaced with VM_IO or pair VM_DONTEXPAND | VM_DONTDUMP. remap_pfn_range() and io_remap_pfn_range() set VM_IO|VM_DONTEXPAND|VM_DONTDUMP. remap_vmalloc_range() set VM_DONTEXPAND | VM_DONTDUMP. [akpm@linux-foundation.org: drivers/vfio/pci/vfio_pci.c fixup] Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/unevictable-lru.txt | 4 ++-- arch/alpha/kernel/pci-sysfs.c | 2 +- arch/ia64/kernel/perfmon.c | 2 +- arch/ia64/mm/init.c | 3 ++- arch/powerpc/kvm/book3s_hv.c | 2 +- arch/sparc/kernel/pci.c | 2 +- arch/unicore32/kernel/process.c | 2 +- arch/x86/xen/mmu.c | 3 +-- drivers/char/mbcs.c | 2 +- drivers/char/mem.c | 2 +- drivers/char/mspec.c | 2 +- drivers/gpu/drm/drm_gem.c | 2 +- drivers/gpu/drm/drm_vm.c | 10 ++-------- drivers/gpu/drm/exynos/exynos_drm_gem.c | 2 +- drivers/gpu/drm/gma500/framebuffer.c | 3 +-- drivers/gpu/drm/ttm/ttm_bo_vm.c | 4 ++-- drivers/gpu/drm/udl/udl_fb.c | 2 +- drivers/infiniband/hw/ehca/ehca_uverbs.c | 4 ++-- drivers/infiniband/hw/ipath/ipath_file_ops.c | 2 +- drivers/infiniband/hw/qib/qib_file_ops.c | 2 +- drivers/media/pci/meye/meye.c | 2 +- drivers/media/platform/omap/omap_vout.c | 2 +- drivers/media/platform/vino.c | 2 +- drivers/media/usb/sn9c102/sn9c102_core.c | 3 +-- drivers/media/usb/usbvision/usbvision-video.c | 3 +-- drivers/media/v4l2-core/videobuf-dma-sg.c | 2 +- drivers/media/v4l2-core/videobuf-vmalloc.c | 2 +- drivers/media/v4l2-core/videobuf2-memops.c | 2 +- drivers/misc/carma/carma-fpga.c | 2 -- drivers/misc/sgi-gru/grufile.c | 5 ++--- drivers/mtd/mtdchar.c | 2 +- drivers/scsi/sg.c | 2 +- drivers/staging/omapdrm/omap_gem_dmabuf.c | 2 +- drivers/staging/tidspbridge/rmgr/drv_interface.c | 2 +- drivers/uio/uio.c | 4 +--- drivers/usb/mon/mon_bin.c | 2 +- drivers/video/68328fb.c | 2 +- drivers/video/aty/atyfb_base.c | 3 +-- drivers/video/fb-puv3.c | 3 +-- drivers/video/fb_defio.c | 2 +- drivers/video/fbmem.c | 3 +-- drivers/video/gbefb.c | 2 +- drivers/video/omap2/omapfb/omapfb-main.c | 2 +- drivers/video/sbuslib.c | 5 ++--- drivers/video/smscufx.c | 1 - drivers/video/udlfb.c | 1 - drivers/video/vermilion/vermilion.c | 1 - drivers/video/vfb.c | 1 - drivers/xen/gntalloc.c | 2 +- drivers/xen/gntdev.c | 2 +- drivers/xen/privcmd.c | 3 ++- fs/binfmt_elf.c | 2 +- fs/binfmt_elf_fdpic.c | 2 +- fs/hugetlbfs/inode.c | 2 +- fs/proc/task_mmu.c | 2 +- include/linux/mempolicy.h | 2 +- include/linux/mm.h | 3 +-- include/linux/mm_types.h | 1 - kernel/events/core.c | 2 +- mm/ksm.c | 3 +-- mm/memory.c | 11 +++++------ mm/mlock.c | 2 +- mm/mmap.c | 2 -- mm/nommu.c | 2 +- mm/vmalloc.c | 3 +-- security/selinux/selinuxfs.c | 2 +- sound/core/pcm_native.c | 6 +++--- sound/usb/usx2y/us122l.c | 2 +- sound/usb/usx2y/usX2Yhwdep.c | 2 +- sound/usb/usx2y/usx2yhwdeppcm.c | 2 +- 70 files changed, 77 insertions(+), 105 deletions(-) (limited to 'fs') diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt index fa206cccf89f..323ff5dba1cc 100644 --- a/Documentation/vm/unevictable-lru.txt +++ b/Documentation/vm/unevictable-lru.txt @@ -371,8 +371,8 @@ mlock_fixup() filters several classes of "special" VMAs: mlock_fixup() will call make_pages_present() in the hugetlbfs VMA range to allocate the huge pages and populate the ptes. -3) VMAs with VM_DONTEXPAND or VM_RESERVED are generally userspace mappings of - kernel pages, such as the VDSO page, relay channel pages, etc. These pages +3) VMAs with VM_DONTEXPAND are generally userspace mappings of kernel pages, + such as the VDSO page, relay channel pages, etc. These pages are inherently unevictable and are not managed on the LRU lists. mlock_fixup() treats these VMAs the same as hugetlbfs VMAs. It calls make_pages_present() to populate the ptes. diff --git a/arch/alpha/kernel/pci-sysfs.c b/arch/alpha/kernel/pci-sysfs.c index 53649c7d0068..b51f7b4818cd 100644 --- a/arch/alpha/kernel/pci-sysfs.c +++ b/arch/alpha/kernel/pci-sysfs.c @@ -26,7 +26,7 @@ static int hose_mmap_page_range(struct pci_controller *hose, base = sparse ? hose->sparse_io_base : hose->dense_io_base; vma->vm_pgoff += base >> PAGE_SHIFT; - vma->vm_flags |= (VM_IO | VM_RESERVED); + vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, vma->vm_end - vma->vm_start, diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index f388b4e18a37..ea39eba61ef5 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2307,7 +2307,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t */ vma->vm_mm = mm; vma->vm_file = get_file(filp); - vma->vm_flags = VM_READ| VM_MAYREAD |VM_RESERVED; + vma->vm_flags = VM_READ|VM_MAYREAD|VM_DONTEXPAND|VM_DONTDUMP; vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */ /* diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 0eab454867a2..082e383c1b6f 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -138,7 +138,8 @@ ia64_init_addr_space (void) vma->vm_mm = current->mm; vma->vm_end = PAGE_SIZE; vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT); - vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | VM_RESERVED; + vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | + VM_DONTEXPAND | VM_DONTDUMP; down_write(¤t->mm->mmap_sem); if (insert_vm_struct(current->mm, vma)) { up_write(¤t->mm->mmap_sem); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 83e929e66f9d..721d4603a235 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1183,7 +1183,7 @@ static const struct vm_operations_struct kvm_rma_vm_ops = { static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma) { - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &kvm_rma_vm_ops; return 0; } diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c index acc8c838ff72..75b31bcdeadf 100644 --- a/arch/sparc/kernel/pci.c +++ b/arch/sparc/kernel/pci.c @@ -779,7 +779,7 @@ static int __pci_mmap_make_offset(struct pci_dev *pdev, static void __pci_mmap_set_flags(struct pci_dev *dev, struct vm_area_struct *vma, enum pci_mmap_state mmap_state) { - vma->vm_flags |= (VM_IO | VM_RESERVED); + vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; } /* Set vm_page_prot of VMA, as appropriate for this architecture, for a pci diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index b6f0458c3143..b008586dad75 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c @@ -380,7 +380,7 @@ int vectors_user_mapping(void) return install_special_mapping(mm, 0xffff0000, PAGE_SIZE, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC | - VM_RESERVED, + VM_DONTEXPAND | VM_DONTDUMP, NULL); } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 5a16824cc2b3..fd28d86fe3d2 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2451,8 +2451,7 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma, prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); - BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) == - (VM_PFNMAP | VM_RESERVED | VM_IO))); + BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); rmd.mfn = mfn; rmd.prot = prot; diff --git a/drivers/char/mbcs.c b/drivers/char/mbcs.c index 0c7d340b9ab9..f74e892711dd 100644 --- a/drivers/char/mbcs.c +++ b/drivers/char/mbcs.c @@ -507,7 +507,7 @@ static int mbcs_gscr_mmap(struct file *fp, struct vm_area_struct *vma) vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - /* Remap-pfn-range will mark the range VM_IO and VM_RESERVED */ + /* Remap-pfn-range will mark the range VM_IO */ if (remap_pfn_range(vma, vma->vm_start, __pa(soft->gscr_addr) >> PAGE_SHIFT, diff --git a/drivers/char/mem.c b/drivers/char/mem.c index e5eedfa24c91..0537903c985b 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -322,7 +322,7 @@ static int mmap_mem(struct file *file, struct vm_area_struct *vma) vma->vm_ops = &mmap_mem_ops; - /* Remap-pfn-range will mark the range VM_IO and VM_RESERVED */ + /* Remap-pfn-range will mark the range VM_IO */ if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c index 845f97fd1832..e1f60f968fdd 100644 --- a/drivers/char/mspec.c +++ b/drivers/char/mspec.c @@ -286,7 +286,7 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma, atomic_set(&vdata->refcnt, 1); vma->vm_private_data = vdata; - vma->vm_flags |= (VM_IO | VM_RESERVED | VM_PFNMAP | VM_DONTEXPAND); + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; if (vdata->type == MSPEC_FETCHOP || vdata->type == MSPEC_UNCACHED) vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &mspec_vm_ops; diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index 92177d5aedee..24efae464e2c 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -706,7 +706,7 @@ int drm_gem_mmap(struct file *filp, struct vm_area_struct *vma) goto out_unlock; } - vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND; + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = obj->dev->driver->gem_vm_ops; vma->vm_private_data = map->handle; vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c index 23a824e6a22a..db7bd292410b 100644 --- a/drivers/gpu/drm/drm_vm.c +++ b/drivers/gpu/drm/drm_vm.c @@ -514,8 +514,7 @@ static int drm_mmap_dma(struct file *filp, struct vm_area_struct *vma) vma->vm_ops = &drm_vm_dma_ops; - vma->vm_flags |= VM_RESERVED; /* Don't swap */ - vma->vm_flags |= VM_DONTEXPAND; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; drm_vm_open_locked(dev, vma); return 0; @@ -643,21 +642,16 @@ int drm_mmap_locked(struct file *filp, struct vm_area_struct *vma) case _DRM_SHM: vma->vm_ops = &drm_vm_shm_ops; vma->vm_private_data = (void *)map; - /* Don't let this area swap. Change when - DRM_KERNEL advisory is supported. */ - vma->vm_flags |= VM_RESERVED; break; case _DRM_SCATTER_GATHER: vma->vm_ops = &drm_vm_sg_ops; vma->vm_private_data = (void *)map; - vma->vm_flags |= VM_RESERVED; vma->vm_page_prot = drm_dma_prot(map->type, vma); break; default: return -EINVAL; /* This should never happen. */ } - vma->vm_flags |= VM_RESERVED; /* Don't swap */ - vma->vm_flags |= VM_DONTEXPAND; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; drm_vm_open_locked(dev, vma); return 0; diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c index fcdbe46914f7..d2545560664f 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c @@ -500,7 +500,7 @@ static int exynos_drm_gem_mmap_buffer(struct file *filp, DRM_DEBUG_KMS("%s\n", __FILE__); - vma->vm_flags |= (VM_IO | VM_RESERVED); + vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; update_vm_cache_attr(exynos_gem_obj, vma); diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c index 884ba73ac6ce..afded54dbb10 100644 --- a/drivers/gpu/drm/gma500/framebuffer.c +++ b/drivers/gpu/drm/gma500/framebuffer.c @@ -178,8 +178,7 @@ static int psbfb_mmap(struct fb_info *info, struct vm_area_struct *vma) */ vma->vm_ops = &psbfb_vm_ops; vma->vm_private_data = (void *)psbfb; - vma->vm_flags |= VM_RESERVED | VM_IO | - VM_MIXEDMAP | VM_DONTEXPAND; + vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND | VM_DONTDUMP; return 0; } diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index a877813571a4..3ba72dbdc4bd 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c @@ -285,7 +285,7 @@ int ttm_bo_mmap(struct file *filp, struct vm_area_struct *vma, */ vma->vm_private_data = bo; - vma->vm_flags |= VM_RESERVED | VM_IO | VM_MIXEDMAP | VM_DONTEXPAND; + vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND | VM_DONTDUMP; return 0; out_unref: ttm_bo_unref(&bo); @@ -300,7 +300,7 @@ int ttm_fbdev_mmap(struct vm_area_struct *vma, struct ttm_buffer_object *bo) vma->vm_ops = &ttm_bo_vm_ops; vma->vm_private_data = ttm_bo_reference(bo); - vma->vm_flags |= VM_RESERVED | VM_IO | VM_MIXEDMAP | VM_DONTEXPAND; + vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND; return 0; } EXPORT_SYMBOL(ttm_fbdev_mmap); diff --git a/drivers/gpu/drm/udl/udl_fb.c b/drivers/gpu/drm/udl/udl_fb.c index 67df842fbb33..69a2b16f42a6 100644 --- a/drivers/gpu/drm/udl/udl_fb.c +++ b/drivers/gpu/drm/udl/udl_fb.c @@ -243,7 +243,7 @@ static int udl_fb_mmap(struct fb_info *info, struct vm_area_struct *vma) size = 0; } - vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ return 0; } diff --git a/drivers/infiniband/hw/ehca/ehca_uverbs.c b/drivers/infiniband/hw/ehca/ehca_uverbs.c index 45ee89b65c23..1a1d5d99fcf9 100644 --- a/drivers/infiniband/hw/ehca/ehca_uverbs.c +++ b/drivers/infiniband/hw/ehca/ehca_uverbs.c @@ -117,7 +117,7 @@ static int ehca_mmap_fw(struct vm_area_struct *vma, struct h_galpas *galpas, physical = galpas->user.fw_handle; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); ehca_gen_dbg("vsize=%llx physical=%llx", vsize, physical); - /* VM_IO | VM_RESERVED are set by remap_pfn_range() */ + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ ret = remap_4k_pfn(vma, vma->vm_start, physical >> EHCA_PAGESHIFT, vma->vm_page_prot); if (unlikely(ret)) { @@ -139,7 +139,7 @@ static int ehca_mmap_queue(struct vm_area_struct *vma, struct ipz_queue *queue, u64 start, ofs; struct page *page; - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; start = vma->vm_start; for (ofs = 0; ofs < queue->queue_length; ofs += PAGE_SIZE) { u64 virt_addr = (u64)ipz_qeit_calc(queue, ofs); diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c index 736d9edbdbe7..3eb7e454849b 100644 --- a/drivers/infiniband/hw/ipath/ipath_file_ops.c +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -1225,7 +1225,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT; vma->vm_ops = &ipath_file_vm_ops; - vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; ret = 1; bail: diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index faa44cb08071..959a5c4ff812 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -971,7 +971,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT; vma->vm_ops = &qib_file_vm_ops; - vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; ret = 1; bail: diff --git a/drivers/media/pci/meye/meye.c b/drivers/media/pci/meye/meye.c index 7bc775219f97..e5a76da86081 100644 --- a/drivers/media/pci/meye/meye.c +++ b/drivers/media/pci/meye/meye.c @@ -1647,7 +1647,7 @@ static int meye_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_ops = &meye_vm_ops; vma->vm_flags &= ~VM_IO; /* not I/O memory */ - vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_private_data = (void *) (offset / gbufsize); meye_vm_open(vma); diff --git a/drivers/media/platform/omap/omap_vout.c b/drivers/media/platform/omap/omap_vout.c index 66ac21d466af..134016f0e660 100644 --- a/drivers/media/platform/omap/omap_vout.c +++ b/drivers/media/platform/omap/omap_vout.c @@ -911,7 +911,7 @@ static int omap_vout_mmap(struct file *file, struct vm_area_struct *vma) q->bufs[i]->baddr = vma->vm_start; - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); vma->vm_ops = &omap_vout_vm_ops; vma->vm_private_data = (void *) vout; diff --git a/drivers/media/platform/vino.c b/drivers/media/platform/vino.c index 790d96cffeea..70b0bf4b2900 100644 --- a/drivers/media/platform/vino.c +++ b/drivers/media/platform/vino.c @@ -3950,7 +3950,7 @@ found: fb->map_count = 1; - vma->vm_flags |= VM_DONTEXPAND | VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_flags &= ~VM_IO; vma->vm_private_data = fb; vma->vm_file = file; diff --git a/drivers/media/usb/sn9c102/sn9c102_core.c b/drivers/media/usb/sn9c102/sn9c102_core.c index 19ea780b16ff..5bfc8e2f018f 100644 --- a/drivers/media/usb/sn9c102/sn9c102_core.c +++ b/drivers/media/usb/sn9c102/sn9c102_core.c @@ -2126,8 +2126,7 @@ static int sn9c102_mmap(struct file* filp, struct vm_area_struct *vma) return -EINVAL; } - vma->vm_flags |= VM_IO; - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; pos = cam->frame[i].bufmem; while (size > 0) { /* size is page-aligned */ diff --git a/drivers/media/usb/usbvision/usbvision-video.c b/drivers/media/usb/usbvision/usbvision-video.c index f67018ed3795..5c36a57e6590 100644 --- a/drivers/media/usb/usbvision/usbvision-video.c +++ b/drivers/media/usb/usbvision/usbvision-video.c @@ -1108,8 +1108,7 @@ static int usbvision_mmap(struct file *file, struct vm_area_struct *vma) } /* VM_IO is eventually going to replace PageReserved altogether */ - vma->vm_flags |= VM_IO; - vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ + vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; pos = usbvision->frame[i].data; while (size > 0) { diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index f300deafd268..828e7c10bd70 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -582,7 +582,7 @@ static int __videobuf_mmap_mapper(struct videobuf_queue *q, map->count = 1; map->q = q; vma->vm_ops = &videobuf_vm_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_flags &= ~VM_IO; /* using shared anonymous pages */ vma->vm_private_data = map; dprintk(1, "mmap %p: q=%p %08lx-%08lx pgoff %08lx bufs %d-%d\n", diff --git a/drivers/media/v4l2-core/videobuf-vmalloc.c b/drivers/media/v4l2-core/videobuf-vmalloc.c index df142580e44c..2ff7fcc77b11 100644 --- a/drivers/media/v4l2-core/videobuf-vmalloc.c +++ b/drivers/media/v4l2-core/videobuf-vmalloc.c @@ -270,7 +270,7 @@ static int __videobuf_mmap_mapper(struct videobuf_queue *q, } vma->vm_ops = &videobuf_vm_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_private_data = map; dprintk(1, "mmap %p: q=%p %08lx-%08lx (%lx) pgoff %08lx buf %d\n", diff --git a/drivers/media/v4l2-core/videobuf2-memops.c b/drivers/media/v4l2-core/videobuf2-memops.c index 504cd4cbe29e..051ea3571b20 100644 --- a/drivers/media/v4l2-core/videobuf2-memops.c +++ b/drivers/media/v4l2-core/videobuf2-memops.c @@ -163,7 +163,7 @@ int vb2_mmap_pfn_range(struct vm_area_struct *vma, unsigned long paddr, return ret; } - vma->vm_flags |= VM_DONTEXPAND | VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_private_data = priv; vma->vm_ops = vm_ops; diff --git a/drivers/misc/carma/carma-fpga.c b/drivers/misc/carma/carma-fpga.c index 0c43297ed9ac..8835eabb3b87 100644 --- a/drivers/misc/carma/carma-fpga.c +++ b/drivers/misc/carma/carma-fpga.c @@ -1243,8 +1243,6 @@ static int data_mmap(struct file *filp, struct vm_area_struct *vma) return -EINVAL; } - /* IO memory (stop cacheing) */ - vma->vm_flags |= VM_IO | VM_RESERVED; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); return io_remap_pfn_range(vma, vma->vm_start, addr, vsize, diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c index ecafa4ba238b..492c8cac69ac 100644 --- a/drivers/misc/sgi-gru/grufile.c +++ b/drivers/misc/sgi-gru/grufile.c @@ -108,9 +108,8 @@ static int gru_file_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_end & (GRU_GSEG_PAGESIZE - 1)) return -EINVAL; - vma->vm_flags |= - (VM_IO | VM_DONTCOPY | VM_LOCKED | VM_DONTEXPAND | VM_PFNMAP | - VM_RESERVED); + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_LOCKED | + VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_page_prot = PAGE_SHARED; vma->vm_ops = &gru_vm_ops; diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index a6e74514e662..73ae81a629f2 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -1182,7 +1182,7 @@ static int mtdchar_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; if (set_vm_offset(vma, off) < 0) return -EINVAL; - vma->vm_flags |= VM_IO | VM_RESERVED; + vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; #ifdef pgprot_noncached if (file->f_flags & O_DSYNC || off >= __pa(high_memory)) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 9c5c5f2b3962..be2c9a6561ff 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1257,7 +1257,7 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma) } sfp->mmap_called = 1; - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_private_data = sfp; vma->vm_ops = &sg_mmap_vm_ops; return 0; diff --git a/drivers/staging/omapdrm/omap_gem_dmabuf.c b/drivers/staging/omapdrm/omap_gem_dmabuf.c index 42728e0cc194..c6f3ef6f57b9 100644 --- a/drivers/staging/omapdrm/omap_gem_dmabuf.c +++ b/drivers/staging/omapdrm/omap_gem_dmabuf.c @@ -160,7 +160,7 @@ static int omap_gem_dmabuf_mmap(struct dma_buf *buffer, goto out_unlock; } - vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND; + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = obj->dev->driver->gem_vm_ops; vma->vm_private_data = obj; vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); diff --git a/drivers/staging/tidspbridge/rmgr/drv_interface.c b/drivers/staging/tidspbridge/rmgr/drv_interface.c index bddea1d3b2c3..701a11ac676d 100644 --- a/drivers/staging/tidspbridge/rmgr/drv_interface.c +++ b/drivers/staging/tidspbridge/rmgr/drv_interface.c @@ -261,7 +261,7 @@ static int bridge_mmap(struct file *filp, struct vm_area_struct *vma) { u32 status; - vma->vm_flags |= VM_RESERVED | VM_IO; + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); dev_dbg(bridge, "%s: vm filp %p start %lx end %lx page_prot %ulx " diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index a783d533a1a6..5110f367f1f1 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -653,8 +653,6 @@ static int uio_mmap_physical(struct vm_area_struct *vma) if (mi < 0) return -EINVAL; - vma->vm_flags |= VM_IO | VM_RESERVED; - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); return remap_pfn_range(vma, @@ -666,7 +664,7 @@ static int uio_mmap_physical(struct vm_area_struct *vma) static int uio_mmap_logical(struct vm_area_struct *vma) { - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &uio_vm_ops; uio_vma_open(vma); return 0; diff --git a/drivers/usb/mon/mon_bin.c b/drivers/usb/mon/mon_bin.c index 91cd85076a44..9a62e89d6dc0 100644 --- a/drivers/usb/mon/mon_bin.c +++ b/drivers/usb/mon/mon_bin.c @@ -1247,7 +1247,7 @@ static int mon_bin_mmap(struct file *filp, struct vm_area_struct *vma) { /* don't do anything here: "fault" will set up page table entries */ vma->vm_ops = &mon_bin_vm_ops; - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_private_data = filp->private_data; mon_bin_vma_open(vma); return 0; diff --git a/drivers/video/68328fb.c b/drivers/video/68328fb.c index a425d65d5ba2..fa44fbed397d 100644 --- a/drivers/video/68328fb.c +++ b/drivers/video/68328fb.c @@ -400,7 +400,7 @@ static int mc68x328fb_mmap(struct fb_info *info, struct vm_area_struct *vma) #ifndef MMU /* this is uClinux (no MMU) specific code */ - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_start = videomemory; return 0; diff --git a/drivers/video/aty/atyfb_base.c b/drivers/video/aty/atyfb_base.c index 3f2e8c13f1ca..868932f904ef 100644 --- a/drivers/video/aty/atyfb_base.c +++ b/drivers/video/aty/atyfb_base.c @@ -1942,8 +1942,7 @@ static int atyfb_mmap(struct fb_info *info, struct vm_area_struct *vma) off = vma->vm_pgoff << PAGE_SHIFT; size = vma->vm_end - vma->vm_start; - /* To stop the swapper from even considering these pages. */ - vma->vm_flags |= (VM_IO | VM_RESERVED); + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ if (((vma->vm_pgoff == 0) && (size == info->fix.smem_len)) || ((off == info->fix.smem_len) && (size == PAGE_SIZE))) diff --git a/drivers/video/fb-puv3.c b/drivers/video/fb-puv3.c index 60a787fa32cf..7d106f1f4906 100644 --- a/drivers/video/fb-puv3.c +++ b/drivers/video/fb-puv3.c @@ -653,9 +653,8 @@ int unifb_mmap(struct fb_info *info, vma->vm_page_prot)) return -EAGAIN; - vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ return 0; - } static struct fb_ops unifb_ops = { diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c index 64cda560c488..88cad6b8b479 100644 --- a/drivers/video/fb_defio.c +++ b/drivers/video/fb_defio.c @@ -166,7 +166,7 @@ static const struct address_space_operations fb_deferred_io_aops = { static int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma) { vma->vm_ops = &fb_deferred_io_vm_ops; - vma->vm_flags |= ( VM_RESERVED | VM_DONTEXPAND ); + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; if (!(info->flags & FBINFO_VIRTFB)) vma->vm_flags |= VM_IO; vma->vm_private_data = info; diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c index 0dff12a1daef..3ff0105a496a 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c @@ -1410,8 +1410,7 @@ fb_mmap(struct file *file, struct vm_area_struct * vma) return -EINVAL; off += start; vma->vm_pgoff = off >> PAGE_SHIFT; - /* This is an IO map - tell maydump to skip this VMA */ - vma->vm_flags |= VM_IO | VM_RESERVED; + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by io_remap_pfn_range()*/ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); fb_pgprotect(file, vma, off); if (io_remap_pfn_range(vma, vma->vm_start, off >> PAGE_SHIFT, diff --git a/drivers/video/gbefb.c b/drivers/video/gbefb.c index 7e7b7a9ba274..05e2a8a99d8f 100644 --- a/drivers/video/gbefb.c +++ b/drivers/video/gbefb.c @@ -1024,7 +1024,7 @@ static int gbefb_mmap(struct fb_info *info, pgprot_val(vma->vm_page_prot) = pgprot_fb(pgprot_val(vma->vm_page_prot)); - vma->vm_flags |= VM_IO | VM_RESERVED; + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ /* look for the starting tile */ tile = &gbe_tiles.cpu[offset >> TILE_SHIFT]; diff --git a/drivers/video/omap2/omapfb/omapfb-main.c b/drivers/video/omap2/omapfb/omapfb-main.c index 3c39aa8de928..15373f4aee19 100644 --- a/drivers/video/omap2/omapfb/omapfb-main.c +++ b/drivers/video/omap2/omapfb/omapfb-main.c @@ -1128,7 +1128,7 @@ static int omapfb_mmap(struct fb_info *fbi, struct vm_area_struct *vma) DBG("user mmap region start %lx, len %d, off %lx\n", start, len, off); vma->vm_pgoff = off >> PAGE_SHIFT; - vma->vm_flags |= VM_IO | VM_RESERVED; + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); vma->vm_ops = &mmap_user_ops; vma->vm_private_data = rg; diff --git a/drivers/video/sbuslib.c b/drivers/video/sbuslib.c index 3c1de981a18c..296afae442f4 100644 --- a/drivers/video/sbuslib.c +++ b/drivers/video/sbuslib.c @@ -57,9 +57,8 @@ int sbusfb_mmap_helper(struct sbus_mmap_map *map, off = vma->vm_pgoff << PAGE_SHIFT; - /* To stop the swapper from even considering these pages */ - vma->vm_flags |= (VM_IO | VM_RESERVED); - + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); /* Each page, see which map applies */ diff --git a/drivers/video/smscufx.c b/drivers/video/smscufx.c index 5533a32c6ca1..97bd6620c364 100644 --- a/drivers/video/smscufx.c +++ b/drivers/video/smscufx.c @@ -803,7 +803,6 @@ static int ufx_ops_mmap(struct fb_info *info, struct vm_area_struct *vma) size = 0; } - vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ return 0; } diff --git a/drivers/video/udlfb.c b/drivers/video/udlfb.c index 8af64148294b..f45eba3d6150 100644 --- a/drivers/video/udlfb.c +++ b/drivers/video/udlfb.c @@ -345,7 +345,6 @@ static int dlfb_ops_mmap(struct fb_info *info, struct vm_area_struct *vma) size = 0; } - vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ return 0; } diff --git a/drivers/video/vermilion/vermilion.c b/drivers/video/vermilion/vermilion.c index 970e43d13f52..89aef343e295 100644 --- a/drivers/video/vermilion/vermilion.c +++ b/drivers/video/vermilion/vermilion.c @@ -1018,7 +1018,6 @@ static int vmlfb_mmap(struct fb_info *info, struct vm_area_struct *vma) offset += vinfo->vram_start; pgprot_val(vma->vm_page_prot) |= _PAGE_PCD; pgprot_val(vma->vm_page_prot) &= ~_PAGE_PWT; - vma->vm_flags |= VM_RESERVED | VM_IO; if (remap_pfn_range(vma, vma->vm_start, offset >> PAGE_SHIFT, size, vma->vm_page_prot)) return -EAGAIN; diff --git a/drivers/video/vfb.c b/drivers/video/vfb.c index 501a922aa9dc..c7f692525b88 100644 --- a/drivers/video/vfb.c +++ b/drivers/video/vfb.c @@ -439,7 +439,6 @@ static int vfb_mmap(struct fb_info *info, size = 0; } - vma->vm_flags |= VM_RESERVED; /* avoid to swap out this VMA */ return 0; } diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index 934985d14c24..4097987b330e 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -535,7 +535,7 @@ static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma) vma->vm_private_data = vm_priv; - vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &gntalloc_vmops; diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 5df9fd847b2e..610bfc6be177 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -720,7 +720,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) vma->vm_ops = &gntdev_vmops; - vma->vm_flags |= VM_RESERVED|VM_DONTEXPAND; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; if (use_ptemod) vma->vm_flags |= VM_DONTCOPY; diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index ef6389580b8c..8adb9cc267f9 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -455,7 +455,8 @@ static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) { /* DONTCOPY is essential for Xen because copy_page_range doesn't know * how to recreate these mappings */ - vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY | VM_PFNMAP; + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTCOPY | + VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &privcmd_vm_ops; vma->vm_private_data = NULL; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 2b72d26e2e4b..e800dec958c3 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1135,7 +1135,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, } /* Do not dump I/O mapped devices or special mappings */ - if (vma->vm_flags & (VM_IO | VM_RESERVED)) + if (vma->vm_flags & VM_IO) return 0; /* By default, dump shared memory if mapped from an anonymous file. */ diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 08d812b32282..262db114ff01 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1205,7 +1205,7 @@ static int maydump(struct vm_area_struct *vma, unsigned long mm_flags) int dump_ok; /* Do not dump I/O mapped devices or special mappings */ - if (vma->vm_flags & (VM_IO | VM_RESERVED)) { + if (vma->vm_flags & VM_IO) { kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags); return 0; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 9460120a5170..0a0ab8e21b19 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -110,7 +110,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * way when do_mmap_pgoff unwinds (may be important on powerpc * and ia64). */ - vma->vm_flags |= VM_HUGETLB | VM_RESERVED; + vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &hugetlb_vm_ops; if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4540b8f76f16..79827ce03e3b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -54,7 +54,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmPTE:\t%8lu kB\n" "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), - (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), + total_vm << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), mm->pinned_vm << (PAGE_SHIFT-10), hiwater_rss << (PAGE_SHIFT-10), diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 95b738c7abff..ba7a0ff19d39 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -239,7 +239,7 @@ extern int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, /* Check if a vma is migratable */ static inline int vma_migratable(struct vm_area_struct *vma) { - if (vma->vm_flags & (VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED)) + if (vma->vm_flags & (VM_IO | VM_HUGETLB | VM_PFNMAP)) return 0; /* * Migration allocates pages in the highest zone. If we cannot diff --git a/include/linux/mm.h b/include/linux/mm.h index dc08d558e058..0514fe9d3c84 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -96,7 +96,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ -#define VM_RESERVED 0x00080000 /* Count as reserved_vm like IO */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ @@ -148,7 +147,7 @@ extern unsigned int kobjsize(const void *objp); * Special vmas that are non-mergable, non-mlock()able. * Note: mm/huge_memory.c VM_NO_THP depends on this definition. */ -#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) +#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP) /* * mapping from the currently active vm_flags protection bits (the diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 58d3173eb365..a57a43f5ca7c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -349,7 +349,6 @@ struct mm_struct { unsigned long shared_vm; /* Shared pages (files) */ unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE */ unsigned long stack_vm; /* VM_GROWSUP/DOWN */ - unsigned long reserved_vm; /* VM_RESERVED|VM_IO pages */ unsigned long def_flags; unsigned long nr_ptes; /* Page table pages */ unsigned long start_code, end_code, start_data, end_data; diff --git a/kernel/events/core.c b/kernel/events/core.c index f16f3c58f11a..cda3ebd49e86 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3671,7 +3671,7 @@ unlock: atomic_inc(&event->mmap_count); mutex_unlock(&event->mmap_mutex); - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &perf_mmap_vmops; return ret; diff --git a/mm/ksm.c b/mm/ksm.c index f9ccb16559ee..9638620a7530 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1469,8 +1469,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, */ if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_RESERVED | VM_HUGETLB | - VM_NONLINEAR | VM_MIXEDMAP)) + VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) return 0; /* just ignore the advice */ #ifdef VM_SAO diff --git a/mm/memory.c b/mm/memory.c index 7b1e4feaec06..e09c04813186 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2297,14 +2297,13 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * rest of the world about it: * VM_IO tells people not to look at these pages * (accesses can have side effects). - * VM_RESERVED is specified all over the place, because - * in 2.4 it kept swapout's vma scan off this vma; but - * in 2.6 the LRU scan won't even find its pages, so this - * flag means no more than count its pages in reserved_vm, - * and omit it from core dump, even when VM_IO turned off. * VM_PFNMAP tells the core MM that the base pages are just * raw PFN mappings, and do not have a "struct page" associated * with them. + * VM_DONTEXPAND + * Disable vma merging and expanding with mremap(). + * VM_DONTDUMP + * Omit vma from core dump, even when VM_IO turned off. * * There's a horrible special case to handle copy-on-write * behaviour that some programs depend on. We mark the "original" @@ -2321,7 +2320,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, if (err) return -EINVAL; - vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; diff --git a/mm/mlock.c b/mm/mlock.c index ef726e8aa8e9..a948be4b7ba7 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -227,7 +227,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, if (vma->vm_flags & (VM_IO | VM_PFNMAP)) goto no_mlock; - if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || + if (!((vma->vm_flags & VM_DONTEXPAND) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))) { diff --git a/mm/mmap.c b/mm/mmap.c index c1ad2e78ea58..a76042dc806d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -945,8 +945,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, mm->exec_vm += pages; } else if (flags & stack_flags) mm->stack_vm += pages; - if (flags & (VM_RESERVED|VM_IO)) - mm->reserved_vm += pages; } #endif /* CONFIG_PROC_FS */ diff --git a/mm/nommu.c b/mm/nommu.c index 9c4a7b63a4df..12e84e69dd06 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1811,7 +1811,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, if (addr != (pfn << PAGE_SHIFT)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; return 0; } EXPORT_SYMBOL(remap_pfn_range); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2bb90b1d241c..8de704679bfc 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2163,8 +2163,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, usize -= PAGE_SIZE; } while (usize > 0); - /* Prevent "things" like memory migration? VM_flags need a cleanup... */ - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; return 0; } diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 55af8c5b57e6..3a6e8731646c 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -485,7 +485,7 @@ static int sel_mmap_policy(struct file *filp, struct vm_area_struct *vma) return -EACCES; } - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &sel_mmap_policy_ops; return 0; diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 20554eff5a21..5e12e5bacbba 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -3039,7 +3039,7 @@ static int snd_pcm_mmap_status(struct snd_pcm_substream *substream, struct file return -EINVAL; area->vm_ops = &snd_pcm_vm_ops_status; area->vm_private_data = substream; - area->vm_flags |= VM_RESERVED; + area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; return 0; } @@ -3076,7 +3076,7 @@ static int snd_pcm_mmap_control(struct snd_pcm_substream *substream, struct file return -EINVAL; area->vm_ops = &snd_pcm_vm_ops_control; area->vm_private_data = substream; - area->vm_flags |= VM_RESERVED; + area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; return 0; } #else /* ! coherent mmap */ @@ -3170,7 +3170,7 @@ static const struct vm_operations_struct snd_pcm_vm_ops_data_fault = { int snd_pcm_lib_default_mmap(struct snd_pcm_substream *substream, struct vm_area_struct *area) { - area->vm_flags |= VM_RESERVED; + area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; #ifdef ARCH_HAS_DMA_MMAP_COHERENT if (!substream->ops->page && substream->dma_buffer.dev.type == SNDRV_DMA_TYPE_DEV) diff --git a/sound/usb/usx2y/us122l.c b/sound/usb/usx2y/us122l.c index c4fd3b1d9592..d0323a693ba2 100644 --- a/sound/usb/usx2y/us122l.c +++ b/sound/usb/usx2y/us122l.c @@ -262,7 +262,7 @@ static int usb_stream_hwdep_mmap(struct snd_hwdep *hw, } area->vm_ops = &usb_stream_hwdep_vm_ops; - area->vm_flags |= VM_RESERVED; + area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; area->vm_private_data = us122l; atomic_inc(&us122l->mmap_count); out: diff --git a/sound/usb/usx2y/usX2Yhwdep.c b/sound/usb/usx2y/usX2Yhwdep.c index 04aafb43a13c..0b34dbc8f302 100644 --- a/sound/usb/usx2y/usX2Yhwdep.c +++ b/sound/usb/usx2y/usX2Yhwdep.c @@ -82,7 +82,7 @@ static int snd_us428ctls_mmap(struct snd_hwdep * hw, struct file *filp, struct v us428->us428ctls_sharedmem->CtlSnapShotLast = -2; } area->vm_ops = &us428ctls_vm_ops; - area->vm_flags |= VM_RESERVED | VM_DONTEXPAND; + area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; area->vm_private_data = hw->private_data; return 0; } diff --git a/sound/usb/usx2y/usx2yhwdeppcm.c b/sound/usb/usx2y/usx2yhwdeppcm.c index 8e40b6e67e9e..cc56007791e0 100644 --- a/sound/usb/usx2y/usx2yhwdeppcm.c +++ b/sound/usb/usx2y/usx2yhwdeppcm.c @@ -723,7 +723,7 @@ static int snd_usX2Y_hwdep_pcm_mmap(struct snd_hwdep * hw, struct file *filp, st return -ENODEV; } area->vm_ops = &snd_usX2Y_hwdep_pcm_vm_ops; - area->vm_flags |= VM_RESERVED | VM_DONTEXPAND; + area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; area->vm_private_data = hw->private_data; return 0; } -- cgit v1.2.3-59-g8ed1b From 01dc52ebdf472f77cca623ca693ca24cfc0f1bbe Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Oct 2012 16:29:30 -0700 Subject: oom: remove deprecated oom_adj The deprecated /proc//oom_adj is scheduled for removal this month. Signed-off-by: Davidlohr Bueso Acked-by: David Rientjes Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/obsolete/proc-pid-oom_adj | 22 ------ Documentation/filesystems/proc.txt | 22 +----- fs/proc/base.c | 117 +--------------------------- include/linux/oom.h | 11 --- include/linux/sched.h | 1 - kernel/fork.c | 1 - mm/oom_kill.c | 4 +- 7 files changed, 7 insertions(+), 171 deletions(-) delete mode 100644 Documentation/ABI/obsolete/proc-pid-oom_adj (limited to 'fs') diff --git a/Documentation/ABI/obsolete/proc-pid-oom_adj b/Documentation/ABI/obsolete/proc-pid-oom_adj deleted file mode 100644 index 9a3cb88ade47..000000000000 --- a/Documentation/ABI/obsolete/proc-pid-oom_adj +++ /dev/null @@ -1,22 +0,0 @@ -What: /proc//oom_adj -When: August 2012 -Why: /proc//oom_adj allows userspace to influence the oom killer's - badness heuristic used to determine which task to kill when the kernel - is out of memory. - - The badness heuristic has since been rewritten since the introduction of - this tunable such that its meaning is deprecated. The value was - implemented as a bitshift on a score generated by the badness() - function that did not have any precise units of measure. With the - rewrite, the score is given as a proportion of available memory to the - task allocating pages, so using a bitshift which grows the score - exponentially is, thus, impossible to tune with fine granularity. - - A much more powerful interface, /proc//oom_score_adj, was - introduced with the oom killer rewrite that allows users to increase or - decrease the badness score linearly. This interface will replace - /proc//oom_adj. - - A warning will be emitted to the kernel log if an application uses this - deprecated interface. After it is printed once, future warnings will be - suppressed until the kernel is rebooted. diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index fb0a6aeb936c..a1793d670cd0 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -33,7 +33,7 @@ Table of Contents 2 Modifying System Parameters 3 Per-Process Parameters - 3.1 /proc//oom_adj & /proc//oom_score_adj - Adjust the oom-killer + 3.1 /proc//oom_score_adj - Adjust the oom-killer score 3.2 /proc//oom_score - Display current oom-killer score 3.3 /proc//io - Display the IO accounting fields @@ -1320,10 +1320,10 @@ of the kernel. CHAPTER 3: PER-PROCESS PARAMETERS ------------------------------------------------------------------------------ -3.1 /proc//oom_adj & /proc//oom_score_adj- Adjust the oom-killer score +3.1 /proc//oom_score_adj- Adjust the oom-killer score -------------------------------------------------------------------------------- -These file can be used to adjust the badness heuristic used to select which +This file can be used to adjust the badness heuristic used to select which process gets killed in out of memory conditions. The badness heuristic assigns a value to each candidate task ranging from 0 @@ -1361,22 +1361,10 @@ same system, cpuset, mempolicy, or memory controller resources to use at least equivalent to discounting 50% of the task's allowed memory from being considered as scoring against the task. -For backwards compatibility with previous kernels, /proc//oom_adj may also -be used to tune the badness score. Its acceptable values range from -16 -(OOM_ADJUST_MIN) to +15 (OOM_ADJUST_MAX) and a special value of -17 -(OOM_DISABLE) to disable oom killing entirely for that task. Its value is -scaled linearly with /proc//oom_score_adj. - -Writing to /proc//oom_score_adj or /proc//oom_adj will change the -other with its scaled value. - The value of /proc//oom_score_adj may be reduced no lower than the last value set by a CAP_SYS_RESOURCE process. To reduce the value any lower requires CAP_SYS_RESOURCE. -NOTICE: /proc//oom_adj is deprecated and will be removed, please see -Documentation/feature-removal-schedule.txt. - Caveat: when a parent task is selected, the oom killer will sacrifice any first generation children with separate address spaces instead, if possible. This avoids servers and important system daemons from being killed and loses the @@ -1387,9 +1375,7 @@ minimal amount of work. ------------------------------------------------------------- This file can be used to check the current score used by the oom-killer is for -any given . Use it together with /proc//oom_adj to tune which -process should be killed in an out-of-memory situation. - +any given . 3.3 /proc//io - Display the IO accounting fields ------------------------------------------------------- diff --git a/fs/proc/base.c b/fs/proc/base.c index d295af993677..ef5c84be66f9 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -873,111 +873,6 @@ static const struct file_operations proc_environ_operations = { .release = mem_release, }; -static ssize_t oom_adjust_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); - char buffer[PROC_NUMBUF]; - size_t len; - int oom_adjust = OOM_DISABLE; - unsigned long flags; - - if (!task) - return -ESRCH; - - if (lock_task_sighand(task, &flags)) { - oom_adjust = task->signal->oom_adj; - unlock_task_sighand(task, &flags); - } - - put_task_struct(task); - - len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); - - return simple_read_from_buffer(buf, count, ppos, buffer, len); -} - -static ssize_t oom_adjust_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct task_struct *task; - char buffer[PROC_NUMBUF]; - int oom_adjust; - unsigned long flags; - int err; - - memset(buffer, 0, sizeof(buffer)); - if (count > sizeof(buffer) - 1) - count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) { - err = -EFAULT; - goto out; - } - - err = kstrtoint(strstrip(buffer), 0, &oom_adjust); - if (err) - goto out; - if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && - oom_adjust != OOM_DISABLE) { - err = -EINVAL; - goto out; - } - - task = get_proc_task(file->f_path.dentry->d_inode); - if (!task) { - err = -ESRCH; - goto out; - } - - task_lock(task); - if (!task->mm) { - err = -EINVAL; - goto err_task_lock; - } - - if (!lock_task_sighand(task, &flags)) { - err = -ESRCH; - goto err_task_lock; - } - - if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { - err = -EACCES; - goto err_sighand; - } - - /* - * Warn that /proc/pid/oom_adj is deprecated, see - * Documentation/feature-removal-schedule.txt. - */ - printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", - current->comm, task_pid_nr(current), task_pid_nr(task), - task_pid_nr(task)); - task->signal->oom_adj = oom_adjust; - /* - * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum - * value is always attainable. - */ - if (task->signal->oom_adj == OOM_ADJUST_MAX) - task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX; - else - task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / - -OOM_DISABLE; - trace_oom_score_adj_update(task); -err_sighand: - unlock_task_sighand(task, &flags); -err_task_lock: - task_unlock(task); - put_task_struct(task); -out: - return err < 0 ? err : count; -} - -static const struct file_operations proc_oom_adjust_operations = { - .read = oom_adjust_read, - .write = oom_adjust_write, - .llseek = generic_file_llseek, -}; - static ssize_t oom_score_adj_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -1051,15 +946,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = oom_score_adj; trace_oom_score_adj_update(task); - /* - * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is - * always attainable. - */ - if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - task->signal->oom_adj = OOM_DISABLE; - else - task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / - OOM_SCORE_ADJ_MAX; + err_sighand: unlock_task_sighand(task, &flags); err_task_lock: @@ -2710,7 +2597,6 @@ static const struct pid_entry tgid_base_stuff[] = { REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif INF("oom_score", S_IRUGO, proc_oom_score), - REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), @@ -3077,7 +2963,6 @@ static const struct pid_entry tid_base_stuff[] = { REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif INF("oom_score", S_IRUGO, proc_oom_score), - REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), diff --git a/include/linux/oom.h b/include/linux/oom.h index 49a3031fda50..d36a8221f58b 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -1,17 +1,6 @@ #ifndef __INCLUDE_LINUX_OOM_H #define __INCLUDE_LINUX_OOM_H -/* - * /proc//oom_adj is deprecated, see - * Documentation/feature-removal-schedule.txt. - * - * /proc//oom_adj set to -17 protects from the oom-killer - */ -#define OOM_DISABLE (-17) -/* inclusive */ -#define OOM_ADJUST_MIN (-16) -#define OOM_ADJUST_MAX 15 - /* * /proc//oom_score_adj set to OOM_SCORE_ADJ_MIN disables oom killing for * pid. diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c5612f0374b..c2070e92a9d6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -671,7 +671,6 @@ struct signal_struct { struct rw_semaphore group_rwsem; #endif - int oom_adj; /* OOM kill score adjustment (bit shift) */ int oom_score_adj; /* OOM kill score adjustment */ int oom_score_adj_min; /* OOM kill score adjustment minimum value. * Only settable by CAP_SYS_RESOURCE. */ diff --git a/kernel/fork.c b/kernel/fork.c index ec667f797af3..972762e01024 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1056,7 +1056,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) init_rwsem(&sig->group_rwsem); #endif - sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 198600861638..79e0f3e24831 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -428,8 +428,8 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, { task_lock(current); pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " - "oom_adj=%d, oom_score_adj=%d\n", - current->comm, gfp_mask, order, current->signal->oom_adj, + "oom_score_adj=%d\n", + current->comm, gfp_mask, order, current->signal->oom_score_adj); cpuset_print_task_mems_allowed(current); task_unlock(current); -- cgit v1.2.3-59-g8ed1b From 4c199a93a2d36b277a9fd209a0f2793f8460a215 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:32 -0700 Subject: rbtree: empty nodes have no color Empty nodes have no color. We can make use of this property to simplify the code emitted by the RB_EMPTY_NODE and RB_CLEAR_NODE macros. Also, we can get rid of the rb_init_node function which had been introduced by commit 88d19cf37952 ("timers: Add rb_init_node() to allow for stack allocated rb nodes") to avoid some issue with the empty node's color not being initialized. I'm not sure what the RB_EMPTY_NODE checks in rb_prev() / rb_next() are doing there, though. axboe introduced them in commit 10fd48f2376d ("rbtree: fixed reversed RB_EMPTY_NODE and rb_next/prev"). The way I see it, the 'empty node' abstraction is only used by rbtree users to flag nodes that they haven't inserted in any rbtree, so asking the predecessor or successor of such nodes doesn't make any sense. One final rb_init_node() caller was recently added in sysctl code to implement faster sysctl name lookups. This code doesn't make use of RB_EMPTY_NODE at all, and from what I could see it only called rb_init_node() under the mistaken assumption that such initialization was required before node insertion. [sfr@canb.auug.org.au: fix net/ceph/osd_client.c build] Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. Biederman" Cc: John Stultz Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 4 +--- include/linux/rbtree.h | 15 +++++---------- include/linux/timerqueue.h | 2 +- lib/rbtree.c | 4 ++-- net/ceph/osd_client.c | 1 - 5 files changed, 9 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index dcd56f84db7e..fddc50729632 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -168,10 +168,8 @@ static void init_header(struct ctl_table_header *head, head->node = node; if (node) { struct ctl_table *entry; - for (entry = table; entry->procname; entry++, node++) { - rb_init_node(&node->node); + for (entry = table; entry->procname; entry++, node++) node->header = head; - } } } diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index e6a807720ded..2049087c43b7 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -67,17 +67,12 @@ static inline void rb_set_color(struct rb_node *rb, int color) #define RB_ROOT (struct rb_root) { NULL, } #define rb_entry(ptr, type, member) container_of(ptr, type, member) -#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) -#define RB_EMPTY_NODE(node) (rb_parent(node) == node) -#define RB_CLEAR_NODE(node) (rb_set_parent(node, node)) +#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) + +/* 'empty' nodes are nodes that are known not to be inserted in an rbree */ +#define RB_EMPTY_NODE(node) ((node)->rb_parent_color == (unsigned long)(node)) +#define RB_CLEAR_NODE(node) ((node)->rb_parent_color = (unsigned long)(node)) -static inline void rb_init_node(struct rb_node *rb) -{ - rb->rb_parent_color = 0; - rb->rb_right = NULL; - rb->rb_left = NULL; - RB_CLEAR_NODE(rb); -} extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h index 5088727478fd..a520fd70a59f 100644 --- a/include/linux/timerqueue.h +++ b/include/linux/timerqueue.h @@ -39,7 +39,7 @@ struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) static inline void timerqueue_init(struct timerqueue_node *node) { - rb_init_node(&node->node); + RB_CLEAR_NODE(&node->node); } static inline void timerqueue_init_head(struct timerqueue_head *head) diff --git a/lib/rbtree.c b/lib/rbtree.c index d4175565dc2c..fe43c8c5f527 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -387,7 +387,7 @@ struct rb_node *rb_next(const struct rb_node *node) { struct rb_node *parent; - if (rb_parent(node) == node) + if (RB_EMPTY_NODE(node)) return NULL; /* If we have a right-hand child, go down and then left as far @@ -416,7 +416,7 @@ struct rb_node *rb_prev(const struct rb_node *node) { struct rb_node *parent; - if (rb_parent(node) == node) + if (RB_EMPTY_NODE(node)) return NULL; /* If we have a left-hand child, go down and then right as far diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ccbdfbba9e53..c1d756cc7448 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -221,7 +221,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, kref_init(&req->r_kref); init_completion(&req->r_completion); init_completion(&req->r_safe_completion); - rb_init_node(&req->r_node); INIT_LIST_HEAD(&req->r_unsafe_item); INIT_LIST_HEAD(&req->r_linger_item); INIT_LIST_HEAD(&req->r_linger_osd); -- cgit v1.2.3-59-g8ed1b From ea5272f5c94fb2ee62f4f15a5b88eef6184cd506 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:35 -0700 Subject: rbtree: fix incorrect rbtree node insertion in fs/proc/proc_sysctl.c The recently added code to use rbtrees in sysctl did not follow the proper rbtree interface on insertion - it was calling rb_link_node() which inserts a new node into the binary tree, but missed the call to rb_insert_color() which properly balances the rbtree and establishes all expected rbtree invariants. I found out about this only because faulty commit also used rb_init_node(), which I am removing within this patchset. But I think it's an easy mistake to make, and it makes me wonder if we should change the rbtree API so that insertions would be done with a single rb_insert() call (even if its implementation could still inline the rb_link_node() part and call a private __rb_insert_color function to do the rebalancing). Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index fddc50729632..a781bdf06694 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -142,6 +142,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) } rb_link_node(node, parent, p); + rb_insert_color(node, &head->parent->root); return 0; } -- cgit v1.2.3-59-g8ed1b From bf7ad8eeab995710c766df49c9c69a8592ca0216 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:37 -0700 Subject: rbtree: move some implementation details from rbtree.h to rbtree.c rbtree users must use the documented APIs to manipulate the tree structure. Low-level helpers to manipulate node colors and parenthood are not part of that API, so move them to lib/rbtree.c [dwmw2@infradead.org: fix jffs2 build issue due to renamed __rb_parent_color field] Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. Biederman" Signed-off-by: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jffs2/readinode.c | 13 ++++++++----- include/linux/rbtree.h | 34 +++++++++------------------------- lib/rbtree.c | 20 +++++++++++++++++++- 3 files changed, 36 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 1ea349fff68b..ae81b01e6fd7 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -394,8 +394,11 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c, } /* Trivial function to remove the last node in the tree. Which by definition - has no right-hand -- so can be removed just by making its only child (if - any) take its place under its parent. */ + has no right-hand child — so can be removed just by making its left-hand + child (if any) take its place under its parent. Since this is only done + when we're consuming the whole tree, there's no need to use rb_erase() + and let it worry about adjusting colours and balancing the tree. That + would just be a waste of time. */ static void eat_last(struct rb_root *root, struct rb_node *node) { struct rb_node *parent = rb_parent(node); @@ -412,12 +415,12 @@ static void eat_last(struct rb_root *root, struct rb_node *node) link = &parent->rb_right; *link = node->rb_left; - /* Colour doesn't matter now. Only the parent pointer. */ if (node->rb_left) - node->rb_left->rb_parent_color = node->rb_parent_color; + node->rb_left->__rb_parent_color = node->__rb_parent_color; } -/* We put this in reverse order, so we can just use eat_last */ +/* We put the version tree in reverse order, so we can use the same eat_last() + function that we use to consume the tmpnode tree (tn_root). */ static void ver_insert(struct rb_root *ver_root, struct jffs2_tmp_dnode_info *tn) { struct rb_node **link = &ver_root->rb_node; diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 2049087c43b7..bf836a2c6a32 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -32,37 +32,19 @@ #include #include -struct rb_node -{ - unsigned long rb_parent_color; -#define RB_RED 0 -#define RB_BLACK 1 +struct rb_node { + unsigned long __rb_parent_color; struct rb_node *rb_right; struct rb_node *rb_left; } __attribute__((aligned(sizeof(long)))); /* The alignment might seem pointless, but allegedly CRIS needs it */ -struct rb_root -{ +struct rb_root { struct rb_node *rb_node; }; -#define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~3)) -#define rb_color(r) ((r)->rb_parent_color & 1) -#define rb_is_red(r) (!rb_color(r)) -#define rb_is_black(r) rb_color(r) -#define rb_set_red(r) do { (r)->rb_parent_color &= ~1; } while (0) -#define rb_set_black(r) do { (r)->rb_parent_color |= 1; } while (0) - -static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) -{ - rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p; -} -static inline void rb_set_color(struct rb_node *rb, int color) -{ - rb->rb_parent_color = (rb->rb_parent_color & ~1) | color; -} +#define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define RB_ROOT (struct rb_root) { NULL, } #define rb_entry(ptr, type, member) container_of(ptr, type, member) @@ -70,8 +52,10 @@ static inline void rb_set_color(struct rb_node *rb, int color) #define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) /* 'empty' nodes are nodes that are known not to be inserted in an rbree */ -#define RB_EMPTY_NODE(node) ((node)->rb_parent_color == (unsigned long)(node)) -#define RB_CLEAR_NODE(node) ((node)->rb_parent_color = (unsigned long)(node)) +#define RB_EMPTY_NODE(node) \ + ((node)->__rb_parent_color == (unsigned long)(node)) +#define RB_CLEAR_NODE(node) \ + ((node)->__rb_parent_color = (unsigned long)(node)) extern void rb_insert_color(struct rb_node *, struct rb_root *); @@ -98,7 +82,7 @@ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, struct rb_node ** rb_link) { - node->rb_parent_color = (unsigned long )parent; + node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; *rb_link = node; diff --git a/lib/rbtree.c b/lib/rbtree.c index fe43c8c5f527..ccada9abe6f5 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -23,6 +23,24 @@ #include #include +#define RB_RED 0 +#define RB_BLACK 1 + +#define rb_color(r) ((r)->__rb_parent_color & 1) +#define rb_is_red(r) (!rb_color(r)) +#define rb_is_black(r) rb_color(r) +#define rb_set_red(r) do { (r)->__rb_parent_color &= ~1; } while (0) +#define rb_set_black(r) do { (r)->__rb_parent_color |= 1; } while (0) + +static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) +{ + rb->__rb_parent_color = rb_color(rb) | (unsigned long)p; +} +static inline void rb_set_color(struct rb_node *rb, int color) +{ + rb->__rb_parent_color = (rb->__rb_parent_color & ~1) | color; +} + static void __rb_rotate_left(struct rb_node *node, struct rb_root *root) { struct rb_node *right = node->rb_right; @@ -255,7 +273,7 @@ void rb_erase(struct rb_node *node, struct rb_root *root) rb_set_parent(old->rb_right, node); } - node->rb_parent_color = old->rb_parent_color; + node->__rb_parent_color = old->__rb_parent_color; node->rb_left = old->rb_left; rb_set_parent(old->rb_left, node); -- cgit v1.2.3-59-g8ed1b From 6b2dbba8b6ac4df26f72eda1e5ea7bab9f950e08 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:25 -0700 Subject: mm: replace vma prio_tree with an interval tree Implement an interval tree as a replacement for the VMA prio_tree. The algorithms are similar to lib/interval_tree.c; however that code can't be directly reused as the interval endpoints are not explicitly stored in the VMA. So instead, the common algorithm is moved into a template and the details (node type, how to get interval endpoints from the node, etc) are filled in using the C preprocessor. Once the interval tree functions are available, using them as a replacement to the VMA prio tree is a relatively simple, mechanical job. Signed-off-by: Michel Lespinasse Cc: Rik van Riel Cc: Hillf Danton Cc: Peter Zijlstra Cc: Catalin Marinas Cc: Andrea Arcangeli Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/fault-armv.c | 3 +- arch/arm/mm/flush.c | 3 +- arch/parisc/kernel/cache.c | 3 +- arch/x86/mm/hugetlbpage.c | 3 +- fs/hugetlbfs/inode.c | 9 +- fs/inode.c | 2 +- include/linux/fs.h | 6 +- include/linux/interval_tree_tmpl.h | 215 +++++++++++++++++++++++++++++++++++++ include/linux/mm.h | 30 +++--- include/linux/mm_types.h | 14 +-- kernel/events/uprobes.c | 3 +- kernel/fork.c | 2 +- lib/interval_tree.c | 166 ++-------------------------- lib/prio_tree.c | 19 +--- mm/Makefile | 4 +- mm/filemap_xip.c | 3 +- mm/fremap.c | 2 +- mm/hugetlb.c | 3 +- mm/interval_tree.c | 61 +++++++++++ mm/memory-failure.c | 3 +- mm/memory.c | 9 +- mm/mmap.c | 22 ++-- mm/nommu.c | 12 +-- mm/prio_tree.c | 208 ----------------------------------- mm/rmap.c | 18 ++-- 25 files changed, 357 insertions(+), 466 deletions(-) create mode 100644 include/linux/interval_tree_tmpl.h create mode 100644 mm/interval_tree.c delete mode 100644 mm/prio_tree.c (limited to 'fs') diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index 7599e2625c7d..2a5907b5c8d2 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -134,7 +134,6 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *mpnt; - struct prio_tree_iter iter; unsigned long offset; pgoff_t pgoff; int aliases = 0; @@ -147,7 +146,7 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma, * cache coherency. */ flush_dcache_mmap_lock(mapping); - vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) { /* * If this VMA is not in our MM, we can ignore it. * Note that we intentionally mask out the VMA diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 40ca11ed6e5f..1c8f7f564175 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -196,7 +196,6 @@ static void __flush_dcache_aliases(struct address_space *mapping, struct page *p { struct mm_struct *mm = current->active_mm; struct vm_area_struct *mpnt; - struct prio_tree_iter iter; pgoff_t pgoff; /* @@ -208,7 +207,7 @@ static void __flush_dcache_aliases(struct address_space *mapping, struct page *p pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); flush_dcache_mmap_lock(mapping); - vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) { unsigned long offset; /* diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 9d181890a7e3..48e16dc20102 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -276,7 +276,6 @@ void flush_dcache_page(struct page *page) { struct address_space *mapping = page_mapping(page); struct vm_area_struct *mpnt; - struct prio_tree_iter iter; unsigned long offset; unsigned long addr, old_addr = 0; pgoff_t pgoff; @@ -299,7 +298,7 @@ void flush_dcache_page(struct page *page) * to flush one address here for them all to become coherent */ flush_dcache_mmap_lock(mapping); - vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) { offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT; addr = mpnt->vm_start + offset; diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index b91e48512425..937bff5cdaa7 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -71,7 +71,6 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - struct prio_tree_iter iter; struct vm_area_struct *svma; unsigned long saddr; pte_t *spte = NULL; @@ -81,7 +80,7 @@ huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) return (pte_t *)pmd_alloc(mm, pud, addr); mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { + vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 0a0ab8e21b19..c5bc355d8243 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -397,17 +397,16 @@ static void hugetlbfs_evict_inode(struct inode *inode) } static inline void -hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff) +hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff) { struct vm_area_struct *vma; - struct prio_tree_iter iter; - vma_prio_tree_foreach(vma, &iter, root, pgoff, ULONG_MAX) { + vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) { unsigned long v_offset; /* * Can the expression below overflow on 32-bit arches? - * No, because the prio_tree returns us only those vmas + * No, because the interval tree returns us only those vmas * which overlap the truncated area starting at pgoff, * and no vma on a 32-bit arch can span beyond the 4GB. */ @@ -432,7 +431,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) i_size_write(inode, offset); mutex_lock(&mapping->i_mmap_mutex); - if (!prio_tree_empty(&mapping->i_mmap)) + if (!RB_EMPTY_ROOT(&mapping->i_mmap)) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); mutex_unlock(&mapping->i_mmap_mutex); truncate_hugepages(inode, offset); diff --git a/fs/inode.c b/fs/inode.c index ac8d904b3f16..b03c71957246 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -348,7 +348,7 @@ void address_space_init_once(struct address_space *mapping) mutex_init(&mapping->i_mmap_mutex); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); - INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); + mapping->i_mmap = RB_ROOT; INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); } EXPORT_SYMBOL(address_space_init_once); diff --git a/include/linux/fs.h b/include/linux/fs.h index 5a8a273d5b2f..c617ed024df8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -401,7 +401,7 @@ struct inodes_stat_t { #include #include #include -#include +#include #include #include #include @@ -669,7 +669,7 @@ struct address_space { struct radix_tree_root page_tree; /* radix tree of all pages */ spinlock_t tree_lock; /* and lock protecting it */ unsigned int i_mmap_writable;/* count VM_SHARED mappings */ - struct prio_tree_root i_mmap; /* tree of private and shared mappings */ + struct rb_root i_mmap; /* tree of private and shared mappings */ struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ struct mutex i_mmap_mutex; /* protect tree, count, list */ /* Protected by tree_lock together with the radix tree */ @@ -741,7 +741,7 @@ int mapping_tagged(struct address_space *mapping, int tag); */ static inline int mapping_mapped(struct address_space *mapping) { - return !prio_tree_empty(&mapping->i_mmap) || + return !RB_EMPTY_ROOT(&mapping->i_mmap) || !list_empty(&mapping->i_mmap_nonlinear); } diff --git a/include/linux/interval_tree_tmpl.h b/include/linux/interval_tree_tmpl.h new file mode 100644 index 000000000000..c65deda31413 --- /dev/null +++ b/include/linux/interval_tree_tmpl.h @@ -0,0 +1,215 @@ +/* + Interval Trees + (C) 2012 Michel Lespinasse + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + include/linux/interval_tree_tmpl.h +*/ + +/* + * Template for implementing interval trees + * + * ITSTRUCT: struct type of the interval tree nodes + * ITRB: name of struct rb_node field within ITSTRUCT + * ITTYPE: type of the interval endpoints + * ITSUBTREE: name of ITTYPE field within ITSTRUCT holding last-in-subtree + * ITSTART(n): start endpoint of ITSTRUCT node n + * ITLAST(n): last endpoing of ITSTRUCT node n + * ITSTATIC: 'static' or empty + * ITPREFIX: prefix to use for the inline tree definitions + */ + +/* IT(name) -> ITPREFIX_name */ +#define _ITNAME(prefix, name) prefix ## _ ## name +#define ITNAME(prefix, name) _ITNAME(prefix, name) +#define IT(name) ITNAME(ITPREFIX, name) + +/* Callbacks for augmented rbtree insert and remove */ + +static inline ITTYPE IT(compute_subtree_last)(ITSTRUCT *node) +{ + ITTYPE max = ITLAST(node), subtree_last; + if (node->ITRB.rb_left) { + subtree_last = rb_entry(node->ITRB.rb_left, + ITSTRUCT, ITRB)->ITSUBTREE; + if (max < subtree_last) + max = subtree_last; + } + if (node->ITRB.rb_right) { + subtree_last = rb_entry(node->ITRB.rb_right, + ITSTRUCT, ITRB)->ITSUBTREE; + if (max < subtree_last) + max = subtree_last; + } + return max; +} + +static void IT(augment_propagate)(struct rb_node *rb, struct rb_node *stop) +{ + while (rb != stop) { + ITSTRUCT *node = rb_entry(rb, ITSTRUCT, ITRB); + ITTYPE subtree_last = IT(compute_subtree_last)(node); + if (node->ITSUBTREE == subtree_last) + break; + node->ITSUBTREE = subtree_last; + rb = rb_parent(&node->ITRB); + } +} + +static void IT(augment_copy)(struct rb_node *rb_old, struct rb_node *rb_new) +{ + ITSTRUCT *old = rb_entry(rb_old, ITSTRUCT, ITRB); + ITSTRUCT *new = rb_entry(rb_new, ITSTRUCT, ITRB); + + new->ITSUBTREE = old->ITSUBTREE; +} + +static void IT(augment_rotate)(struct rb_node *rb_old, struct rb_node *rb_new) +{ + ITSTRUCT *old = rb_entry(rb_old, ITSTRUCT, ITRB); + ITSTRUCT *new = rb_entry(rb_new, ITSTRUCT, ITRB); + + new->ITSUBTREE = old->ITSUBTREE; + old->ITSUBTREE = IT(compute_subtree_last)(old); +} + +static const struct rb_augment_callbacks IT(augment_callbacks) = { + IT(augment_propagate), IT(augment_copy), IT(augment_rotate) +}; + +/* Insert / remove interval nodes from the tree */ + +ITSTATIC void IT(insert)(ITSTRUCT *node, struct rb_root *root) +{ + struct rb_node **link = &root->rb_node, *rb_parent = NULL; + ITTYPE start = ITSTART(node), last = ITLAST(node); + ITSTRUCT *parent; + + while (*link) { + rb_parent = *link; + parent = rb_entry(rb_parent, ITSTRUCT, ITRB); + if (parent->ITSUBTREE < last) + parent->ITSUBTREE = last; + if (start < ITSTART(parent)) + link = &parent->ITRB.rb_left; + else + link = &parent->ITRB.rb_right; + } + + node->ITSUBTREE = last; + rb_link_node(&node->ITRB, rb_parent, link); + rb_insert_augmented(&node->ITRB, root, &IT(augment_callbacks)); +} + +ITSTATIC void IT(remove)(ITSTRUCT *node, struct rb_root *root) +{ + rb_erase_augmented(&node->ITRB, root, &IT(augment_callbacks)); +} + +/* + * Iterate over intervals intersecting [start;last] + * + * Note that a node's interval intersects [start;last] iff: + * Cond1: ITSTART(node) <= last + * and + * Cond2: start <= ITLAST(node) + */ + +static ITSTRUCT *IT(subtree_search)(ITSTRUCT *node, ITTYPE start, ITTYPE last) +{ + while (true) { + /* + * Loop invariant: start <= node->ITSUBTREE + * (Cond2 is satisfied by one of the subtree nodes) + */ + if (node->ITRB.rb_left) { + ITSTRUCT *left = rb_entry(node->ITRB.rb_left, + ITSTRUCT, ITRB); + if (start <= left->ITSUBTREE) { + /* + * Some nodes in left subtree satisfy Cond2. + * Iterate to find the leftmost such node N. + * If it also satisfies Cond1, that's the match + * we are looking for. Otherwise, there is no + * matching interval as nodes to the right of N + * can't satisfy Cond1 either. + */ + node = left; + continue; + } + } + if (ITSTART(node) <= last) { /* Cond1 */ + if (start <= ITLAST(node)) /* Cond2 */ + return node; /* node is leftmost match */ + if (node->ITRB.rb_right) { + node = rb_entry(node->ITRB.rb_right, + ITSTRUCT, ITRB); + if (start <= node->ITSUBTREE) + continue; + } + } + return NULL; /* No match */ + } +} + +ITSTATIC ITSTRUCT *IT(iter_first)(struct rb_root *root, + ITTYPE start, ITTYPE last) +{ + ITSTRUCT *node; + + if (!root->rb_node) + return NULL; + node = rb_entry(root->rb_node, ITSTRUCT, ITRB); + if (node->ITSUBTREE < start) + return NULL; + return IT(subtree_search)(node, start, last); +} + +ITSTATIC ITSTRUCT *IT(iter_next)(ITSTRUCT *node, ITTYPE start, ITTYPE last) +{ + struct rb_node *rb = node->ITRB.rb_right, *prev; + + while (true) { + /* + * Loop invariants: + * Cond1: ITSTART(node) <= last + * rb == node->ITRB.rb_right + * + * First, search right subtree if suitable + */ + if (rb) { + ITSTRUCT *right = rb_entry(rb, ITSTRUCT, ITRB); + if (start <= right->ITSUBTREE) + return IT(subtree_search)(right, start, last); + } + + /* Move up the tree until we come from a node's left child */ + do { + rb = rb_parent(&node->ITRB); + if (!rb) + return NULL; + prev = &node->ITRB; + node = rb_entry(rb, ITSTRUCT, ITRB); + rb = node->ITRB.rb_right; + } while (prev == rb); + + /* Check if the node intersects [start;last] */ + if (last < ITSTART(node)) /* !Cond1 */ + return NULL; + else if (start <= ITLAST(node)) /* Cond2 */ + return node; + } +} diff --git a/include/linux/mm.h b/include/linux/mm.h index 5ddb11b2b4bb..0f671ef09eba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -1355,22 +1354,27 @@ extern void zone_pcp_reset(struct zone *zone); extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); -/* prio_tree.c */ -void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); -void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *); -void vma_prio_tree_remove(struct vm_area_struct *, struct prio_tree_root *); -struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, - struct prio_tree_iter *iter); - -#define vma_prio_tree_foreach(vma, iter, root, begin, end) \ - for (prio_tree_iter_init(iter, root, begin, end), vma = NULL; \ - (vma = vma_prio_tree_next(vma, iter)); ) +/* interval_tree.c */ +void vma_interval_tree_add(struct vm_area_struct *vma, + struct vm_area_struct *old, + struct address_space *mapping); +void vma_interval_tree_insert(struct vm_area_struct *node, + struct rb_root *root); +void vma_interval_tree_remove(struct vm_area_struct *node, + struct rb_root *root); +struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root *root, + unsigned long start, unsigned long last); +struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, + unsigned long start, unsigned long last); + +#define vma_interval_tree_foreach(vma, root, start, last) \ + for (vma = vma_interval_tree_iter_first(root, start, last); \ + vma; vma = vma_interval_tree_iter_next(vma, start, last)) static inline void vma_nonlinear_insert(struct vm_area_struct *vma, struct list_head *list) { - vma->shared.vm_set.parent = NULL; - list_add_tail(&vma->shared.vm_set.list, list); + list_add_tail(&vma->shared.nonlinear, list); } /* mmap.c */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a57a43f5ca7c..31f8a3af7d94 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -240,18 +239,15 @@ struct vm_area_struct { /* * For areas with an address space and backing store, - * linkage into the address_space->i_mmap prio tree, or - * linkage to the list of like vmas hanging off its node, or + * linkage into the address_space->i_mmap interval tree, or * linkage of vma in the address_space->i_mmap_nonlinear list. */ union { struct { - struct list_head list; - void *parent; /* aligns with prio_tree_node parent */ - struct vm_area_struct *head; - } vm_set; - - struct raw_prio_tree_node prio_tree_node; + struct rb_node rb; + unsigned long rb_subtree_last; + } linear; + struct list_head nonlinear; } shared; /* diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 912ef48d28ab..1d9c0a985960 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -735,7 +735,6 @@ static struct map_info * build_map_info(struct address_space *mapping, loff_t offset, bool is_register) { unsigned long pgoff = offset >> PAGE_SHIFT; - struct prio_tree_iter iter; struct vm_area_struct *vma; struct map_info *curr = NULL; struct map_info *prev = NULL; @@ -744,7 +743,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) again: mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; diff --git a/kernel/fork.c b/kernel/fork.c index 972762e01024..90dace52715e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -423,7 +423,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mapping->i_mmap_writable++; flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ - vma_prio_tree_add(tmp, mpnt); + vma_interval_tree_add(tmp, mpnt, mapping); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } diff --git a/lib/interval_tree.c b/lib/interval_tree.c index 6fd540b1e499..77a793e0644b 100644 --- a/lib/interval_tree.c +++ b/lib/interval_tree.c @@ -1,159 +1,13 @@ #include #include -/* Callbacks for augmented rbtree insert and remove */ - -static inline unsigned long -compute_subtree_last(struct interval_tree_node *node) -{ - unsigned long max = node->last, subtree_last; - if (node->rb.rb_left) { - subtree_last = rb_entry(node->rb.rb_left, - struct interval_tree_node, rb)->__subtree_last; - if (max < subtree_last) - max = subtree_last; - } - if (node->rb.rb_right) { - subtree_last = rb_entry(node->rb.rb_right, - struct interval_tree_node, rb)->__subtree_last; - if (max < subtree_last) - max = subtree_last; - } - return max; -} - -RB_DECLARE_CALLBACKS(static, augment_callbacks, struct interval_tree_node, rb, - unsigned long, __subtree_last, compute_subtree_last) - -/* Insert / remove interval nodes from the tree */ - -void interval_tree_insert(struct interval_tree_node *node, - struct rb_root *root) -{ - struct rb_node **link = &root->rb_node, *rb_parent = NULL; - unsigned long start = node->start, last = node->last; - struct interval_tree_node *parent; - - while (*link) { - rb_parent = *link; - parent = rb_entry(rb_parent, struct interval_tree_node, rb); - if (parent->__subtree_last < last) - parent->__subtree_last = last; - if (start < parent->start) - link = &parent->rb.rb_left; - else - link = &parent->rb.rb_right; - } - - node->__subtree_last = last; - rb_link_node(&node->rb, rb_parent, link); - rb_insert_augmented(&node->rb, root, &augment_callbacks); -} - -void interval_tree_remove(struct interval_tree_node *node, - struct rb_root *root) -{ - rb_erase_augmented(&node->rb, root, &augment_callbacks); -} - -/* - * Iterate over intervals intersecting [start;last] - * - * Note that a node's interval intersects [start;last] iff: - * Cond1: node->start <= last - * and - * Cond2: start <= node->last - */ - -static struct interval_tree_node * -subtree_search(struct interval_tree_node *node, - unsigned long start, unsigned long last) -{ - while (true) { - /* - * Loop invariant: start <= node->__subtree_last - * (Cond2 is satisfied by one of the subtree nodes) - */ - if (node->rb.rb_left) { - struct interval_tree_node *left = - rb_entry(node->rb.rb_left, - struct interval_tree_node, rb); - if (start <= left->__subtree_last) { - /* - * Some nodes in left subtree satisfy Cond2. - * Iterate to find the leftmost such node N. - * If it also satisfies Cond1, that's the match - * we are looking for. Otherwise, there is no - * matching interval as nodes to the right of N - * can't satisfy Cond1 either. - */ - node = left; - continue; - } - } - if (node->start <= last) { /* Cond1 */ - if (start <= node->last) /* Cond2 */ - return node; /* node is leftmost match */ - if (node->rb.rb_right) { - node = rb_entry(node->rb.rb_right, - struct interval_tree_node, rb); - if (start <= node->__subtree_last) - continue; - } - } - return NULL; /* No match */ - } -} - -struct interval_tree_node * -interval_tree_iter_first(struct rb_root *root, - unsigned long start, unsigned long last) -{ - struct interval_tree_node *node; - - if (!root->rb_node) - return NULL; - node = rb_entry(root->rb_node, struct interval_tree_node, rb); - if (node->__subtree_last < start) - return NULL; - return subtree_search(node, start, last); -} - -struct interval_tree_node * -interval_tree_iter_next(struct interval_tree_node *node, - unsigned long start, unsigned long last) -{ - struct rb_node *rb = node->rb.rb_right, *prev; - - while (true) { - /* - * Loop invariants: - * Cond1: node->start <= last - * rb == node->rb.rb_right - * - * First, search right subtree if suitable - */ - if (rb) { - struct interval_tree_node *right = - rb_entry(rb, struct interval_tree_node, rb); - if (start <= right->__subtree_last) - return subtree_search(right, start, last); - } - - /* Move up the tree until we come from a node's left child */ - do { - rb = rb_parent(&node->rb); - if (!rb) - return NULL; - prev = &node->rb; - node = rb_entry(rb, struct interval_tree_node, rb); - rb = node->rb.rb_right; - } while (prev == rb); - - /* Check if the node intersects [start;last] */ - if (last < node->start) /* !Cond1 */ - return NULL; - else if (start <= node->last) /* Cond2 */ - return node; - } -} +#define ITSTRUCT struct interval_tree_node +#define ITRB rb +#define ITTYPE unsigned long +#define ITSUBTREE __subtree_last +#define ITSTART(n) ((n)->start) +#define ITLAST(n) ((n)->last) +#define ITSTATIC +#define ITPREFIX interval_tree + +#include diff --git a/lib/prio_tree.c b/lib/prio_tree.c index 4e0d2edff2b4..bba37148c15e 100644 --- a/lib/prio_tree.c +++ b/lib/prio_tree.c @@ -44,27 +44,12 @@ * The following macros are used for implementing prio_tree for i_mmap */ -#define RADIX_INDEX(vma) ((vma)->vm_pgoff) -#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT) -/* avoid overflow */ -#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1)) - - static void get_index(const struct prio_tree_root *root, const struct prio_tree_node *node, unsigned long *radix, unsigned long *heap) { - if (root->raw) { - struct vm_area_struct *vma = prio_tree_entry( - node, struct vm_area_struct, shared.prio_tree_node); - - *radix = RADIX_INDEX(vma); - *heap = HEAP_INDEX(vma); - } - else { - *radix = node->start; - *heap = node->last; - } + *radix = node->start; + *heap = node->last; } static unsigned long index_bits_to_maxindex[BITS_PER_LONG]; diff --git a/mm/Makefile b/mm/Makefile index 92753e2d82da..6b025f80af34 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -14,9 +14,9 @@ endif obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ - prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ + util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ - compaction.o $(mmu-y) + compaction.o interval_tree.o $(mmu-y) obj-y += init-mm.o diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 91750227a191..a52daee11d3f 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -167,7 +167,6 @@ __xip_unmap (struct address_space * mapping, { struct vm_area_struct *vma; struct mm_struct *mm; - struct prio_tree_iter iter; unsigned long address; pte_t *pte; pte_t pteval; @@ -184,7 +183,7 @@ __xip_unmap (struct address_space * mapping, retry: mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { mm = vma->vm_mm; address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); diff --git a/mm/fremap.c b/mm/fremap.c index 3d731a498788..3899a86851ce 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -214,7 +214,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); vma->vm_flags |= VM_NONLINEAR; - vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f1bb534254f6..c9b40e3a9936 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2474,7 +2474,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; struct address_space *mapping; - struct prio_tree_iter iter; pgoff_t pgoff; /* @@ -2491,7 +2490,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * __unmap_hugepage_range() is called as the lock is already held */ mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { /* Do not unmap the current VMA */ if (iter_vma == vma) continue; diff --git a/mm/interval_tree.c b/mm/interval_tree.c new file mode 100644 index 000000000000..7dc565660e56 --- /dev/null +++ b/mm/interval_tree.c @@ -0,0 +1,61 @@ +/* + * mm/interval_tree.c - interval tree for mapping->i_mmap + * + * Copyright (C) 2012, Michel Lespinasse + * + * This file is released under the GPL v2. + */ + +#include +#include + +#define ITSTRUCT struct vm_area_struct +#define ITRB shared.linear.rb +#define ITTYPE unsigned long +#define ITSUBTREE shared.linear.rb_subtree_last +#define ITSTART(n) ((n)->vm_pgoff) +#define ITLAST(n) ((n)->vm_pgoff + \ + (((n)->vm_end - (n)->vm_start) >> PAGE_SHIFT) - 1) +#define ITSTATIC +#define ITPREFIX vma_interval_tree + +#include + +/* Insert old immediately after vma in the interval tree */ +void vma_interval_tree_add(struct vm_area_struct *vma, + struct vm_area_struct *old, + struct address_space *mapping) +{ + struct rb_node **link; + struct vm_area_struct *parent; + unsigned long last; + + if (unlikely(vma->vm_flags & VM_NONLINEAR)) { + list_add(&vma->shared.nonlinear, &old->shared.nonlinear); + return; + } + + last = ITLAST(vma); + + if (!old->shared.linear.rb.rb_right) { + parent = old; + link = &old->shared.linear.rb.rb_right; + } else { + parent = rb_entry(old->shared.linear.rb.rb_right, + struct vm_area_struct, shared.linear.rb); + if (parent->shared.linear.rb_subtree_last < last) + parent->shared.linear.rb_subtree_last = last; + while (parent->shared.linear.rb.rb_left) { + parent = rb_entry(parent->shared.linear.rb.rb_left, + struct vm_area_struct, shared.linear.rb); + if (parent->shared.linear.rb_subtree_last < last) + parent->shared.linear.rb_subtree_last = last; + } + link = &parent->shared.linear.rb.rb_left; + } + + vma->shared.linear.rb_subtree_last = last; + rb_link_node(&vma->shared.linear.rb, &parent->shared.linear.rb, link); + rb_insert_augmented(&vma->shared.linear.rb, &mapping->i_mmap, + &vma_interval_tree_augment_callbacks); +} diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a6e2141a6610..c38a6257d082 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -431,7 +431,6 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, { struct vm_area_struct *vma; struct task_struct *tsk; - struct prio_tree_iter iter; struct address_space *mapping = page->mapping; mutex_lock(&mapping->i_mmap_mutex); @@ -442,7 +441,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, if (!task_early_kill(tsk)) continue; - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { /* * Send early kill signal to tasks where a vma covers diff --git a/mm/memory.c b/mm/memory.c index e09c04813186..d205e4381a34 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2801,14 +2801,13 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma, zap_page_range_single(vma, start_addr, end_addr - start_addr, details); } -static inline void unmap_mapping_range_tree(struct prio_tree_root *root, +static inline void unmap_mapping_range_tree(struct rb_root *root, struct zap_details *details) { struct vm_area_struct *vma; - struct prio_tree_iter iter; pgoff_t vba, vea, zba, zea; - vma_prio_tree_foreach(vma, &iter, root, + vma_interval_tree_foreach(vma, root, details->first_index, details->last_index) { vba = vma->vm_pgoff; @@ -2839,7 +2838,7 @@ static inline void unmap_mapping_range_list(struct list_head *head, * across *all* the pages in each nonlinear VMA, not just the pages * whose virtual address lies outside the file truncation point. */ - list_for_each_entry(vma, head, shared.vm_set.list) { + list_for_each_entry(vma, head, shared.nonlinear) { details->nonlinear_vma = vma; unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); } @@ -2883,7 +2882,7 @@ void unmap_mapping_range(struct address_space *mapping, mutex_lock(&mapping->i_mmap_mutex); - if (unlikely(!prio_tree_empty(&mapping->i_mmap))) + if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); diff --git a/mm/mmap.c b/mm/mmap.c index e3c365ff1b6a..5ac533f88e99 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -199,14 +199,14 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, flush_dcache_mmap_lock(mapping); if (unlikely(vma->vm_flags & VM_NONLINEAR)) - list_del_init(&vma->shared.vm_set.list); + list_del_init(&vma->shared.nonlinear); else - vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } /* - * Unlink a file-based vm structure from its prio_tree, to hide + * Unlink a file-based vm structure from its interval tree, to hide * vma from rmap and vmtruncate before freeing its page tables. */ void unlink_file_vma(struct vm_area_struct *vma) @@ -411,7 +411,7 @@ static void __vma_link_file(struct vm_area_struct *vma) if (unlikely(vma->vm_flags & VM_NONLINEAR)) vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); else - vma_prio_tree_insert(vma, &mapping->i_mmap); + vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } } @@ -449,7 +449,7 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, /* * Helper for vma_adjust() in the split_vma insert case: insert a vma into the - * mm's list and rbtree. It has already been inserted into the prio_tree. + * mm's list and rbtree. It has already been inserted into the interval tree. */ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { @@ -491,7 +491,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, struct vm_area_struct *next = vma->vm_next; struct vm_area_struct *importer = NULL; struct address_space *mapping = NULL; - struct prio_tree_root *root = NULL; + struct rb_root *root = NULL; struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; long adjust_next = 0; @@ -554,7 +554,7 @@ again: remove_next = 1 + (end > next->vm_end); mutex_lock(&mapping->i_mmap_mutex); if (insert) { /* - * Put into prio_tree now, so instantiated pages + * Put into interval tree now, so instantiated pages * are visible to arm/parisc __flush_dcache_page * throughout; but we cannot insert into address * space until vma start or end is updated. @@ -582,9 +582,9 @@ again: remove_next = 1 + (end > next->vm_end); if (root) { flush_dcache_mmap_lock(mapping); - vma_prio_tree_remove(vma, root); + vma_interval_tree_remove(vma, root); if (adjust_next) - vma_prio_tree_remove(next, root); + vma_interval_tree_remove(next, root); } vma->vm_start = start; @@ -597,8 +597,8 @@ again: remove_next = 1 + (end > next->vm_end); if (root) { if (adjust_next) - vma_prio_tree_insert(next, root); - vma_prio_tree_insert(vma, root); + vma_interval_tree_insert(next, root); + vma_interval_tree_insert(vma, root); flush_dcache_mmap_unlock(mapping); } diff --git a/mm/nommu.c b/mm/nommu.c index 12e84e69dd06..45131b41bcdb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -698,7 +698,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); - vma_prio_tree_insert(vma, &mapping->i_mmap); + vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } @@ -764,7 +764,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) mutex_lock(&mapping->i_mmap_mutex); flush_dcache_mmap_lock(mapping); - vma_prio_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); mutex_unlock(&mapping->i_mmap_mutex); } @@ -2044,7 +2044,6 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, size_t newsize) { struct vm_area_struct *vma; - struct prio_tree_iter iter; struct vm_region *region; pgoff_t low, high; size_t r_size, r_top; @@ -2056,8 +2055,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, mutex_lock(&inode->i_mapping->i_mmap_mutex); /* search for VMAs that fall within the dead zone */ - vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, - low, high) { + vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { /* found one - only interested if it's shared out of the page * cache */ if (vma->vm_flags & VM_SHARED) { @@ -2073,8 +2071,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, * we don't check for any regions that start beyond the EOF as there * shouldn't be any */ - vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, - 0, ULONG_MAX) { + vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, + 0, ULONG_MAX) { if (!(vma->vm_flags & VM_SHARED)) continue; diff --git a/mm/prio_tree.c b/mm/prio_tree.c deleted file mode 100644 index 799dcfd7cd8c..000000000000 --- a/mm/prio_tree.c +++ /dev/null @@ -1,208 +0,0 @@ -/* - * mm/prio_tree.c - priority search tree for mapping->i_mmap - * - * Copyright (C) 2004, Rajesh Venkatasubramanian - * - * This file is released under the GPL v2. - * - * Based on the radix priority search tree proposed by Edward M. McCreight - * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985 - * - * 02Feb2004 Initial version - */ - -#include -#include -#include - -/* - * See lib/prio_tree.c for details on the general radix priority search tree - * code. - */ - -/* - * The following #defines are mirrored from lib/prio_tree.c. They're only used - * for debugging, and should be removed (along with the debugging code using - * them) when switching also VMAs to the regular prio_tree code. - */ - -#define RADIX_INDEX(vma) ((vma)->vm_pgoff) -#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT) -/* avoid overflow */ -#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1)) - -/* - * Radix priority search tree for address_space->i_mmap - * - * For each vma that map a unique set of file pages i.e., unique [radix_index, - * heap_index] value, we have a corresponding priority search tree node. If - * multiple vmas have identical [radix_index, heap_index] value, then one of - * them is used as a tree node and others are stored in a vm_set list. The tree - * node points to the first vma (head) of the list using vm_set.head. - * - * prio_tree_root - * | - * A vm_set.head - * / \ / - * L R -> H-I-J-K-M-N-O-P-Q-S - * ^ ^ <-- vm_set.list --> - * tree nodes - * - * We need some way to identify whether a vma is a tree node, head of a vm_set - * list, or just a member of a vm_set list. We cannot use vm_flags to store - * such information. The reason is, in the above figure, it is possible that - * vm_flags' of R and H are covered by the different mmap_sems. When R is - * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold - * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now. - * That's why some trick involving shared.vm_set.parent is used for identifying - * tree nodes and list head nodes. - * - * vma radix priority search tree node rules: - * - * vma->shared.vm_set.parent != NULL ==> a tree node - * vma->shared.vm_set.head != NULL ==> list of others mapping same range - * vma->shared.vm_set.head == NULL ==> no others map the same range - * - * vma->shared.vm_set.parent == NULL - * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range - * vma->shared.vm_set.head == NULL ==> a list node - */ - -/* - * Add a new vma known to map the same set of pages as the old vma: - * useful for fork's dup_mmap as well as vma_prio_tree_insert below. - * Note that it just happens to work correctly on i_mmap_nonlinear too. - */ -void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old) -{ - /* Leave these BUG_ONs till prio_tree patch stabilizes */ - BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old)); - BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old)); - - vma->shared.vm_set.head = NULL; - vma->shared.vm_set.parent = NULL; - - if (!old->shared.vm_set.parent) - list_add(&vma->shared.vm_set.list, - &old->shared.vm_set.list); - else if (old->shared.vm_set.head) - list_add_tail(&vma->shared.vm_set.list, - &old->shared.vm_set.head->shared.vm_set.list); - else { - INIT_LIST_HEAD(&vma->shared.vm_set.list); - vma->shared.vm_set.head = old; - old->shared.vm_set.head = vma; - } -} - -void vma_prio_tree_insert(struct vm_area_struct *vma, - struct prio_tree_root *root) -{ - struct prio_tree_node *ptr; - struct vm_area_struct *old; - - vma->shared.vm_set.head = NULL; - - ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node); - if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) { - old = prio_tree_entry(ptr, struct vm_area_struct, - shared.prio_tree_node); - vma_prio_tree_add(vma, old); - } -} - -void vma_prio_tree_remove(struct vm_area_struct *vma, - struct prio_tree_root *root) -{ - struct vm_area_struct *node, *head, *new_head; - - if (!vma->shared.vm_set.head) { - if (!vma->shared.vm_set.parent) - list_del_init(&vma->shared.vm_set.list); - else - raw_prio_tree_remove(root, &vma->shared.prio_tree_node); - } else { - /* Leave this BUG_ON till prio_tree patch stabilizes */ - BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma); - if (vma->shared.vm_set.parent) { - head = vma->shared.vm_set.head; - if (!list_empty(&head->shared.vm_set.list)) { - new_head = list_entry( - head->shared.vm_set.list.next, - struct vm_area_struct, - shared.vm_set.list); - list_del_init(&head->shared.vm_set.list); - } else - new_head = NULL; - - raw_prio_tree_replace(root, &vma->shared.prio_tree_node, - &head->shared.prio_tree_node); - head->shared.vm_set.head = new_head; - if (new_head) - new_head->shared.vm_set.head = head; - - } else { - node = vma->shared.vm_set.head; - if (!list_empty(&vma->shared.vm_set.list)) { - new_head = list_entry( - vma->shared.vm_set.list.next, - struct vm_area_struct, - shared.vm_set.list); - list_del_init(&vma->shared.vm_set.list); - node->shared.vm_set.head = new_head; - new_head->shared.vm_set.head = node; - } else - node->shared.vm_set.head = NULL; - } - } -} - -/* - * Helper function to enumerate vmas that map a given file page or a set of - * contiguous file pages. The function returns vmas that at least map a single - * page in the given range of contiguous file pages. - */ -struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, - struct prio_tree_iter *iter) -{ - struct prio_tree_node *ptr; - struct vm_area_struct *next; - - if (!vma) { - /* - * First call is with NULL vma - */ - ptr = prio_tree_next(iter); - if (ptr) { - next = prio_tree_entry(ptr, struct vm_area_struct, - shared.prio_tree_node); - prefetch(next->shared.vm_set.head); - return next; - } else - return NULL; - } - - if (vma->shared.vm_set.parent) { - if (vma->shared.vm_set.head) { - next = vma->shared.vm_set.head; - prefetch(next->shared.vm_set.list.next); - return next; - } - } else { - next = list_entry(vma->shared.vm_set.list.next, - struct vm_area_struct, shared.vm_set.list); - if (!next->shared.vm_set.head) { - prefetch(next->shared.vm_set.list.next); - return next; - } - } - - ptr = prio_tree_next(iter); - if (ptr) { - next = prio_tree_entry(ptr, struct vm_area_struct, - shared.prio_tree_node); - prefetch(next->shared.vm_set.head); - return next; - } else - return NULL; -} diff --git a/mm/rmap.c b/mm/rmap.c index 0f3b7cda2a24..7b5b51d25fc5 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -820,7 +820,6 @@ static int page_referenced_file(struct page *page, struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int referenced = 0; /* @@ -846,7 +845,7 @@ static int page_referenced_file(struct page *page, */ mapcount = page_mapcount(page); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; @@ -945,13 +944,12 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = 0; BUG_ON(PageAnon(page)); mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (vma->vm_flags & VM_SHARED) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) @@ -1547,7 +1545,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = SWAP_AGAIN; unsigned long cursor; unsigned long max_nl_cursor = 0; @@ -1555,7 +1552,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) unsigned int mapcount; mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; @@ -1576,7 +1573,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) goto out; list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.vm_set.list) { + shared.nonlinear) { cursor = (unsigned long) vma->vm_private_data; if (cursor > max_nl_cursor) max_nl_cursor = cursor; @@ -1608,7 +1605,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) do { list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.vm_set.list) { + shared.nonlinear) { cursor = (unsigned long) vma->vm_private_data; while ( cursor < max_nl_cursor && cursor < vma->vm_end - vma->vm_start) { @@ -1631,7 +1628,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) * in locked vmas). Reset cursor on all unreserved nonlinear * vmas, now forgetting on which ones it had fallen behind. */ - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) vma->vm_private_data = NULL; out: mutex_unlock(&mapping->i_mmap_mutex); @@ -1748,13 +1745,12 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; - struct prio_tree_iter iter; int ret = SWAP_AGAIN; if (!mapping) return ret; mutex_lock(&mapping->i_mmap_mutex); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); if (address == -EFAULT) continue; -- cgit v1.2.3-59-g8ed1b From 38a76013ad809beb0b52f60d365c960d035bd83c Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:31:50 -0700 Subject: mm: avoid taking rmap locks in move_ptes() During mremap(), the destination VMA is generally placed after the original vma in rmap traversal order: in move_vma(), we always have new_pgoff >= vma->vm_pgoff, and as a result new_vma->vm_pgoff >= vma->vm_pgoff unless vma_merge() merged the new vma with an adjacent one. When the destination VMA is placed after the original in rmap traversal order, we can avoid taking the rmap locks in move_ptes(). Essentially, this reintroduces the optimization that had been disabled in "mm anon rmap: remove anon_vma_moveto_tail". The difference is that we don't try to impose the rmap traversal order; instead we just rely on things being in the desired order in the common case and fall back to taking locks in the uncommon case. Also we skip the i_mmap_mutex in addition to the anon_vma lock: in both cases, the vmas are traversed in increasing vm_pgoff order with ties resolved in tree insertion order. Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- include/linux/mm.h | 6 ++++-- mm/mmap.c | 7 +++++-- mm/mremap.c | 57 +++++++++++++++++++++++++++++++++++++----------------- 4 files changed, 49 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 19f4fb80cd17..4f2bebc276c5 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -603,7 +603,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * process cleanup to remove whatever mess we made. */ if (length != move_page_tables(vma, old_start, - vma, new_start, length)) + vma, new_start, length, false)) return -ENOMEM; lru_add_drain(); diff --git a/include/linux/mm.h b/include/linux/mm.h index 0e6f9c9f2123..0d5f823ce3fc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1060,7 +1060,8 @@ vm_is_stack(struct task_struct *task, struct vm_area_struct *vma, int in_group); extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, - unsigned long new_addr, unsigned long len); + unsigned long new_addr, unsigned long len, + bool need_rmap_locks); extern unsigned long do_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr); @@ -1410,7 +1411,8 @@ extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, struct rb_node **, struct rb_node *); extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, - unsigned long addr, unsigned long len, pgoff_t pgoff); + unsigned long addr, unsigned long len, pgoff_t pgoff, + bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); extern int mm_take_all_locks(struct mm_struct *mm); diff --git a/mm/mmap.c b/mm/mmap.c index 81248992120d..2d942353d681 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2371,7 +2371,8 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) * prior to moving page table entries, to effect an mremap move. */ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, - unsigned long addr, unsigned long len, pgoff_t pgoff) + unsigned long addr, unsigned long len, pgoff_t pgoff, + bool *need_rmap_locks) { struct vm_area_struct *vma = *vmap; unsigned long vma_start = vma->vm_start; @@ -2413,8 +2414,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, * linear if there are no pages mapped yet. */ VM_BUG_ON(faulted_in_anon_vma); - *vmap = new_vma; + *vmap = vma = new_vma; } + *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); } else { new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new_vma) { @@ -2434,6 +2436,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); vma_link(mm, new_vma, prev, rb_link, rb_parent); + *need_rmap_locks = false; } } return new_vma; diff --git a/mm/mremap.c b/mm/mremap.c index 5588bb6e9295..3b639a4b26bd 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -71,26 +71,42 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, unsigned long old_addr, unsigned long old_end, struct vm_area_struct *new_vma, pmd_t *new_pmd, - unsigned long new_addr) + unsigned long new_addr, bool need_rmap_locks) { struct address_space *mapping = NULL; - struct anon_vma *anon_vma = vma->anon_vma; + struct anon_vma *anon_vma = NULL; struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; spinlock_t *old_ptl, *new_ptl; - if (vma->vm_file) { - /* - * Subtle point from Rajesh Venkatasubramanian: before - * moving file-based ptes, we must lock truncate_pagecache - * out, since it might clean the dst vma before the src vma, - * and we propagate stale pages into the dst afterward. - */ - mapping = vma->vm_file->f_mapping; - mutex_lock(&mapping->i_mmap_mutex); + /* + * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma + * locks to ensure that rmap will always observe either the old or the + * new ptes. This is the easiest way to avoid races with + * truncate_pagecache(), page migration, etc... + * + * When need_rmap_locks is false, we use other ways to avoid + * such races: + * + * - During exec() shift_arg_pages(), we use a specially tagged vma + * which rmap call sites look for using is_vma_temporary_stack(). + * + * - During mremap(), new_vma is often known to be placed after vma + * in rmap traversal order. This ensures rmap will always observe + * either the old pte, or the new pte, or both (the page table locks + * serialize access to individual ptes, but only rmap traversal + * order guarantees that we won't miss both the old and new ptes). + */ + if (need_rmap_locks) { + if (vma->vm_file) { + mapping = vma->vm_file->f_mapping; + mutex_lock(&mapping->i_mmap_mutex); + } + if (vma->anon_vma) { + anon_vma = vma->anon_vma; + anon_vma_lock(anon_vma); + } } - if (anon_vma) - anon_vma_lock(anon_vma); /* * We don't have to worry about the ordering of src and dst @@ -127,7 +143,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, - unsigned long new_addr, unsigned long len) + unsigned long new_addr, unsigned long len, + bool need_rmap_locks) { unsigned long extent, next, old_end; pmd_t *old_pmd, *new_pmd; @@ -174,7 +191,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (extent > LATENCY_LIMIT) extent = LATENCY_LIMIT; move_ptes(vma, old_pmd, old_addr, old_addr + extent, - new_vma, new_pmd, new_addr); + new_vma, new_pmd, new_addr, need_rmap_locks); need_flush = true; } if (likely(need_flush)) @@ -198,6 +215,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long hiwater_vm; int split = 0; int err; + bool need_rmap_locks; /* * We'd prefer to avoid failure later on in do_munmap: @@ -219,18 +237,21 @@ static unsigned long move_vma(struct vm_area_struct *vma, return err; new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); - new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); + new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, + &need_rmap_locks); if (!new_vma) return -ENOMEM; - moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); + moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, + need_rmap_locks); if (moved_len < old_len) { /* * On error, move entries back from new area to old, * which will succeed since page tables still there, * and then proceed to unmap new area instead of old. */ - move_page_tables(new_vma, new_addr, vma, old_addr, moved_len); + move_page_tables(new_vma, new_addr, vma, old_addr, moved_len, + true); vma = new_vma; old_len = new_len; old_addr = new_addr; -- cgit v1.2.3-59-g8ed1b From cd8ed2a45a401cb692d769e92de7d73aa42fabce Mon Sep 17 00:00:00 2001 From: Yan Hong Date: Mon, 8 Oct 2012 16:33:45 -0700 Subject: fs/fs-writeback.c: remove unneccesary parameter of __writeback_single_inode() The parameter 'wb' is never used in this function. Signed-off-by: Yan Hong Acked-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 8e1d7b9e4a33..401b6c6248ae 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -439,8 +439,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * setting I_SYNC flag and calling inode_sync_complete() to clear it. */ static int -__writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, - struct writeback_control *wbc) +__writeback_single_inode(struct inode *inode, struct writeback_control *wbc) { struct address_space *mapping = inode->i_mapping; long nr_to_write = wbc->nr_to_write; @@ -527,7 +526,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, inode->i_state |= I_SYNC; spin_unlock(&inode->i_lock); - ret = __writeback_single_inode(inode, wb, wbc); + ret = __writeback_single_inode(inode, wbc); spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); @@ -670,7 +669,7 @@ static long writeback_sb_inodes(struct super_block *sb, * We use I_SYNC to pin the inode in memory. While it is set * evict_inode() will wait so the inode cannot be freed. */ - __writeback_single_inode(inode, wb, &wbc); + __writeback_single_inode(inode, &wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; wrote += write_chunk - wbc.nr_to_write; -- cgit v1.2.3-59-g8ed1b From 7a71932d5676b7410ab64d149bad8bde6b0d8632 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 8 Oct 2012 16:33:47 -0700 Subject: kpageflags: fix wrong KPF_THP on non-huge compound pages KPF_THP can be set on non-huge compound pages (like slab pages or pages allocated by drivers with __GFP_COMP) because PageTransCompound only checks PG_head and PG_tail. Obviously this is a bug and breaks user space applications which look for thp via /proc/kpageflags. This patch rules out setting KPF_THP wrongly by additionally checking PageLRU on the head pages. Signed-off-by: Naoya Horiguchi Acked-by: KOSAKI Motohiro Acked-by: David Rientjes Reviewed-by: Fengguang Wu Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/page.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/page.c b/fs/proc/page.c index 7fcd0d60a968..b8730d9ebaee 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -115,7 +115,13 @@ u64 stable_page_flags(struct page *page) u |= 1 << KPF_COMPOUND_TAIL; if (PageHuge(page)) u |= 1 << KPF_HUGE; - else if (PageTransCompound(page)) + /* + * PageTransCompound can be true for non-huge compound pages (slab + * pages or pages allocated by drivers with __GFP_COMP) because it + * just checks PG_head/PG_tail, so we need to check PageLRU to make + * sure a given page is a thp, not a non-huge compound page. + */ + else if (PageTransCompound(page) && PageLRU(compound_trans_head(page))) u |= 1 << KPF_THP; /* -- cgit v1.2.3-59-g8ed1b