aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64
diff options
context:
space:
mode:
authorAndi Kleen <ak@suse.de>2005-11-05 17:25:53 +0100
committerLinus Torvalds <torvalds@g5.osdl.org>2005-11-14 19:55:13 -0800
commita2f1b424900715ed9d1699c3bb88a434a2b42bc0 (patch)
tree8ef440f840656365166ff2d71aa445c224c53546 /arch/x86_64
parent[PATCH] x86_64: Update defconfig (diff)
downloadlinux-dev-a2f1b424900715ed9d1699c3bb88a434a2b42bc0.tar.xz
linux-dev-a2f1b424900715ed9d1699c3bb88a434a2b42bc0.zip
[PATCH] x86_64: Add 4GB DMA32 zone
Add a new 4GB GFP_DMA32 zone between the GFP_DMA and GFP_NORMAL zones. As a bit of historical background: when the x86-64 port was originally designed we had some discussion about whether we should use a 16MB DMA zone like i386, a 4GB DMA zone like IA64, or both. Both were ruled out at that point because it was early 2.4, when the VM was still quite shaky and had serious trouble even dealing with one DMA zone. We settled on the 16MB DMA zone mainly because we worried about older soundcards and the floppy. But this has caused problems ever since, because device drivers had trouble getting enough DMA-able memory. These days the VM works much better, and the wide use of NUMA has proven it can deal with many zones successfully. So this patch adds both zones. This helps drivers that need a lot of memory below 4GB because their hardware cannot address more (graphics drivers - proprietary and free ones, video frame buffer drivers, sound drivers etc.). Previously they could only use IOMMU+16MB GFP_DMA, which was not enough memory. Another common problem is that hardware that has full memory addressing above 4GB lacks it for some control structures in memory (like transmit rings or other metadata). Such drivers tended to allocate this memory in the 16MB GFP_DMA zone or the IOMMU/swiotlb using pci_alloc_consistent, but that can tie up a lot of precious 16MB GFP_DMA/IOMMU/swiotlb memory (even on AMD systems the IOMMU tends to be quite small), especially if you have many devices. With the new zone pci_alloc_consistent can just put this stuff into memory below 4GB, which works better. One remaining argument was whether the zone should be 4GB or 2GB. The main motivation for 2GB would be an unnamed, not so unpopular hardware RAID controller (mostly found in older machines from a particular four letter company) that has a strange 2GB restriction in firmware. But that one works ok with swiotlb/IOMMU anyway, so it doesn't really need GFP_DMA32.
I chose 4GB to be compatible with IA64 and because it seems to be the most common restriction. The new zone is so far added only for x86-64. For other architectures that don't set up this new zone nothing changes. Architectures can set a compatibility define in Kconfig, CONFIG_DMA_IS_DMA32, that will define GFP_DMA32 as GFP_DMA. Otherwise it's a nop, because on 32bit architectures it's normally not needed: GFP_NORMAL (=0) is DMA-able enough. One remaining problem is that GFP_DMA means different things on different architectures. E.g. some drivers used to have #ifdef ia64 use GFP_DMA (trusting it to be 4GB) #elif __x86_64__ (use other hacks like the swiotlb because 16MB is not enough) ... . This was quite ugly and is now obsolete. These should now be converted to use GFP_DMA32 unconditionally. I haven't done this yet. Or, best of all, only use pci_alloc_consistent/dma_alloc_coherent, which will use GFP_DMA32 transparently. Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'arch/x86_64')
-rw-r--r--arch/x86_64/mm/init.c65
-rw-r--r--arch/x86_64/mm/numa.c25
2 files changed, 47 insertions, 43 deletions
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index e60a1a848de8..a1ad4cc423a7 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -318,32 +318,51 @@ void zap_low_mappings(void)
flush_tlb_all();
}
+/*
+ * Compute the per-zone sizes and hole sizes for the pfn range of one node.
+ *
+ * z:         output, MAX_NR_ZONES entries; on return z[i] is the number of
+ *            page frames of [start_pfn, end_pfn) that belong to zone i.
+ * h:         output, MAX_NR_ZONES entries; on return h[i] is the value
+ *            e820_hole_size() reports for the pfn range covered by zone i.
+ * start_pfn: first page frame number of the range.
+ * end_pfn:   end of the range (exclusive: the total size is taken as
+ *            end_pfn - start_pfn).
+ *
+ * Frames below MAX_DMA_PFN go to ZONE_DMA, frames from there up to
+ * MAX_DMA32_PFN go to ZONE_DMA32, and the remainder goes to ZONE_NORMAL.
+ */
+__init void
+size_zones(unsigned long *z, unsigned long *h,
+	   unsigned long start_pfn, unsigned long end_pfn)
+{
+	int i;
+	unsigned long w;
+
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		z[i] = 0;
+
+	/*
+	 * First store, per zone, the cumulative frame count from start_pfn
+	 * up to the zone's upper limit.  Every limit is clamped to end_pfn so
+	 * that a range ending inside a zone cannot overshoot, which would
+	 * make the unsigned subtraction below wrap around.
+	 */
+	if (start_pfn < MAX_DMA_PFN) {
+		unsigned long dma_pfn = MAX_DMA_PFN;
+		if (dma_pfn > end_pfn)
+			dma_pfn = end_pfn;
+		z[ZONE_DMA] = dma_pfn - start_pfn;
+	}
+	if (start_pfn < MAX_DMA32_PFN) {
+		unsigned long dma32_pfn = MAX_DMA32_PFN;
+		if (dma32_pfn > end_pfn)
+			dma32_pfn = end_pfn;
+		z[ZONE_DMA32] = dma32_pfn - start_pfn;
+	}
+	z[ZONE_NORMAL] = end_pfn - start_pfn;
+
+	/* Remove lower zones from higher ones: cumulative -> per-zone size. */
+	w = 0;
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		if (z[i])
+			z[i] -= w;
+		w += z[i];
+	}
+
+	/*
+	 * Compute holes.  z[] holds sizes relative to start_pfn, so the
+	 * running boundary has to begin at start_pfn (not 0) for the ranges
+	 * handed to e820_hole_size() to be the zones' real pfn ranges.
+	 */
+	w = start_pfn;
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		unsigned long s = w;
+		w += z[i];
+		h[i] = e820_hole_size(s, w);
+	}
+}
+
#ifndef CONFIG_NUMA
void __init paging_init(void)
{
- {
- unsigned long zones_size[MAX_NR_ZONES];
- unsigned long holes[MAX_NR_ZONES];
- unsigned int max_dma;
-
- memset(zones_size, 0, sizeof(zones_size));
- memset(holes, 0, sizeof(holes));
-
- max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-
- if (end_pfn < max_dma) {
- zones_size[ZONE_DMA] = end_pfn;
- holes[ZONE_DMA] = e820_hole_size(0, end_pfn);
- } else {
- zones_size[ZONE_DMA] = max_dma;
- holes[ZONE_DMA] = e820_hole_size(0, max_dma);
- zones_size[ZONE_NORMAL] = end_pfn - max_dma;
- holes[ZONE_NORMAL] = e820_hole_size(max_dma, end_pfn);
- }
- free_area_init_node(0, NODE_DATA(0), zones_size,
- __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
- }
- return;
+ unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
+ size_zones(zones, holes, 0, end_pfn);
+ free_area_init_node(0, NODE_DATA(0), zones,
+ __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
}
#endif
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 214803821001..18e86e2eac2d 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -132,29 +132,14 @@ void __init setup_node_zones(int nodeid)
unsigned long start_pfn, end_pfn;
unsigned long zones[MAX_NR_ZONES];
unsigned long holes[MAX_NR_ZONES];
- unsigned long dma_end_pfn;
- memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES);
- memset(holes, 0, sizeof(unsigned long) * MAX_NR_ZONES);
+ start_pfn = node_start_pfn(nodeid);
+ end_pfn = node_end_pfn(nodeid);
- start_pfn = node_start_pfn(nodeid);
- end_pfn = node_end_pfn(nodeid);
+ Dprintk(KERN_INFO "setting up node %d %lx-%lx\n",
+ nodeid, start_pfn, end_pfn);
- Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);
-
- /* All nodes > 0 have a zero length zone DMA */
- dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT;
- if (start_pfn < dma_end_pfn) {
- zones[ZONE_DMA] = dma_end_pfn - start_pfn;
- holes[ZONE_DMA] = e820_hole_size(start_pfn, dma_end_pfn);
- zones[ZONE_NORMAL] = end_pfn - dma_end_pfn;
- holes[ZONE_NORMAL] = e820_hole_size(dma_end_pfn, end_pfn);
-
- } else {
- zones[ZONE_NORMAL] = end_pfn - start_pfn;
- holes[ZONE_NORMAL] = e820_hole_size(start_pfn, end_pfn);
- }
-
+ size_zones(zones, holes, start_pfn, end_pfn);
free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
start_pfn, holes);
}