
Commit 2f064f3

Michal Hocko authored and torvalds committed
mm: make page pfmemalloc check more robust
Commit c48a11c ("netvm: propagate page->pfmemalloc to skb") added checks
for page->pfmemalloc to __skb_fill_page_desc():

	if (page->pfmemalloc && !page->mapping)
		skb->pfmemalloc = true;

It assumes that page->mapping == NULL implies that page->pfmemalloc can
be trusted.  However, __delete_from_page_cache() can set page->mapping
to NULL and leave page->index untouched.  Because the two fields share a
union, a non-zero page->index will then be interpreted as a true
page->pfmemalloc.  So the assumption is invalid if the networking code
can see such a page, and it seems it can.

We have encountered this with an NFS-over-loopback setup when such a
page is attached to a new skbuff.  There is no copying going on in this
case, so the page confuses __skb_fill_page_desc(), which interprets the
stale index as the pfmemalloc flag.  The network stack drops packets
that have been allocated using the reserves unless they are to be queued
on sockets handling the swapping, which is the case here.  That leads to
hangs: the NFS client waits for a response from the server which has
been dropped and thus never arrives.

struct page is already heavily packed, so rather than finding another
hole to put the flag in, let's do a trick instead: reuse page->index,
defining an impossible value (-1UL) to mean pfmemalloc.  This is the
page index, so it should never legitimately see a value that large.

Replace all direct users of page->pfmemalloc with page_is_pfmemalloc(),
which hides this nastiness from unspoiled eyes.  The information is
obviously lost once somebody uses page->index, but that was the case
before as well: the original code expected the information to be
persisted somewhere else if it is really needed (e.g. what SLAB and
SLUB do).

[[email protected]: fix blooper in slub]
Fixes: c48a11c ("netvm: propagate page->pfmemalloc to skb")
Signed-off-by: Michal Hocko <[email protected]>
Debugged-by: Vlastimil Babka <[email protected]>
Debugged-by: Jiri Bohac <[email protected]>
Cc: Eric Dumazet <[email protected]>
Cc: David Miller <[email protected]>
Acked-by: Mel Gorman <[email protected]>
Cc: <[email protected]>	[3.6+]
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent e45fc85 commit 2f064f3
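
Editor's note: the aliasing the commit message describes is easy to reproduce in miniature. Below is a minimal userspace sketch of the pre-fix layout; struct fake_page and its values are hypothetical stand-ins, not kernel code, and the result assumes a little-endian machine such as x86.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical userspace model of the pre-fix struct page layout:
 * 'index' and 'pfmemalloc' share storage, as in the old union. */
struct fake_page {
	void *mapping;
	union {
		unsigned long index;	/* offset within mapping */
		bool pfmemalloc;	/* aliased flag (pre-fix layout) */
	};
};

int main(void)
{
	/* A page-cache page with a non-zero index... */
	struct fake_page page = { .mapping = (void *)0x1, .index = 1 };

	/* ...then __delete_from_page_cache() clears mapping but
	 * leaves index alone. */
	page.mapping = NULL;

	/* The old __skb_fill_page_desc() test: the stale index is now
	 * misread as pfmemalloc == true. */
	if (page.pfmemalloc && !page.mapping)
		printf("page wrongly treated as pfmemalloc\n");

	return 0;
}

The printf fires even though the page was never allocated from the reserves, which is exactly the misclassification that caused the network stack to drop the NFS replies.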

11 files changed, 47 insertions(+), 29 deletions(-)

drivers/net/ethernet/intel/fm10k/fm10k_main.c (+1, -1)

@@ -216,7 +216,7 @@ static void fm10k_reuse_rx_page(struct fm10k_ring *rx_ring,
 
 static inline bool fm10k_page_is_reserved(struct page *page)
 {
-	return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc;
+	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
 }
 
 static bool fm10k_can_reuse_rx_page(struct fm10k_rx_buffer *rx_buffer,

drivers/net/ethernet/intel/igb/igb_main.c (+1, -1)

@@ -6566,7 +6566,7 @@ static void igb_reuse_rx_page(struct igb_ring *rx_ring,
 
 static inline bool igb_page_is_reserved(struct page *page)
 {
-	return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc;
+	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
 }
 
 static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer,

drivers/net/ethernet/intel/ixgbe/ixgbe_main.c (+1, -1)

@@ -1832,7 +1832,7 @@ static void ixgbe_reuse_rx_page(struct ixgbe_ring *rx_ring,
 
 static inline bool ixgbe_page_is_reserved(struct page *page)
 {
-	return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc;
+	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
 }
 
 /**

drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c (+1, -1)

@@ -765,7 +765,7 @@ static void ixgbevf_reuse_rx_page(struct ixgbevf_ring *rx_ring,
 
 static inline bool ixgbevf_page_is_reserved(struct page *page)
 {
-	return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc;
+	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
 }
 
 /**

include/linux/mm.h (+28)

@@ -1002,6 +1002,34 @@ static inline int page_mapped(struct page *page)
 	return atomic_read(&(page)->_mapcount) >= 0;
 }
 
+/*
+ * Return true only if the page has been allocated with
+ * ALLOC_NO_WATERMARKS and the low watermark was not
+ * met implying that the system is under some pressure.
+ */
+static inline bool page_is_pfmemalloc(struct page *page)
+{
+	/*
+	 * Page index cannot be this large so this must be
+	 * a pfmemalloc page.
+	 */
+	return page->index == -1UL;
+}
+
+/*
+ * Only to be called by the page allocator on a freshly allocated
+ * page.
+ */
+static inline void set_page_pfmemalloc(struct page *page)
+{
+	page->index = -1UL;
+}
+
+static inline void clear_page_pfmemalloc(struct page *page)
+{
+	page->index = 0;
+}
+
 /*
  * Different kinds of faults, as returned by handle_mm_fault().
  * Used to decide whether a process gets delivered SIGBUS or
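
Editor's note on why -1UL is a safe sentinel: page->index is a pgoff_t (an unsigned long) counting PAGE_SIZE units within a mapping, and the maximum file size the kernel accepts caps the largest legal index well below ULONG_MAX. The sketch below is a rough sanity check, not kernel code; it assumes a 64-bit build, 4 KiB pages, and that MAX_LFS_FILESIZE is LLONG_MAX bytes (its 64-bit value in kernels of this era).

#include <limits.h>
#include <stdio.h>

#define PAGE_SHIFT 12	/* assumes 4 KiB pages */

int main(void)
{
	/* On 64-bit, the largest legal page index is bounded by the
	 * maximum file size in pages: LLONG_MAX >> PAGE_SHIFT... */
	unsigned long max_index = (unsigned long)(LLONG_MAX >> PAGE_SHIFT);

	/* ...which can never equal the -1UL (ULONG_MAX) sentinel. */
	printf("max index: %#lx\n", max_index);	/* 0x7ffffffffffff */
	printf("sentinel:  %#lx\n", -1UL);	/* 0xffffffffffffffff */
	return 0;
}

Note also that the sentinel must be cleared explicitly on every fresh allocation, since page->index may carry a stale value from the page's previous owner; the prep_new_page() change in mm/page_alloc.c below does exactly that.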

include/linux/mm_types.h (-9)

@@ -63,15 +63,6 @@ struct page {
 	union {
 		pgoff_t index;		/* Our offset within mapping. */
 		void *freelist;		/* sl[aou]b first free object */
-		bool pfmemalloc;	/* If set by the page allocator,
-					 * ALLOC_NO_WATERMARKS was set
-					 * and the low watermark was not
-					 * met implying that the system
-					 * is under some pressure. The
-					 * caller should try ensure
-					 * this page is only used to
-					 * free other pages.
-					 */
 	};
 
 	union {

include/linux/skbuff.h (+5, -9)

@@ -1602,20 +1602,16 @@ static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
 	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 
 	/*
-	 * Propagate page->pfmemalloc to the skb if we can. The problem is
-	 * that not all callers have unique ownership of the page. If
-	 * pfmemalloc is set, we check the mapping as a mapping implies
-	 * page->index is set (index and pfmemalloc share space).
-	 * If it's a valid mapping, we cannot use page->pfmemalloc but we
-	 * do not lose pfmemalloc information as the pages would not be
-	 * allocated using __GFP_MEMALLOC.
+	 * Propagate page pfmemalloc to the skb if we can. The problem is
+	 * that not all callers have unique ownership of the page but rely
+	 * on page_is_pfmemalloc doing the right thing(tm).
 	 */
 	frag->page.p = page;
 	frag->page_offset = off;
 	skb_frag_size_set(frag, size);
 
 	page = compound_head(page);
-	if (page->pfmemalloc && !page->mapping)
+	if (page_is_pfmemalloc(page))
 		skb->pfmemalloc = true;
 }
 

@@ -2263,7 +2259,7 @@ static inline struct page *dev_alloc_page(void)
 static inline void skb_propagate_pfmemalloc(struct page *page,
 					    struct sk_buff *skb)
 {
-	if (page && page->pfmemalloc)
+	if (page_is_pfmemalloc(page))
 		skb->pfmemalloc = true;
 }
 

mm/page_alloc.c (+6, -3)

@@ -1343,12 +1343,15 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
 	set_page_owner(page, order, gfp_flags);
 
 	/*
-	 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to
+	 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
 	 * allocate the page. The expectation is that the caller is taking
 	 * steps that will free more memory. The caller should avoid the page
 	 * being used for !PFMEMALLOC purposes.
 	 */
-	page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
+	if (alloc_flags & ALLOC_NO_WATERMARKS)
+		set_page_pfmemalloc(page);
+	else
+		clear_page_pfmemalloc(page);
 
 	return 0;
 }

@@ -3345,7 +3348,7 @@ void *__alloc_page_frag(struct page_frag_cache *nc,
 	atomic_add(size - 1, &page->_count);
 
 	/* reset page count bias and offset to start of new frag */
-	nc->pfmemalloc = page->pfmemalloc;
+	nc->pfmemalloc = page_is_pfmemalloc(page);
 	nc->pagecnt_bias = size;
 	nc->offset = size;
 }

mm/slab.c (+2, -2)

@@ -1603,7 +1603,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
 	}
 
 	/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
-	if (unlikely(page->pfmemalloc))
+	if (page_is_pfmemalloc(page))
 		pfmemalloc_active = true;
 
 	nr_pages = (1 << cachep->gfporder);

@@ -1614,7 +1614,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
 	add_zone_page_state(page_zone(page),
 		NR_SLAB_UNRECLAIMABLE, nr_pages);
 	__SetPageSlab(page);
-	if (page->pfmemalloc)
+	if (page_is_pfmemalloc(page))
 		SetPageSlabPfmemalloc(page);
 
 	if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {

mm/slub.c (+1, -1)

@@ -1427,7 +1427,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 	inc_slabs_node(s, page_to_nid(page), page->objects);
 	page->slab_cache = s;
 	__SetPageSlab(page);
-	if (page->pfmemalloc)
+	if (page_is_pfmemalloc(page))
 		SetPageSlabPfmemalloc(page);
 
 	start = page_address(page);

net/core/skbuff.c (+1, -1)

@@ -340,7 +340,7 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
 
 	if (skb && frag_size) {
 		skb->head_frag = 1;
-		if (virt_to_head_page(data)->pfmemalloc)
+		if (page_is_pfmemalloc(virt_to_head_page(data)))
 			skb->pfmemalloc = 1;
 	}
 	return skb;
