
Commit 6cb0bd9

Merge tag 'trace-ringbuffer-v6.15-3' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace
Pull ring-buffer updates from Steven Rostedt:
 "Persistent buffer cleanups and simplifications.

  It was mistakenly assumed that the physical memory returned from
  "reserve_mem" had to be vmap()'d to get to it from a virtual address.
  But reserve_mem already maps the memory into the kernel's virtual
  address space, so a simple phys_to_virt() can be used to get the
  virtual address from the physical memory returned by "reserve_mem".
  With this newfound knowledge, the code can be cleaned up and
  simplified.

   - Enforce that the persistent memory is page aligned

     As the buffers using the persistent memory are all going to be
     mapped via pages, make sure that the memory given to the tracing
     infrastructure is page aligned. If it is not, print a warning and
     fail to map the buffer.

   - Use phys_to_virt() to get the virtual address from reserve_mem

     Instead of calling vmap() on the physical memory returned from
     "reserve_mem", use phys_to_virt() instead.

     Memory returned by "memmap", or any other means where a raw
     physical address is given to the tracing infrastructure, still
     needs to be vmap()'d. Since that memory can never be returned to
     the buddy allocator, nor should it ever be memory mapped to user
     space, flag the buffer and bump its ref count. The ref count keeps
     it from ever being freed, and the flag prevents it from ever being
     memory mapped to user space.

   - Use vmap_page_range() for memmap virtual address mapping

     For the memmap buffer, instead of allocating an array of struct
     pages, assigning them to the contiguous physical memory and then
     passing that to vmap(), use vmap_page_range() instead.

   - Replace flush_dcache_folio() with flush_kernel_vmap_range()

     Instead of calling virt_to_folio() and passing that to
     flush_dcache_folio(), just call flush_kernel_vmap_range() directly.
     This also fixes a bug where, if a subbuffer was bigger than
     PAGE_SIZE, only the PAGE_SIZE portion would be flushed"

* tag 'trace-ringbuffer-v6.15-3' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace:
  ring-buffer: Use flush_kernel_vmap_range() over flush_dcache_folio()
  tracing: Use vmap_page_range() to map memmap ring buffer
  tracing: Have reserve_mem use phys_to_virt() and separate from memmap buffer
  tracing: Enforce the persistent ring buffer to be page aligned
2 parents 949dd32 + e4d4b86 commit 6cb0bd9
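
The heart of the series is the split between the two mapping paths, mirrored in the rewritten map_pages() in the trace.c diff below: memory set aside by "reserve_mem" is still covered by the kernel's direct mapping, so phys_to_virt() suffices, while a raw "memmap" range is not, and must be given a fresh virtual mapping. A minimal sketch of the distinction, using the same kernel APIs as the diff (the helper name map_persistent_range() is hypothetical):

	#include <linux/io.h>      /* phys_to_virt(), vmap_page_range() */
	#include <linux/vmalloc.h> /* get_vm_area(), free_vm_area() */

	/* Hypothetical helper: return a kernel virtual address for a
	 * page-aligned physical range of persistent buffer memory. */
	static void *map_persistent_range(phys_addr_t start, size_t size,
					  bool is_memmap)
	{
		struct vm_struct *area;
		unsigned long vstart;

		/* reserve_mem memory is already in the direct map. */
		if (!is_memmap)
			return phys_to_virt(start);

		/* memmap memory is outside the direct map: reserve a
		 * window in vmalloc space and map the physical range
		 * into it, non-executable. */
		area = get_vm_area(size, VM_IOREMAP);
		if (!area)
			return NULL;

		vstart = (unsigned long)area->addr;
		if (vmap_page_range(vstart, vstart + size, start,
				    pgprot_nx(PAGE_KERNEL)) < 0) {
			free_vm_area(area);
			return NULL;
		}
		return area->addr;
	}

Because the reserve_mem path hands back an address the kernel already owns, there is nothing to vunmap() on teardown, which is why the series also drops the vunmap() call from free_trace_buffers().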

File tree

5 files changed, +50 -26 lines changed

Documentation/admin-guide/kernel-parameters.txt

+2
@@ -7288,6 +7288,8 @@
 			This is just one of many ways that can clear memory. Make sure your system
 			keeps the content of memory across reboots before relying on this option.
 
+			NB: Both the mapped address and size must be page aligned for the architecture.
+
 			See also Documentation/trace/debugging.rst
 
 
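For reference, the note applies to boot setups along these lines (the reserved-memory name "trace", the instance name "boot_map", and the address are illustrative; see Documentation/trace/debugging.rst for the documented syntax):

	reserve_mem=12M:4096:trace trace_instance=boot_map@trace
	memmap=12M$0x285400000 trace_instance=boot_map@0x285400000:12M

With the reserve_mem form the kernel chooses a suitably aligned region itself; with the memmap form the given address and size must now be page aligned, or the mapping is refused with a warning.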

Documentation/trace/debugging.rst

+2
@@ -136,6 +136,8 @@ kernel, so only the same kernel is guaranteed to work if the mapping is
 preserved. Switching to a different kernel version may find a different
 layout and mark the buffer as invalid.
 
+NB: Both the mapped address and size must be page aligned for the architecture.
+
 Using trace_printk() in the boot instance
 -----------------------------------------
 By default, the content of trace_printk() goes into the top level tracing

kernel/trace/ring_buffer.c

+3 -2

@@ -6016,7 +6016,7 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
 	meta->read = cpu_buffer->read;
 
 	/* Some archs do not have data cache coherency between kernel and user-space */
-	flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page));
+	flush_kernel_vmap_range(cpu_buffer->meta_page, PAGE_SIZE);
 }
 
 static void
@@ -7319,7 +7319,8 @@ int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu)
 
  out:
 	/* Some archs do not have data cache coherency between kernel and user-space */
-	flush_dcache_folio(virt_to_folio(cpu_buffer->reader_page->page));
+	flush_kernel_vmap_range(cpu_buffer->reader_page->page,
+				buffer->subbuf_size + BUF_PAGE_HDR_SIZE);
 
 	rb_update_meta_page(cpu_buffer);
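The second hunk is also the bug fix called out in the merge message: virt_to_folio() resolves to the single folio containing the address, so flush_dcache_folio() wrote back at most one page even when a sub-buffer spanned several. A fragment contrasting the two calls (the order-2 sizing is an illustrative assumption, not taken from the diff):

	/* A sub-buffer of order 2 on a 4K-page machine holds 16K of data. */
	size_t subbuf_size = PAGE_SIZE << 2;

	/* Old: flushes only the folio under the pointer -- 4K of the 16K. */
	flush_dcache_folio(virt_to_folio(subbuf));

	/* New: takes an explicit length, so the whole sub-buffer plus its
	 * header is written back regardless of how many pages it spans. */
	flush_kernel_vmap_range(subbuf, subbuf_size + BUF_PAGE_HDR_SIZE);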

kernel/trace/trace.c

+42 -24

@@ -50,6 +50,7 @@
 #include <linux/irq_work.h>
 #include <linux/workqueue.h>
 #include <linux/sort.h>
+#include <linux/io.h> /* vmap_page_range() */
 
 #include <asm/setup.h> /* COMMAND_LINE_SIZE */
 
@@ -8500,6 +8501,10 @@ static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma)
 	struct trace_iterator *iter = &info->iter;
 	int ret = 0;
 
+	/* A memmap'ed buffer is not supported for user space mmap */
+	if (iter->tr->flags & TRACE_ARRAY_FL_MEMMAP)
+		return -ENODEV;
+
 	/* Currently the boot mapped buffer is not supported for mmap */
 	if (iter->tr->flags & TRACE_ARRAY_FL_BOOT)
 		return -ENODEV;
@@ -9609,9 +9614,6 @@ static void free_trace_buffers(struct trace_array *tr)
 #ifdef CONFIG_TRACER_MAX_TRACE
 	free_trace_buffer(&tr->max_buffer);
 #endif
-
-	if (tr->range_addr_start)
-		vunmap((void *)tr->range_addr_start);
 }
 
 static void init_trace_flags_index(struct trace_array *tr)
@@ -9804,29 +9806,27 @@ static int instance_mkdir(const char *name)
 	return ret;
 }
 
-static u64 map_pages(u64 start, u64 size)
+static u64 map_pages(unsigned long start, unsigned long size)
 {
-	struct page **pages;
-	phys_addr_t page_start;
-	unsigned int page_count;
-	unsigned int i;
-	void *vaddr;
-
-	page_count = DIV_ROUND_UP(size, PAGE_SIZE);
+	unsigned long vmap_start, vmap_end;
+	struct vm_struct *area;
+	int ret;
 
-	page_start = start;
-	pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL);
-	if (!pages)
+	area = get_vm_area(size, VM_IOREMAP);
+	if (!area)
 		return 0;
 
-	for (i = 0; i < page_count; i++) {
-		phys_addr_t addr = page_start + i * PAGE_SIZE;
-		pages[i] = pfn_to_page(addr >> PAGE_SHIFT);
+	vmap_start = (unsigned long) area->addr;
+	vmap_end = vmap_start + size;
+
+	ret = vmap_page_range(vmap_start, vmap_end,
+			      start, pgprot_nx(PAGE_KERNEL));
+	if (ret < 0) {
+		free_vm_area(area);
+		return 0;
 	}
-	vaddr = vmap(pages, page_count, VM_MAP, PAGE_KERNEL);
-	kfree(pages);
 
-	return (u64)(unsigned long)vaddr;
+	return (u64)vmap_start;
 }
 
 /**
@@ -10705,6 +10705,7 @@ static inline void do_allocate_snapshot(const char *name) { }
 __init static void enable_instances(void)
 {
 	struct trace_array *tr;
+	bool memmap_area = false;
 	char *curr_str;
 	char *name;
 	char *str;
@@ -10773,6 +10774,7 @@ __init static void enable_instances(void)
 					name);
 				continue;
 			}
+			memmap_area = true;
 		} else if (tok) {
 			if (!reserve_mem_find_by_name(tok, &start, &size)) {
 				start = 0;
@@ -10783,7 +10785,20 @@ __init static void enable_instances(void)
 		}
 
 		if (start) {
-			addr = map_pages(start, size);
+			/* Start and size must be page aligned */
+			if (start & ~PAGE_MASK) {
+				pr_warn("Tracing: mapping start addr %pa is not page aligned\n", &start);
+				continue;
+			}
+			if (size & ~PAGE_MASK) {
+				pr_warn("Tracing: mapping size %pa is not page aligned\n", &size);
+				continue;
+			}
+
+			if (memmap_area)
+				addr = map_pages(start, size);
+			else
+				addr = (unsigned long)phys_to_virt(start);
 			if (addr) {
 				pr_info("Tracing: mapped boot instance %s at physical memory %pa of size 0x%lx\n",
 					name, &start, (unsigned long)size);
@@ -10810,10 +10825,13 @@ __init static void enable_instances(void)
 		update_printk_trace(tr);
 
 		/*
-		 * If start is set, then this is a mapped buffer, and
-		 * cannot be deleted by user space, so keep the reference
-		 * to it.
+		 * memmap'd buffers can not be freed.
 		 */
+		if (memmap_area) {
+			tr->flags |= TRACE_ARRAY_FL_MEMMAP;
+			tr->ref++;
+		}
+
 		if (start) {
 			tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT;
 			tr->range_name = no_free_ptr(rname);
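
A note on the alignment test above: PAGE_MASK clears the offset bits within a page, so "x & ~PAGE_MASK" isolates that offset and is non-zero exactly when x is misaligned (the kernel's PAGE_ALIGNED() macro expresses the same predicate). A worked example, assuming 4K pages:

	/* PAGE_SIZE == 0x1000, so ~PAGE_MASK == 0xfff:
	 *   0x285400000 & 0xfff == 0x000  -> aligned, buffer is mapped
	 *   0x285400200 & 0xfff == 0x200  -> misaligned, pr_warn() and skip
	 */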

kernel/trace/trace.h

+1
@@ -447,6 +447,7 @@ enum {
 	TRACE_ARRAY_FL_BOOT		= BIT(1),
 	TRACE_ARRAY_FL_LAST_BOOT	= BIT(2),
 	TRACE_ARRAY_FL_MOD_INIT		= BIT(3),
+	TRACE_ARRAY_FL_MEMMAP		= BIT(4),
 };
 
 #ifdef CONFIG_MODULES
