aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/trace/ring_buffer.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/trace/ring_buffer.c')
-rw-r--r--kernel/trace/ring_buffer.c192
1 files changed, 164 insertions, 28 deletions
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2699e9e562b1..9712083832f4 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -29,6 +29,14 @@
#include <asm/local.h>
+/*
+ * The "absolute" timestamp in the buffer is only 59 bits.
+ * If a clock has the 5 MSBs set, it needs to be saved and
+ * reinserted.
+ */
+#define TS_MSB (0xf8ULL << 56)
+#define ABS_TS_MASK (~TS_MSB)
+
static void update_pages_handler(struct work_struct *work);
/*
@@ -405,6 +413,7 @@ struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
wait_queue_head_t full_waiters;
+ long wait_index;
bool waiters_pending;
bool full_waiters_pending;
bool wakeup_full;
@@ -468,6 +477,7 @@ struct rb_time_struct {
local_t cnt;
local_t top;
local_t bottom;
+ local_t msb;
};
#else
#include <asm/local64.h>
@@ -569,7 +579,6 @@ struct ring_buffer_iter {
* For the ring buffer, 64 bit required operations for the time is
* the following:
*
- * - Only need 59 bits (uses 60 to make it even).
* - Reads may fail if it interrupted a modification of the time stamp.
* It will succeed if it did not interrupt another write even if
* the read itself is interrupted by a write.
@@ -594,6 +603,7 @@ struct ring_buffer_iter {
*/
#define RB_TIME_SHIFT 30
#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1)
+#define RB_TIME_MSB_SHIFT 60
static inline int rb_time_cnt(unsigned long val)
{
@@ -613,7 +623,7 @@ static inline u64 rb_time_val(unsigned long top, unsigned long bottom)
static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
{
- unsigned long top, bottom;
+ unsigned long top, bottom, msb;
unsigned long c;
/*
@@ -625,6 +635,7 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
c = local_read(&t->cnt);
top = local_read(&t->top);
bottom = local_read(&t->bottom);
+ msb = local_read(&t->msb);
} while (c != local_read(&t->cnt));
*cnt = rb_time_cnt(top);
@@ -633,7 +644,8 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
if (*cnt != rb_time_cnt(bottom))
return false;
- *ret = rb_time_val(top, bottom);
+ /* The shift to msb will lose its cnt bits */
+ *ret = rb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT);
return true;
}
@@ -649,10 +661,12 @@ static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt
return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT);
}
-static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom)
+static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom,
+ unsigned long *msb)
{
*top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK);
*bottom = (unsigned long)(val & RB_TIME_VAL_MASK);
+ *msb = (unsigned long)(val >> RB_TIME_MSB_SHIFT);
}
static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt)
@@ -663,15 +677,16 @@ static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long
static void rb_time_set(rb_time_t *t, u64 val)
{
- unsigned long cnt, top, bottom;
+ unsigned long cnt, top, bottom, msb;
- rb_time_split(val, &top, &bottom);
+ rb_time_split(val, &top, &bottom, &msb);
/* Writes always succeed with a valid number even if it gets interrupted. */
do {
cnt = local_inc_return(&t->cnt);
rb_time_val_set(&t->top, top, cnt);
rb_time_val_set(&t->bottom, bottom, cnt);
+ rb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt);
} while (cnt != local_read(&t->cnt));
}
@@ -686,8 +701,8 @@ rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
{
- unsigned long cnt, top, bottom;
- unsigned long cnt2, top2, bottom2;
+ unsigned long cnt, top, bottom, msb;
+ unsigned long cnt2, top2, bottom2, msb2;
u64 val;
/* The cmpxchg always fails if it interrupted an update */
@@ -703,16 +718,18 @@ static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
cnt2 = cnt + 1;
- rb_time_split(val, &top, &bottom);
+ rb_time_split(val, &top, &bottom, &msb);
top = rb_time_val_cnt(top, cnt);
bottom = rb_time_val_cnt(bottom, cnt);
- rb_time_split(set, &top2, &bottom2);
+ rb_time_split(set, &top2, &bottom2, &msb2);
top2 = rb_time_val_cnt(top2, cnt2);
bottom2 = rb_time_val_cnt(bottom2, cnt2);
if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2))
return false;
+ if (!rb_time_read_cmpxchg(&t->msb, msb, msb2))
+ return false;
if (!rb_time_read_cmpxchg(&t->top, top, top2))
return false;
if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2))
@@ -783,6 +800,24 @@ static inline void verify_event(struct ring_buffer_per_cpu *cpu_buffer,
}
#endif
+/*
+ * The absolute time stamp drops the 5 MSBs and some clocks may
+ * require them. The rb_fix_abs_ts() will take a previous full
+ * time stamp, and add the 5 MSB of that time stamp on to the
+ * saved absolute time stamp. Then they are compared in case of
+ * the unlikely event that the latest time stamp incremented
+ * the 5 MSB.
+ */
+static inline u64 rb_fix_abs_ts(u64 abs, u64 save_ts)
+{
+ if (save_ts & TS_MSB) {
+ abs |= save_ts & TS_MSB;
+ /* Check for overflow */
+ if (unlikely(abs < save_ts))
+ abs += 1ULL << 59;
+ }
+ return abs;
+}
static inline u64 rb_time_stamp(struct trace_buffer *buffer);
@@ -811,8 +846,10 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
u64 ts;
/* If the event includes an absolute time, then just use that */
- if (event->type_len == RINGBUF_TYPE_TIME_STAMP)
- return rb_event_time_stamp(event);
+ if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
+ ts = rb_event_time_stamp(event);
+ return rb_fix_abs_ts(ts, cpu_buffer->tail_page->page->time_stamp);
+ }
nest = local_read(&cpu_buffer->committing);
verify_event(cpu_buffer, event);
@@ -848,7 +885,7 @@ size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu)
}
/**
- * ring_buffer_nr_pages_dirty - get the number of used pages in the ring buffer
+ * ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer
* @buffer: The ring_buffer to get the number of pages from
* @cpu: The cpu of the ring_buffer to get the number of pages from
*
@@ -881,13 +918,56 @@ static void rb_wake_up_waiters(struct irq_work *work)
struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
wake_up_all(&rbwork->waiters);
- if (rbwork->wakeup_full) {
+ if (rbwork->full_waiters_pending || rbwork->wakeup_full) {
rbwork->wakeup_full = false;
+ rbwork->full_waiters_pending = false;
wake_up_all(&rbwork->full_waiters);
}
}
/**
+ * ring_buffer_wake_waiters - wake up any waiters on this ring buffer
+ * @buffer: The ring buffer to wake waiters on
+ *
+ * In the case of a file that represents a ring buffer is closing,
+ * it is prudent to wake up any waiters that are on this.
+ */
+void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct rb_irq_work *rbwork;
+
+ if (!buffer)
+ return;
+
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+
+ /* Wake up individual ones too. One level recursion */
+ for_each_buffer_cpu(buffer, cpu)
+ ring_buffer_wake_waiters(buffer, cpu);
+
+ rbwork = &buffer->irq_work;
+ } else {
+ if (WARN_ON_ONCE(!buffer->buffers))
+ return;
+ if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
+ return;
+
+ cpu_buffer = buffer->buffers[cpu];
+ /* The CPU buffer may not have been initialized yet */
+ if (!cpu_buffer)
+ return;
+ rbwork = &cpu_buffer->irq_work;
+ }
+
+ rbwork->wait_index++;
+ /* make sure the waiters see the new index */
+ smp_wmb();
+
+ rb_wake_up_waiters(&rbwork->work);
+}
+
+/**
* ring_buffer_wait - wait for input to the ring buffer
* @buffer: buffer to wait on
* @cpu: the cpu buffer to wait on
@@ -902,6 +982,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
struct ring_buffer_per_cpu *cpu_buffer;
DEFINE_WAIT(wait);
struct rb_irq_work *work;
+ long wait_index;
int ret = 0;
/*
@@ -920,6 +1001,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
work = &cpu_buffer->irq_work;
}
+ wait_index = READ_ONCE(work->wait_index);
while (true) {
if (full)
@@ -975,7 +1057,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
nr_pages = cpu_buffer->nr_pages;
dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
if (!cpu_buffer->shortest_full ||
- cpu_buffer->shortest_full < full)
+ cpu_buffer->shortest_full > full)
cpu_buffer->shortest_full = full;
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
if (!pagebusy &&
@@ -984,6 +1066,11 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
}
schedule();
+
+ /* Make sure to see the new wait index */
+ smp_rmb();
+ if (wait_index != work->wait_index)
+ break;
}
if (full)
@@ -2572,6 +2659,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
/* Mark the rest of the page with padding */
rb_event_set_padding(event);
+ /* Make sure the padding is visible before the write update */
+ smp_wmb();
+
/* Set the write back to the previous setting */
local_sub(length, &tail_page->write);
return;
@@ -2583,6 +2673,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
/* time delta must be non zero */
event->time_delta = 1;
+ /* Make sure the padding is visible before the tail_page->write update */
+ smp_wmb();
+
/* Set write to end of buffer */
length = (tail + length) - BUF_PAGE_SIZE;
local_sub(length, &tail_page->write);
@@ -2754,8 +2847,15 @@ static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
(RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE);
if (unlikely(info->delta > (1ULL << 59))) {
+ /*
+ * Some timers can use more than 59 bits, and when a timestamp
+ * is added to the buffer, it will lose those bits.
+ */
+ if (abs && (info->ts & TS_MSB)) {
+ info->delta &= ABS_TS_MASK;
+
/* did the clock go backwards */
- if (info->before == info->after && info->before > info->ts) {
+ } else if (info->before == info->after && info->before > info->ts) {
/* not interrupted */
static int once;
@@ -3304,7 +3404,7 @@ static void dump_buffer_page(struct buffer_data_page *bpage,
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
- ts = delta;
+ ts = rb_fix_abs_ts(delta, ts);
pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta);
break;
@@ -3380,7 +3480,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
- ts = delta;
+ ts = rb_fix_abs_ts(delta, ts);
break;
case RINGBUF_TYPE_PADDING:
@@ -4367,6 +4467,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
+ delta = rb_fix_abs_ts(delta, cpu_buffer->read_stamp);
cpu_buffer->read_stamp = delta;
return;
@@ -4397,6 +4498,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
+ delta = rb_fix_abs_ts(delta, iter->read_stamp);
iter->read_stamp = delta;
return;
@@ -4542,6 +4644,33 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
arch_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
+ /*
+ * The writer has preempt disable, wait for it. But not forever
+ * Although, 1 second is pretty much "forever"
+ */
+#define USECS_WAIT 1000000
+ for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) {
+ /* If the write is past the end of page, a writer is still updating it */
+ if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE))
+ break;
+
+ udelay(1);
+
+ /* Get the latest version of the reader write value */
+ smp_rmb();
+ }
+
+ /* The writer is not moving forward? Something is wrong */
+ if (RB_WARN_ON(cpu_buffer, nr_loops == USECS_WAIT))
+ reader = NULL;
+
+ /*
+ * Make sure we see any padding after the write update
+ * (see rb_reset_tail())
+ */
+ smp_rmb();
+
+
return reader;
}
@@ -4650,6 +4779,7 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
case RINGBUF_TYPE_TIME_STAMP:
if (ts) {
*ts = rb_event_time_stamp(event);
+ *ts = rb_fix_abs_ts(*ts, reader->page->time_stamp);
ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
cpu_buffer->cpu, ts);
}
@@ -4741,6 +4871,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
case RINGBUF_TYPE_TIME_STAMP:
if (ts) {
*ts = rb_event_time_stamp(event);
+ *ts = rb_fix_abs_ts(*ts, iter->head_page->page->time_stamp);
ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
cpu_buffer->cpu, ts);
}
@@ -5185,7 +5316,7 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
/**
- * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer
* @buffer: The ring buffer to reset a per cpu buffer of
* @cpu: The CPU buffer to be reset
*/
@@ -5255,7 +5386,7 @@ void ring_buffer_reset(struct trace_buffer *buffer)
EXPORT_SYMBOL_GPL(ring_buffer_reset);
/**
- * rind_buffer_empty - is the ring buffer empty?
+ * ring_buffer_empty - is the ring buffer empty?
* @buffer: The ring buffer to test
*/
bool ring_buffer_empty(struct trace_buffer *buffer)
@@ -5569,7 +5700,15 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
unsigned int pos = 0;
unsigned int size;
- if (full)
+ /*
+ * If a full page is expected, this can still be returned
+ * if there's been a previous partial read and the
+ * rest of the page can be read and the commit page is off
+ * the reader page.
+ */
+ if (full &&
+ (!read || (len < (commit - read)) ||
+ cpu_buffer->reader_page == cpu_buffer->commit_page))
goto out_unlock;
if (len > (commit - read))
@@ -5898,16 +6037,13 @@ static __init int test_ringbuffer(void)
rb_data[cpu].buffer = buffer;
rb_data[cpu].cpu = cpu;
rb_data[cpu].cnt = cpu;
- rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu],
- "rbtester/%d", cpu);
+ rb_threads[cpu] = kthread_run_on_cpu(rb_test, &rb_data[cpu],
+ cpu, "rbtester/%u");
if (WARN_ON(IS_ERR(rb_threads[cpu]))) {
pr_cont("FAILED\n");
ret = PTR_ERR(rb_threads[cpu]);
goto out_free;
}
-
- kthread_bind(rb_threads[cpu], cpu);
- wake_up_process(rb_threads[cpu]);
}
/* Now create the rb hammer! */
@@ -6014,10 +6150,10 @@ static __init int test_ringbuffer(void)
pr_info(" total events: %ld\n", total_lost + total_read);
pr_info(" recorded len bytes: %ld\n", total_len);
pr_info(" recorded size bytes: %ld\n", total_size);
- if (total_lost)
+ if (total_lost) {
pr_info(" With dropped events, record len and size may not match\n"
" alloced and written from above\n");
- if (!total_lost) {
+ } else {
if (RB_WARN_ON(buffer, total_len != total_alloc ||
total_size != total_written))
break;