Merge branch 'tip/perf/ringbuffer-2' of git://git.kernel.org/pub/scm/linux/kernel...
author    Ingo Molnar <mingo@elte.hu>
          Tue, 26 Oct 2010 11:14:02 +0000 (13:14 +0200)
committer Ingo Molnar <mingo@elte.hu>
          Tue, 26 Oct 2010 11:14:02 +0000 (13:14 +0200)
kernel/trace/ring_buffer.c

index c5a632a669e1f7a4ebc6a07889ef593265b3d76a,ad25490f8b4077e14016628230c21479f2596050..ec5c71005c14c6d6a0861ec9ff93fa4bd3a218e4
@@@ -224,6 -224,9 +224,9 @@@ enum 
        RB_LEN_TIME_STAMP = 16,
  };
  
+ #define skip_time_extend(event) \
+       ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
+
  static inline int rb_null_event(struct ring_buffer_event *event)
  {
        return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@@@ -248,8 -251,12 +251,12 @@@ rb_event_data_length(struct ring_buffer
        return length + RB_EVNT_HDR_SIZE;
  }
  
- /* inline for ring buffer fast paths */
- static unsigned
+ /*
+  * Return the length of the given event. Will return
+  * the length of the time extend if the event is a
+  * time extend.
+  */
+ static inline unsigned
  rb_event_length(struct ring_buffer_event *event)
  {
        switch (event->type_len) {
        return 0;
  }
  
+ /*
+  * Return total length of time extend and data,
+  *   or just the event length for all other events.
+  */
+ static inline unsigned
+ rb_event_ts_length(struct ring_buffer_event *event)
+ {
+       unsigned len = 0;
+       if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+               /* a time extend includes the data event after it */
+               len = RB_LEN_TIME_EXTEND;
+               event = skip_time_extend(event);
+       }
+       return len + rb_event_length(event);
+ }
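
To make the layout these helpers assume concrete: a time extend is an 8-byte
header sitting immediately in front of the data event whose oversized delta it
carries, and rb_event_ts_length() reports the pair as one unit. A minimal
stand-alone sketch, not part of this diff; the struct and constants are
simplified copies of this file's definitions, and event_length() is a
data-events-only stand-in for rb_event_length():

    #include <stdint.h>
    #include <stdio.h>

    #define RINGBUF_TYPE_TIME_EXTEND 30  /* value from the kernel's type enum */
    #define RB_LEN_TIME_EXTEND        8  /* size of the time-extend header    */

    struct event {                       /* simplified ring_buffer_event      */
            uint32_t type_len:5, time_delta:27;
            uint32_t array[];
    };

    /* Same pointer math as the skip_time_extend() macro above. */
    static struct event *skip_extend(struct event *e)
    {
            return (struct event *)((char *)e + RB_LEN_TIME_EXTEND);
    }

    /* Data events only: payload is type_len * 4 bytes plus a 4-byte header. */
    static unsigned event_length(struct event *e)
    {
            return e->type_len * 4 + 4;
    }

    /* Mirror of rb_event_ts_length(): count header and payload together. */
    static unsigned event_ts_length(struct event *e)
    {
            unsigned len = 0;

            if (e->type_len == RINGBUF_TYPE_TIME_EXTEND) {
                    len = RB_LEN_TIME_EXTEND;
                    e = skip_extend(e);
            }
            return len + event_length(e);
    }

    int main(void)
    {
            uint32_t buf[8] = {0};
            struct event *e = (struct event *)buf;

            e->type_len = RINGBUF_TYPE_TIME_EXTEND;
            skip_extend(e)->type_len = 3;          /* 12-byte payload */
            printf("%u\n", event_ts_length(e));    /* 8 + 16 = 24     */
            return 0;
    }
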
  /**
   * ring_buffer_event_length - return the length of the event
   * @event: the event to get the length of
+  *
+  * Returns the size of the data load of a data event.
+  * If the event is something other than a data event, it
+  * returns the size of the event itself. With the exception
+  * of a TIME EXTEND, where it still returns the size of the
+  * data load of the data event after it.
   */
  unsigned ring_buffer_event_length(struct ring_buffer_event *event)
  {
-       unsigned length = rb_event_length(event);
+       unsigned length;
+       if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+               event = skip_time_extend(event);
+       length = rb_event_length(event);
        if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
                return length;
        length -= RB_EVNT_HDR_SIZE;
@@@ -294,6 -329,8 +329,8 @@@ EXPORT_SYMBOL_GPL(ring_buffer_event_len
  static void *
  rb_event_data(struct ring_buffer_event *event)
  {
+       if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+               event = skip_time_extend(event);
        BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
        /* If length is in len field, then array[0] has the data */
        if (event->type_len)
@@@ -404,9 -441,6 +441,6 @@@ static inline int test_time_stamp(u64 d
  /* Max payload is BUF_PAGE_SIZE - header (8 bytes) */
  #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
  
- /* Max number of timestamps that can fit on a page */
- #define RB_TIMESTAMPS_PER_PAGE        (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND)
  int ring_buffer_print_page_header(struct trace_seq *s)
  {
        struct buffer_data_page field;
@@@ -1546,6 -1580,25 +1580,25 @@@ static void rb_inc_iter(struct ring_buf
        iter->head = 0;
  }
  
+ /* Slow path, do not inline */
+ static noinline struct ring_buffer_event *
+ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
+ {
+       event->type_len = RINGBUF_TYPE_TIME_EXTEND;
+       /* Not the first event on the page? */
+       if (rb_event_index(event)) {
+               event->time_delta = delta & TS_MASK;
+               event->array[0] = delta >> TS_SHIFT;
+       } else {
+               /* nope, just zero it */
+               event->time_delta = 0;
+               event->array[0] = 0;
+       }
+       return skip_time_extend(event);
+ }
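
The split above is the core trick: the event header has only a 27-bit
time_delta field, so an oversized delta keeps its low 27 bits there and pushes
the remainder into array[0]. A stand-alone round-trip sketch, with TS_SHIFT and
TS_MASK assumed to match this file's definitions:

    #include <assert.h>
    #include <stdint.h>

    #define TS_SHIFT 27                       /* bits that fit in time_delta */
    #define TS_MASK  ((1ULL << TS_SHIFT) - 1)

    int main(void)
    {
            uint64_t delta = 123456789012ULL;        /* too big for 27 bits */
            uint32_t time_delta = delta & TS_MASK;   /* low 27 bits         */
            uint32_t array0     = delta >> TS_SHIFT; /* remaining high bits */

            /* Reassembled exactly the way rb_update_write_stamp() does. */
            uint64_t back = ((uint64_t)array0 << TS_SHIFT) + time_delta;

            assert(back == delta);
            return 0;
    }
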
  /**
   * ring_buffer_update_event - update event type and data
   * @event: the event to update
   * data field.
   */
  static void
- rb_update_event(struct ring_buffer_event *event,
-                        unsigned type, unsigned length)
+ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
+               struct ring_buffer_event *event, unsigned length,
+               int add_timestamp, u64 delta)
  {
-       event->type_len = type;
-       switch (type) {
-       case RINGBUF_TYPE_PADDING:
-       case RINGBUF_TYPE_TIME_EXTEND:
-       case RINGBUF_TYPE_TIME_STAMP:
-               break;
+       /* Only a commit updates the timestamp */
+       if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
+               delta = 0;
  
-       case 0:
-               length -= RB_EVNT_HDR_SIZE;
-               if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
-                       event->array[0] = length;
-               else
-                       event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
-               break;
-       default:
-               BUG();
+       /*
+        * If we need to add a timestamp, then we
+        * add it to the start of the reserved space.
+        */
+       if (unlikely(add_timestamp)) {
+               event = rb_add_time_stamp(event, delta);
+               length -= RB_LEN_TIME_EXTEND;
+               delta = 0;
        }
+       event->time_delta = delta;
+       length -= RB_EVNT_HDR_SIZE;
+       if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
+               event->type_len = 0;
+               event->array[0] = length;
+       } else
+               event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
  }
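
The tail of rb_update_event() is the ring buffer's length encoding: a payload
of up to RB_MAX_SMALL_DATA (28 * 4 = 112) bytes is stored as its 4-byte-aligned
size directly in the 5-bit type_len, while anything larger sets type_len to 0
and spills the byte count into array[0]. A stand-alone sketch of the
encode/decode pair; the constants mirror RB_ALIGNMENT and RB_MAX_SMALL_DATA,
lengths here are payload sizes (the kernel subtracts the 4-byte header first),
and the RB_FORCE_8BYTE_ALIGNMENT case is ignored for brevity:

    #include <assert.h>
    #include <stdint.h>

    #define RB_ALIGNMENT       4
    #define RB_MAX_SMALL_DATA  (28 * RB_ALIGNMENT)   /* 112 bytes */
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    struct event {
            uint32_t type_len:5, time_delta:27;
            uint32_t array[1];
    };

    static void encode_len(struct event *e, unsigned length)
    {
            if (length > RB_MAX_SMALL_DATA) {
                    e->type_len = 0;          /* 0 means: length in array[0] */
                    e->array[0] = length;
            } else {
                    e->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
            }
    }

    /* Decode the same way rb_event_data_length() does. */
    static unsigned decode_len(struct event *e)
    {
            return e->type_len ? e->type_len * RB_ALIGNMENT : e->array[0];
    }

    int main(void)
    {
            struct event e = { 0 };

            encode_len(&e, 100);              /* small: fits in type_len     */
            assert(decode_len(&e) == 100);

            encode_len(&e, 500);              /* large: spills into array[0] */
            assert(decode_len(&e) == 500);
            return 0;
    }
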
  
  /*
@@@ -1823,10 -1879,13 +1879,13 @@@ rb_reset_tail(struct ring_buffer_per_cp
        local_sub(length, &tail_page->write);
  }
  
- static struct ring_buffer_event *
+ /*
+  * This is the slow path, force gcc not to inline it.
+  */
+ static noinline struct ring_buffer_event *
  rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
             unsigned long length, unsigned long tail,
-            struct buffer_page *tail_page, u64 *ts)
+            struct buffer_page *tail_page, u64 ts)
  {
        struct buffer_page *commit_page = cpu_buffer->commit_page;
        struct ring_buffer *buffer = cpu_buffer->buffer;
                 * Nested commits always have zero deltas, so
                 * just reread the time stamp
                 */
-               *ts = rb_time_stamp(buffer);
-               next_page->page->time_stamp = *ts;
+               ts = rb_time_stamp(buffer);
+               next_page->page->time_stamp = ts;
        }
  
   out_again:
  
  static struct ring_buffer_event *
  __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
-                 unsigned type, unsigned long length, u64 *ts)
+                 unsigned long length, u64 ts,
+                 u64 delta, int add_timestamp)
  {
        struct buffer_page *tail_page;
        struct ring_buffer_event *event;
        unsigned long tail, write;
  
+       /*
+        * If the time delta since the last event is too big to
+        * hold in the time field of the event, then we append a
+        * TIME EXTEND event ahead of the data event.
+        */
+       if (unlikely(add_timestamp))
+               length += RB_LEN_TIME_EXTEND;
        tail_page = cpu_buffer->tail_page;
        write = local_add_return(length, &tail_page->write);
  
        tail = write - length;
  
        /* See if we shot past the end of this buffer page */
-       if (write > BUF_PAGE_SIZE)
+       if (unlikely(write > BUF_PAGE_SIZE))
                return rb_move_tail(cpu_buffer, length, tail,
                                    tail_page, ts);
  
  
        event = __rb_page_index(tail_page, tail);
        kmemcheck_annotate_bitfield(event, bitfield);
-       rb_update_event(event, type, length);
+       rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
  
-       /* The passed in type is zero for DATA */
-       if (likely(!type))
-               local_inc(&tail_page->entries);
+       local_inc(&tail_page->entries);
  
        /*
         * If this is the first commit on the page, then update
         * its timestamp.
         */
        if (!tail)
-               tail_page->page->time_stamp = *ts;
+               tail_page->page->time_stamp = ts;
  
        return event;
  }
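
The reservation in __rb_reserve_next() is lock-free: local_add_return() bumps
the page's write counter by the full event length, the start of the reserved
region falls out by subtracting the length back off, and only a writer that
pushes write past the end of the page takes the rb_move_tail() slow path. A
stand-alone sketch of that arithmetic in C11 atomics; atomic_fetch_add() plus
the length stands in for local_add_return(), and the page size is a
placeholder:

    #include <stdatomic.h>
    #include <stdio.h>

    #define BUF_PAGE_SIZE 4096UL   /* placeholder for the page payload size */

    static atomic_ulong page_write; /* models tail_page->write */

    /* Returns the offset of the reserved region, or -1 when the event
     * crossed the end of the page and the slow path must run instead. */
    static long reserve(unsigned long length)
    {
            /* atomic_fetch_add() returns the old value; adding length back
             * gives the new value, matching local_add_return(). */
            unsigned long write = atomic_fetch_add(&page_write, length) + length;
            unsigned long tail  = write - length;

            if (write > BUF_PAGE_SIZE)
                    return -1;               /* rb_move_tail() territory */
            return (long)tail;
    }

    int main(void)
    {
            printf("%ld\n", reserve(64));    /* first event at offset 0 */
            printf("%ld\n", reserve(64));    /* second event at 64      */
            return 0;
    }
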
@@@ -1977,7 -2043,7 +2043,7 @@@ rb_try_to_discard(struct ring_buffer_pe
        unsigned long addr;
  
        new_index = rb_event_index(event);
-       old_index = new_index + rb_event_length(event);
+       old_index = new_index + rb_event_ts_length(event);
        addr = (unsigned long)event;
        addr &= PAGE_MASK;
  
        return 0;
  }
  
- static int
- rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
-                 u64 *ts, u64 *delta)
- {
-       struct ring_buffer_event *event;
-       int ret;
-       WARN_ONCE(*delta > (1ULL << 59),
-                 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
-                 (unsigned long long)*delta,
-                 (unsigned long long)*ts,
-                 (unsigned long long)cpu_buffer->write_stamp);
-       /*
-        * The delta is too big, we to add a
-        * new timestamp.
-        */
-       event = __rb_reserve_next(cpu_buffer,
-                                 RINGBUF_TYPE_TIME_EXTEND,
-                                 RB_LEN_TIME_EXTEND,
-                                 ts);
-       if (!event)
-               return -EBUSY;
-       if (PTR_ERR(event) == -EAGAIN)
-               return -EAGAIN;
-       /* Only a commited time event can update the write stamp */
-       if (rb_event_is_commit(cpu_buffer, event)) {
-               /*
-                * If this is the first on the page, then it was
-                * updated with the page itself. Try to discard it
-                * and if we can't just make it zero.
-                */
-               if (rb_event_index(event)) {
-                       event->time_delta = *delta & TS_MASK;
-                       event->array[0] = *delta >> TS_SHIFT;
-               } else {
-                       /* try to discard, since we do not need this */
-                       if (!rb_try_to_discard(cpu_buffer, event)) {
-                               /* nope, just zero it */
-                               event->time_delta = 0;
-                               event->array[0] = 0;
-                       }
-               }
-               cpu_buffer->write_stamp = *ts;
-               /* let the caller know this was the commit */
-               ret = 1;
-       } else {
-               /* Try to discard the event */
-               if (!rb_try_to_discard(cpu_buffer, event)) {
-                       /* Darn, this is just wasted space */
-                       event->time_delta = 0;
-                       event->array[0] = 0;
-               }
-               ret = 0;
-       }
-       *delta = 0;
-       return ret;
- }
-
  static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
  {
        local_inc(&cpu_buffer->committing);
        local_inc(&cpu_buffer->commits);
  }
  
- static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
+ static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
  {
        unsigned long commits;
  
@@@ -2110,9 -2113,10 +2113,10 @@@ rb_reserve_next_event(struct ring_buffe
                      unsigned long length)
  {
        struct ring_buffer_event *event;
-       u64 ts, delta = 0;
-       int commit = 0;
+       u64 ts, delta;
        int nr_loops = 0;
+       int add_timestamp;
+       u64 diff;
  
        rb_start_commit(cpu_buffer);
  
  
        length = rb_calculate_event_length(length);
   again:
+       add_timestamp = 0;
+       delta = 0;
        /*
         * We allow for interrupts to reenter here and do a trace.
         * If one does, it will cause this original code to loop
                goto out_fail;
  
        ts = rb_time_stamp(cpu_buffer->buffer);
+       diff = ts - cpu_buffer->write_stamp;
  
-       /*
-        * Only the first commit can update the timestamp.
-        * Yes there is a race here. If an interrupt comes in
-        * just after the conditional and it traces too, then it
-        * will also check the deltas. More than one timestamp may
-        * also be made. But only the entry that did the actual
-        * commit will be something other than zero.
-        */
-       if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
-                  rb_page_write(cpu_buffer->tail_page) ==
-                  rb_commit_index(cpu_buffer))) {
-               u64 diff;
-               diff = ts - cpu_buffer->write_stamp;
-               /* make sure this diff is calculated here */
-               barrier();
-               /* Did the write stamp get updated already? */
-               if (unlikely(ts < cpu_buffer->write_stamp))
-                       goto get_event;
+       /* make sure this diff is calculated here */
+       barrier();
  
+       /* Did the write stamp get updated already? */
+       if (likely(ts >= cpu_buffer->write_stamp)) {
                delta = diff;
                if (unlikely(test_time_stamp(delta))) {
-                       commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
-                       if (commit == -EBUSY)
-                               goto out_fail;
-                       if (commit == -EAGAIN)
-                               goto again;
-                       RB_WARN_ON(cpu_buffer, commit < 0);
+                       WARN_ONCE(delta > (1ULL << 59),
+                                 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
+                                 (unsigned long long)delta,
+                                 (unsigned long long)ts,
+                                 (unsigned long long)cpu_buffer->write_stamp);
+                       add_timestamp = 1;
                }
        }
  
-  get_event:
-       event = __rb_reserve_next(cpu_buffer, 0, length, &ts);
+       event = __rb_reserve_next(cpu_buffer, length, ts,
+                                 delta, add_timestamp);
        if (unlikely(PTR_ERR(event) == -EAGAIN))
                goto again;
  
        if (!event)
                goto out_fail;
  
-       if (!rb_event_is_commit(cpu_buffer, event))
-               delta = 0;
-       event->time_delta = delta;
        return event;
  
   out_fail:
  
  #define TRACE_RECURSIVE_DEPTH 16
  
- static int trace_recursive_lock(void)
+ /* Keep this code out of the fast path cache */
+ static noinline void trace_recursive_fail(void)
  {
-       current->trace_recursion++;
-       if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
-               return 0;
        /* Disable all tracing before we do anything else */
        tracing_off_permanent();
  
                    in_nmi());
  
        WARN_ON_ONCE(1);
+ }
+
+ static inline int trace_recursive_lock(void)
+ {
+       current->trace_recursion++;
+       if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
+               return 0;
+       trace_recursive_fail();
        return -1;
  }
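
A stand-alone sketch of how this guard and the unlock just below are meant to
pair up; the real call sites live in ring_buffer_lock_reserve() and
ring_buffer_unlock_commit(), outside this hunk, and a plain int stands in for
the per-task current->trace_recursion counter:

    #define TRACE_RECURSIVE_DEPTH 16

    static int trace_recursion;   /* per-task in the kernel */

    static int recursive_lock(void)
    {
            /* No decrement on failure: the kernel shuts tracing off
             * permanently at that point, so the count no longer matters. */
            return ++trace_recursion < TRACE_RECURSIVE_DEPTH ? 0 : -1;
    }

    static void recursive_unlock(void)
    {
            trace_recursion--;
    }

    void trace_event_sketch(void)
    {
            if (recursive_lock())
                    return;       /* 16 levels deep: refuse to trace */

            /* ... reserve and write the ring buffer event here ... */

            recursive_unlock();
    }
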
  
- static void trace_recursive_unlock(void)
+ static inline void trace_recursive_unlock(void)
  {
        WARN_ON_ONCE(!current->trace_recursion);
  
@@@ -2308,12 -2298,28 +2298,28 @@@ static voi
  rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
                      struct ring_buffer_event *event)
  {
+       u64 delta;
        /*
         * The first event in the commit queue updates the
         * time stamp.
         */
-       if (rb_event_is_commit(cpu_buffer, event))
-               cpu_buffer->write_stamp += event->time_delta;
+       if (rb_event_is_commit(cpu_buffer, event)) {
+               /*
+                * A commit event that is first on a page
+                * updates the write timestamp with the page stamp
+                */
+               if (!rb_event_index(event))
+                       cpu_buffer->write_stamp =
+                               cpu_buffer->commit_page->page->time_stamp;
+               else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+                       delta = event->array[0];
+                       delta <<= TS_SHIFT;
+                       delta += event->time_delta;
+                       cpu_buffer->write_stamp += delta;
+               } else
+                       cpu_buffer->write_stamp += event->time_delta;
+       }
  }
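
As a worked example of the reconstruction above: with TS_SHIFT = 27, a time
extend carrying array[0] = 2 and time_delta = 5 expands to
delta = (2 << 27) + 5 = 268435461 time-stamp units, which is then added to
write_stamp.
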
  
  static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
@@@ -2353,6 -2359,9 +2359,9 @@@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_co
  
  static inline void rb_event_discard(struct ring_buffer_event *event)
  {
+       if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+               event = skip_time_extend(event);
        /* array[0] holds the actual length for the discarded event */
        event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
        event->type_len = RINGBUF_TYPE_PADDING;
@@@ -2606,19 -2615,6 +2615,19 @@@ void ring_buffer_record_enable_cpu(stru
  }
  EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
  
 +/*
 + * The total entries in the ring buffer is the running counter
 + * of entries entered into the ring buffer, minus the sum of
 + * the entries read from the ring buffer and the number of
 + * entries that were overwritten.
 + */
 +static inline unsigned long
 +rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
 +{
 +      return local_read(&cpu_buffer->entries) -
 +              (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
 +}
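 +
As a worked example: if a CPU buffer has had 1000 entries written, 100 of them
overwritten before being read, and 250 read out, rb_num_of_entries() reports
1000 - (100 + 250) = 650 entries still in the buffer.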
 +
  /**
   * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
   * @buffer: The ring buffer
  unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
  {
        struct ring_buffer_per_cpu *cpu_buffer;
 -      unsigned long ret;
  
        if (!cpumask_test_cpu(cpu, buffer->cpumask))
                return 0;
  
        cpu_buffer = buffer->buffers[cpu];
 -      ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
 -              - cpu_buffer->read;
  
 -      return ret;
 +      return rb_num_of_entries(cpu_buffer);
  }
  EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
  
@@@ -2694,7 -2693,8 +2703,7 @@@ unsigned long ring_buffer_entries(struc
        /* if you care about this being correct, lock the buffer */
        for_each_buffer_cpu(buffer, cpu) {
                cpu_buffer = buffer->buffers[cpu];
 -              entries += (local_read(&cpu_buffer->entries) -
 -                          local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
 +              entries += rb_num_of_entries(cpu_buffer);
        }
  
        return entries;
@@@ -3049,12 -3049,12 +3058,12 @@@ rb_buffer_peek(struct ring_buffer_per_c
  
   again:
        /*
-        * We repeat when a timestamp is encountered. It is possible
-        * to get multiple timestamps from an interrupt entering just
-        * as one timestamp is about to be written, or from discarded
-        * commits. The most that we can have is the number on a single page.
+        * We repeat when a time extend is encountered.
+        * Since the time extend is always attached to a data event,
+        * we should never loop more than once.
+        * (We never hit the following condition more than twice).
         */
-       if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
+       if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
                return NULL;
  
        reader = rb_get_reader_page(cpu_buffer);
@@@ -3130,14 -3130,12 +3139,12 @@@ rb_iter_peek(struct ring_buffer_iter *i
                return NULL;
  
        /*
-        * We repeat when a timestamp is encountered.
-        * We can get multiple timestamps by nested interrupts or also
-        * if filtering is on (discarding commits). Since discarding
-        * commits can be frequent we can get a lot of timestamps.
-        * But we limit them by not adding timestamps if they begin
-        * at the start of a page.
+        * We repeat when a time extend is encountered.
+        * Since the time extend is always attached to a data event,
+        * we should never loop more than once.
+        * (We never hit the following condition more than twice).
         */
-       if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
+       if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
                return NULL;
  
        if (rb_per_cpu_empty(cpu_buffer))
@@@ -3835,7 -3833,8 +3842,8 @@@ int ring_buffer_read_page(struct ring_b
                if (len > (commit - read))
                        len = (commit - read);
  
-               size = rb_event_length(event);
+               /* Always keep the time extend and data together */
+               size = rb_event_ts_length(event);
  
                if (len < size)
                        goto out_unlock;
                                break;
  
                        event = rb_reader_event(cpu_buffer);
-                       size = rb_event_length(event);
+                       /* Always keep the time extend and data together */
+                       size = rb_event_ts_length(event);
                } while (len > size);
  
                /* update bpage */