Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...

author Linus Torvalds <torvalds@linux-foundation.org>

Thu, 21 Oct 2010 19:55:43 +0000 (12:55 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 21 Oct 2010 19:55:43 +0000 (12:55 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 21 Oct 2010 19:55:43 +0000 (12:55 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 21 Oct 2010 19:55:43 +0000 (12:55 -0700)
diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl

index a0d479d1e1dd872bd1ae7b4d17d4582df03a384c..f66f4df186908f5d6ba79171e303949683107a1e 100644 (file)
--- a/Documentation/DocBook/kernel-locking.tmpl
+++ b/Documentation/DocBook/kernel-locking.tmpl
@@ -1645,7 +1645,9 @@ the amount of locking which needs to be done.
        all the readers who were traversing the list when we deleted the
        element are finished.  We use <function>call_rcu()</function> to
        register a callback which will actually destroy the object once
-      the readers are finished.
+      all pre-existing readers are finished.  Alternatively,
+      <function>synchronize_rcu()</function> may be used to block until
+      all pre-existing are finished.
      </para>
      <para>
        But how does Read Copy Update know when the readers are
@@ -1714,7 +1716,7 @@ the amount of locking which needs to be done.
  -        object_put(obj);
  +        list_del_rcu(&amp;obj-&gt;list);
           cache_num--;
-+        call_rcu(&amp;obj-&gt;rcu, cache_delete_rcu, obj);
++        call_rcu(&amp;obj-&gt;rcu, cache_delete_rcu);
   }
  
   /* Must be holding cache_lock */
@@ -1725,14 +1727,6 @@ the amount of locking which needs to be done.
           if (++cache_num > MAX_CACHE_SIZE) {
                   struct object *i, *outcast = NULL;
                   list_for_each_entry(i, &amp;cache, list) {
-@@ -85,6 +94,7 @@
-         obj-&gt;popularity = 0;
-         atomic_set(&amp;obj-&gt;refcnt, 1); /* The cache holds a reference */
-         spin_lock_init(&amp;obj-&gt;lock);
-+        INIT_RCU_HEAD(&amp;obj-&gt;rcu);
-
-         spin_lock_irqsave(&amp;cache_lock, flags);
-         __cache_add(obj);
  @@ -104,12 +114,11 @@
   struct object *cache_find(int id)
   {
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt

index 790d1a8123760211bdcb6427b75c1b4abf2b7210..0c134f8afc6f60b1316b9551577179f1b6dc3961 100644 (file)
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -218,13 +218,22 @@ over a rather long period of time, but improvements are always welcome!
         include:
  
         a.      Keeping a count of the number of data-structure elements
-               used by the RCU-protected data structure, including those
-               waiting for a grace period to elapse.  Enforce a limit
-               on this number, stalling updates as needed to allow
-               previously deferred frees to complete.
-
-               Alternatively, limit only the number awaiting deferred
-               free rather than the total number of elements.
+               used by the RCU-protected data structure, including
+               those waiting for a grace period to elapse.  Enforce a
+               limit on this number, stalling updates as needed to allow
+               previously deferred frees to complete.  Alternatively,
+               limit only the number awaiting deferred free rather than
+               the total number of elements.
+
+               One way to stall the updates is to acquire the update-side
+               mutex.  (Don't try this with a spinlock -- other CPUs
+               spinning on the lock could prevent the grace period
+               from ever ending.)  Another way to stall the updates
+               is for the updates to use a wrapper function around
+               the memory allocator, so that this wrapper function
+               simulates OOM when there is too much memory awaiting an
+               RCU grace period.  There are of course many other
+               variations on this theme.
  
         b.      Limiting update rate.  For example, if updates occur only
                 once per hour, then no explicit rate limiting is required,
@@ -365,3 +374,26 @@ over a rather long period of time, but improvements are always welcome!
         and the compiler to freely reorder code into and out of RCU
         read-side critical sections.  It is the responsibility of the
         RCU update-side primitives to deal with this.
+
+17.    Use CONFIG_PROVE_RCU, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and
+       the __rcu sparse checks to validate your RCU code.  These
+       can help find problems as follows:
+
+       CONFIG_PROVE_RCU: check that accesses to RCU-protected data
+               structures are carried out under the proper RCU
+               read-side critical section, while holding the right
+               combination of locks, or whatever other conditions
+               are appropriate.
+
+       CONFIG_DEBUG_OBJECTS_RCU_HEAD: check that you don't pass the
+               same object to call_rcu() (or friends) before an RCU
+               grace period has elapsed since the last time that you
+               passed that same object to call_rcu() (or friends).
+
+       __rcu sparse checks: tag the pointer to the RCU-protected data
+               structure with __rcu, and sparse will warn you if you
+               access that pointer without the services of one of the
+               variants of rcu_dereference().
+
+       These debugging aids can help you find problems that are
+       otherwise extremely difficult to spot.
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt

index 44c6dcc93d6dad8e9cee2cadb9f49fa992c5eb00..862c08ef1fde4436ddac8010ba4a1aae40328719 100644 (file)
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -80,6 +80,24 @@ o    A CPU looping with bottom halves disabled.  This condition can
  o      For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
         without invoking schedule().
  
+o      A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
+       happen to preempt a low-priority task in the middle of an RCU
+       read-side critical section.   This is especially damaging if
+       that low-priority task is not permitted to run on any other CPU,
+       in which case the next RCU grace period can never complete, which
+       will eventually cause the system to run out of memory and hang.
+       While the system is in the process of running itself out of
+       memory, you might see stall-warning messages.
+
+o      A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
+       is running at a higher priority than the RCU softirq threads.
+       This will prevent RCU callbacks from ever being invoked,
+       and in a CONFIG_TREE_PREEMPT_RCU kernel will further prevent
+       RCU grace periods from ever completing.  Either way, the
+       system will eventually run out of memory and hang.  In the
+       CONFIG_TREE_PREEMPT_RCU case, you might see stall-warning
+       messages.
+
  o      A bug in the RCU implementation.
  
  o      A hardware failure.  This is quite unlikely, but has occurred
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt

index efd8cc95c06b1470db165a74a0d99fd33ee17dea..a851118775d84c7a1d2356ba6a6c8e6208292887 100644 (file)
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -125,6 +125,17 @@ o  "b" is the batch limit for this CPU.  If more than this number
         of RCU callbacks is ready to invoke, then the remainder will
         be deferred.
  
+o      "ci" is the number of RCU callbacks that have been invoked for
+       this CPU.  Note that ci+ql is the number of callbacks that have
+       been registered in absence of CPU-hotplug activity.
+
+o      "co" is the number of RCU callbacks that have been orphaned due to
+       this CPU going offline.
+
+o      "ca" is the number of RCU callbacks that have been adopted due to
+       other CPUs going offline.  Note that ci+co-ca+ql is the number of
+       RCU callbacks registered on this CPU.
+
  There is also an rcu/rcudata.csv file with the same information in
  comma-separated-variable spreadsheet format.
  
@@ -180,7 +191,7 @@ o   "s" is the "signaled" state that drives force_quiescent_state()'s
  
  o      "jfq" is the number of jiffies remaining for this grace period
         before force_quiescent_state() is invoked to help push things
-       along.  Note that CPUs in dyntick-idle mode thoughout the grace
+       along.  Note that CPUs in dyntick-idle mode throughout the grace
         period will not report on their own, but rather must be check by
         some other CPU via force_quiescent_state().
  
diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt

index 1762b81fcdf2ec4a235423687865453a16196aed..741fe66d6eca4852894cac79d6ad2d3162486c7a 100644 (file)
--- a/Documentation/kprobes.txt
+++ b/Documentation/kprobes.txt
@@ -542,9 +542,11 @@ Kprobes does not use mutexes or allocate memory except during
  registration and unregistration.
  
  Probe handlers are run with preemption disabled.  Depending on the
-architecture, handlers may also run with interrupts disabled.  In any
-case, your handler should not yield the CPU (e.g., by attempting to
-acquire a semaphore).
+architecture and optimization state, handlers may also run with
+interrupts disabled (e.g., kretprobe handlers and optimized kprobe
+handlers run without interrupt disabled on x86/x86-64).  In any case,
+your handler should not yield the CPU (e.g., by attempting to acquire
+a semaphore).
  
  Since a return probe is implemented by replacing the return
  address with the trampoline's address, stack backtraces and calls
diff --git a/MAINTAINERS b/MAINTAINERS

index 7679bf32f7bbe382f5f30ed966732184873ce32b..3d4179fbc5263ff4b624753c9ce06004286ce246 100644 (file)
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1527,6 +1527,8 @@ T:        git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
  S:     Supported
  F:     Documentation/filesystems/ceph.txt
  F:     fs/ceph
+F:     net/ceph
+F:     include/linux/ceph
  
  CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM:
  M:     David Vrabel <david.vrabel@csr.com>
@@ -3162,7 +3164,7 @@ F:        drivers/net/ioc3-eth.c
  
  IOC3 SERIAL DRIVER
  M:     Pat Gefre <pfg@sgi.com>
-L:     linux-mips@linux-mips.org
+L:     linux-serial@vger.kernel.org
  S:     Maintained
  F:     drivers/serial/ioc3_serial.c
  
@@ -4805,6 +4807,15 @@ F:       fs/qnx4/
  F:     include/linux/qnx4_fs.h
  F:     include/linux/qnxtypes.h
  
+RADOS BLOCK DEVICE (RBD)
+F:     include/linux/qnxtypes.h
+M:     Yehuda Sadeh <yehuda@hq.newdream.net>
+M:     Sage Weil <sage@newdream.net>
+M:     ceph-devel@vger.kernel.org
+S:     Supported
+F:     drivers/block/rbd.c
+F:     drivers/block/rbd_types.h
+
  RADEON FRAMEBUFFER DISPLAY DRIVER
  M:     Benjamin Herrenschmidt <benh@kernel.crashing.org>
  L:     linux-fbdev@vger.kernel.org
diff --git a/Makefile b/Makefile

index 77b5c6ed0ce5bf64764b97d2a45bd2e7b9fd3b11..d3c10719bbbd5b311c51f9a407184fd30847e6ea 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,8 @@
  VERSION = 2
  PATCHLEVEL = 6
  SUBLEVEL = 36
-EXTRAVERSION = -rc7
-NAME = Sheep on Meth
+EXTRAVERSION =
+NAME = Flesh-Eating Bats with Fangs
  
  # *DOCUMENTATION*
  # To see a list of typical targets execute "make help"
@@ -568,6 +568,12 @@ endif
  
  ifdef CONFIG_FUNCTION_TRACER
  KBUILD_CFLAGS  += -pg
+ifdef CONFIG_DYNAMIC_FTRACE
+       ifdef CONFIG_HAVE_C_RECORDMCOUNT
+               BUILD_C_RECORDMCOUNT := y
+               export BUILD_C_RECORDMCOUNT
+       endif
+endif
  endif
  
  # We trigger additional mismatches with less inlining
@@ -591,6 +597,11 @@ KBUILD_CFLAGS      += $(call cc-option,-fno-strict-overflow)
  # conserve stack if available
  KBUILD_CFLAGS   += $(call cc-option,-fconserve-stack)
  
+# check for 'asm goto'
+ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC)), y)
+       KBUILD_CFLAGS += -DCC_HAVE_ASM_GOTO
+endif
+
  # Add user supplied CPPFLAGS, AFLAGS and CFLAGS as the last assignments
  # But warn user when we do so
  warn-assign = \
diff --git a/arch/Kconfig b/arch/Kconfig

index fe48fc7a3ebaf85c9bf8c56391bf20d9c9b90bbf..53d7f619a1b9b46643c59cd3c7d4ace354148c20 100644 (file)
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -158,4 +158,7 @@ config HAVE_PERF_EVENTS_NMI
           subsystem.  Also has support for calculating CPU cycle events
           to determine how many clock cycles in a given period.
  
+config HAVE_ARCH_JUMP_LABEL
+       bool
+
  source "kernel/gcov/Kconfig"
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig

index b9647bb66d1388d9c13503748252c2a1df4ac830..d04ccd73af45f6487f442a0dba26921db884e218 100644 (file)
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -9,6 +9,7 @@ config ALPHA
         select HAVE_IDE
         select HAVE_OPROFILE
         select HAVE_SYSCALL_WRAPPERS
+       select HAVE_IRQ_WORK
         select HAVE_PERF_EVENTS
         select HAVE_DMA_ATTRS
         help
diff --git a/arch/alpha/include/asm/perf_event.h b/arch/alpha/include/asm/perf_event.h

index 4157cd3c44a96d2811410b334a35f11923ba6fbf..fe792ca818f64c4d9954e8b62e5f8064a5d5777e 100644 (file)
--- a/arch/alpha/include/asm/perf_event.h
+++ b/arch/alpha/include/asm/perf_event.h
@@ -1,11 +1,6 @@
  #ifndef __ASM_ALPHA_PERF_EVENT_H
  #define __ASM_ALPHA_PERF_EVENT_H
  
-/* Alpha only supports software events through this interface. */
-extern void set_perf_event_pending(void);
-
-#define PERF_EVENT_INDEX_OFFSET 0
-
  #ifdef CONFIG_PERF_EVENTS
  extern void init_hw_perf_events(void);
  #else
diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c

index 85d8e4f58c83ce612269162b9635bd49059c39dd..1cc49683fb69b2a5f96639e71a2f1af821479e77 100644 (file)
--- a/arch/alpha/kernel/perf_event.c
+++ b/arch/alpha/kernel/perf_event.c
@@ -307,7 +307,7 @@ again:
                              new_raw_count) != prev_raw_count)
                 goto again;
  
-       delta = (new_raw_count  - (prev_raw_count & alpha_pmu->pmc_count_mask[idx])) + ovf;
+       delta = (new_raw_count - (prev_raw_count & alpha_pmu->pmc_count_mask[idx])) + ovf;
  
         /* It is possible on very rare occasions that the PMC has overflowed
          * but the interrupt is yet to come.  Detect and fix this situation.
@@ -402,14 +402,13 @@ static void maybe_change_configuration(struct cpu_hw_events *cpuc)
                 struct hw_perf_event *hwc = &pe->hw;
                 int idx = hwc->idx;
  
-               if (cpuc->current_idx[j] != PMC_NO_INDEX) {
-                       cpuc->idx_mask |= (1<<cpuc->current_idx[j]);
-                       continue;
+               if (cpuc->current_idx[j] == PMC_NO_INDEX) {
+                       alpha_perf_event_set_period(pe, hwc, idx);
+                       cpuc->current_idx[j] = idx;
                 }
  
-               alpha_perf_event_set_period(pe, hwc, idx);
-               cpuc->current_idx[j] = idx;
-               cpuc->idx_mask |= (1<<cpuc->current_idx[j]);
+               if (!(hwc->state & PERF_HES_STOPPED))
+                       cpuc->idx_mask |= (1<<cpuc->current_idx[j]);
         }
         cpuc->config = cpuc->event[0]->hw.config_base;
  }
@@ -420,12 +419,13 @@ static void maybe_change_configuration(struct cpu_hw_events *cpuc)
   *  - this function is called from outside this module via the pmu struct
   *    returned from perf event initialisation.
   */
-static int alpha_pmu_enable(struct perf_event *event)
+static int alpha_pmu_add(struct perf_event *event, int flags)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
         int n0;
         int ret;
-       unsigned long flags;
+       unsigned long irq_flags;
  
         /*
          * The Sparc code has the IRQ disable first followed by the perf
@@ -435,8 +435,8 @@ static int alpha_pmu_enable(struct perf_event *event)
          * nevertheless we disable the PMCs first to enable a potential
          * final PMI to occur before we disable interrupts.
          */
-       perf_disable();
-       local_irq_save(flags);
+       perf_pmu_disable(event->pmu);
+       local_irq_save(irq_flags);
  
         /* Default to error to be returned */
         ret = -EAGAIN;
@@ -455,8 +455,12 @@ static int alpha_pmu_enable(struct perf_event *event)
                 }
         }
  
-       local_irq_restore(flags);
-       perf_enable();
+       hwc->state = PERF_HES_UPTODATE;
+       if (!(flags & PERF_EF_START))
+               hwc->state |= PERF_HES_STOPPED;
+
+       local_irq_restore(irq_flags);
+       perf_pmu_enable(event->pmu);
  
         return ret;
  }
@@ -467,15 +471,15 @@ static int alpha_pmu_enable(struct perf_event *event)
   *  - this function is called from outside this module via the pmu struct
   *    returned from perf event initialisation.
   */
-static void alpha_pmu_disable(struct perf_event *event)
+static void alpha_pmu_del(struct perf_event *event, int flags)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         struct hw_perf_event *hwc = &event->hw;
-       unsigned long flags;
+       unsigned long irq_flags;
         int j;
  
-       perf_disable();
-       local_irq_save(flags);
+       perf_pmu_disable(event->pmu);
+       local_irq_save(irq_flags);
  
         for (j = 0; j < cpuc->n_events; j++) {
                 if (event == cpuc->event[j]) {
@@ -501,8 +505,8 @@ static void alpha_pmu_disable(struct perf_event *event)
                 }
         }
  
-       local_irq_restore(flags);
-       perf_enable();
+       local_irq_restore(irq_flags);
+       perf_pmu_enable(event->pmu);
  }
  
  
@@ -514,13 +518,44 @@ static void alpha_pmu_read(struct perf_event *event)
  }
  
  
-static void alpha_pmu_unthrottle(struct perf_event *event)
+static void alpha_pmu_stop(struct perf_event *event, int flags)
+{
+       struct hw_perf_event *hwc = &event->hw;
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+       if (!(hwc->state & PERF_HES_STOPPED)) {
+               cpuc->idx_mask &= ~(1UL<<hwc->idx);
+               hwc->state |= PERF_HES_STOPPED;
+       }
+
+       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+               alpha_perf_event_update(event, hwc, hwc->idx, 0);
+               hwc->state |= PERF_HES_UPTODATE;
+       }
+
+       if (cpuc->enabled)
+               wrperfmon(PERFMON_CMD_DISABLE, (1UL<<hwc->idx));
+}
+
+
+static void alpha_pmu_start(struct perf_event *event, int flags)
  {
         struct hw_perf_event *hwc = &event->hw;
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
  
+       if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+               return;
+
+       if (flags & PERF_EF_RELOAD) {
+               WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+               alpha_perf_event_set_period(event, hwc, hwc->idx);
+       }
+
+       hwc->state = 0;
+
         cpuc->idx_mask |= 1UL<<hwc->idx;
-       wrperfmon(PERFMON_CMD_ENABLE, (1UL<<hwc->idx));
+       if (cpuc->enabled)
+               wrperfmon(PERFMON_CMD_ENABLE, (1UL<<hwc->idx));
  }
  
  
@@ -642,39 +677,36 @@ static int __hw_perf_event_init(struct perf_event *event)
         return 0;
  }
  
-static const struct pmu pmu = {
-       .enable         = alpha_pmu_enable,
-       .disable        = alpha_pmu_disable,
-       .read           = alpha_pmu_read,
-       .unthrottle     = alpha_pmu_unthrottle,
-};
-
-
  /*
   * Main entry point to initialise a HW performance event.
   */
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+static int alpha_pmu_event_init(struct perf_event *event)
  {
         int err;
  
+       switch (event->attr.type) {
+       case PERF_TYPE_RAW:
+       case PERF_TYPE_HARDWARE:
+       case PERF_TYPE_HW_CACHE:
+               break;
+
+       default:
+               return -ENOENT;
+       }
+
         if (!alpha_pmu)
-               return ERR_PTR(-ENODEV);
+               return -ENODEV;
  
         /* Do the real initialisation work. */
         err = __hw_perf_event_init(event);
  
-       if (err)
-               return ERR_PTR(err);
-
-       return &pmu;
+       return err;
  }
  
-
-
  /*
   * Main entry point - enable HW performance counters.
   */
-void hw_perf_enable(void)
+static void alpha_pmu_enable(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
  
@@ -700,7 +732,7 @@ void hw_perf_enable(void)
   * Main entry point - disable HW performance counters.
   */
  
-void hw_perf_disable(void)
+static void alpha_pmu_disable(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
  
@@ -713,6 +745,17 @@ void hw_perf_disable(void)
         wrperfmon(PERFMON_CMD_DISABLE, cpuc->idx_mask);
  }
  
+static struct pmu pmu = {
+       .pmu_enable     = alpha_pmu_enable,
+       .pmu_disable    = alpha_pmu_disable,
+       .event_init     = alpha_pmu_event_init,
+       .add            = alpha_pmu_add,
+       .del            = alpha_pmu_del,
+       .start          = alpha_pmu_start,
+       .stop           = alpha_pmu_stop,
+       .read           = alpha_pmu_read,
+};
+
  
  /*
   * Main entry point - don't know when this is called but it
@@ -766,7 +809,7 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr,
         wrperfmon(PERFMON_CMD_DISABLE, cpuc->idx_mask);
  
         /* la_ptr is the counter that overflowed. */
-       if (unlikely(la_ptr >= perf_max_events)) {
+       if (unlikely(la_ptr >= alpha_pmu->num_pmcs)) {
                 /* This should never occur! */
                 irq_err_count++;
                 pr_warning("PMI: silly index %ld\n", la_ptr);
@@ -807,7 +850,7 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr,
                         /* Interrupts coming too quickly; "throttle" the
                          * counter, i.e., disable it for a little while.
                          */
-                       cpuc->idx_mask &= ~(1UL<<idx);
+                       alpha_pmu_stop(event, 0);
                 }
         }
         wrperfmon(PERFMON_CMD_ENABLE, cpuc->idx_mask);
@@ -837,6 +880,7 @@ void __init init_hw_perf_events(void)
  
         /* And set up PMU specification */
         alpha_pmu = &ev67_pmu;
-       perf_max_events = alpha_pmu->num_pmcs;
+
+       perf_pmu_register(&pmu);
  }
  
diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c

index 396af1799ea44e3a80bd758226f31f0c3bced799..0f1d8493cfca9c498a5187b97ae7c9dff8195b71 100644 (file)
--- a/arch/alpha/kernel/time.c
+++ b/arch/alpha/kernel/time.c
@@ -41,7 +41,7 @@
  #include <linux/init.h>
  #include <linux/bcd.h>
  #include <linux/profile.h>
-#include <linux/perf_event.h>
+#include <linux/irq_work.h>
  
  #include <asm/uaccess.h>
  #include <asm/io.h>
@@ -83,25 +83,25 @@ static struct {
  
  unsigned long est_cycle_freq;
  
-#ifdef CONFIG_PERF_EVENTS
+#ifdef CONFIG_IRQ_WORK
  
-DEFINE_PER_CPU(u8, perf_event_pending);
+DEFINE_PER_CPU(u8, irq_work_pending);
  
-#define set_perf_event_pending_flag()  __get_cpu_var(perf_event_pending) = 1
-#define test_perf_event_pending()      __get_cpu_var(perf_event_pending)
-#define clear_perf_event_pending()     __get_cpu_var(perf_event_pending) = 0
+#define set_irq_work_pending_flag()  __get_cpu_var(irq_work_pending) = 1
+#define test_irq_work_pending()      __get_cpu_var(irq_work_pending)
+#define clear_irq_work_pending()     __get_cpu_var(irq_work_pending) = 0
  
-void set_perf_event_pending(void)
+void set_irq_work_pending(void)
  {
-       set_perf_event_pending_flag();
+       set_irq_work_pending_flag();
  }
  
-#else  /* CONFIG_PERF_EVENTS */
+#else  /* CONFIG_IRQ_WORK */
  
-#define test_perf_event_pending()      0
-#define clear_perf_event_pending()
+#define test_irq_work_pending()      0
+#define clear_irq_work_pending()
  
-#endif /* CONFIG_PERF_EVENTS */
+#endif /* CONFIG_IRQ_WORK */
  
  
  static inline __u32 rpcc(void)
@@ -191,9 +191,9 @@ irqreturn_t timer_interrupt(int irq, void *dev)
  
         write_sequnlock(&xtime_lock);
  
-       if (test_perf_event_pending()) {
-               clear_perf_event_pending();
-               perf_event_do_pending();
+       if (test_irq_work_pending()) {
+               clear_irq_work_pending();
+               irq_work_run();
         }
  
  #ifndef CONFIG_SMP
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig

index 9c26ba7244fb450b0c73f15ca2565336033e152b..9103904b3daba63e73815bb2980923bdc3cb045d 100644 (file)
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -23,6 +23,7 @@ config ARM
         select HAVE_KERNEL_GZIP
         select HAVE_KERNEL_LZO
         select HAVE_KERNEL_LZMA
+       select HAVE_IRQ_WORK
         select HAVE_PERF_EVENTS
         select PERF_USE_VMALLOC
         select HAVE_REGS_AND_STACK_ACCESS_API
diff --git a/arch/arm/include/asm/perf_event.h b/arch/arm/include/asm/perf_event.h

index b5799a3b7117d5480eec6456008f79e01d1064ae..c4aa4e8c6af9cda0b7e88a46cbed94dc6e3b565d 100644 (file)
--- a/arch/arm/include/asm/perf_event.h
+++ b/arch/arm/include/asm/perf_event.h
@@ -12,18 +12,6 @@
  #ifndef __ARM_PERF_EVENT_H__
  #define __ARM_PERF_EVENT_H__
  
-/*
- * NOP: on *most* (read: all supported) ARM platforms, the performance
- * counter interrupts are regular interrupts and not an NMI. This
- * means that when we receive the interrupt we can call
- * perf_event_do_pending() that handles all of the work with
- * interrupts disabled.
- */
-static inline void
-set_perf_event_pending(void)
-{
-}
-
  /* ARM performance counters start from 1 (in the cp15 accesses) so use the
   * same indexes here for consistency. */
  #define PERF_EVENT_INDEX_OFFSET 1
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c

index ecbb0288e5dd95c80b420635dffee6ef600f86a8..49643b1467e62d529d4edd661990bb64da1e1f73 100644 (file)
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -123,6 +123,12 @@ armpmu_get_max_events(void)
  }
  EXPORT_SYMBOL_GPL(armpmu_get_max_events);
  
+int perf_num_counters(void)
+{
+       return armpmu_get_max_events();
+}
+EXPORT_SYMBOL_GPL(perf_num_counters);
+
  #define HW_OP_UNSUPPORTED              0xFFFF
  
  #define C(_x) \
@@ -221,46 +227,56 @@ again:
  }
  
  static void
-armpmu_disable(struct perf_event *event)
+armpmu_read(struct perf_event *event)
  {
-       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         struct hw_perf_event *hwc = &event->hw;
-       int idx = hwc->idx;
-
-       WARN_ON(idx < 0);
-
-       clear_bit(idx, cpuc->active_mask);
-       armpmu->disable(hwc, idx);
-
-       barrier();
  
-       armpmu_event_update(event, hwc, idx);
-       cpuc->events[idx] = NULL;
-       clear_bit(idx, cpuc->used_mask);
+       /* Don't read disabled counters! */
+       if (hwc->idx < 0)
+               return;
  
-       perf_event_update_userpage(event);
+       armpmu_event_update(event, hwc, hwc->idx);
  }
  
  static void
-armpmu_read(struct perf_event *event)
+armpmu_stop(struct perf_event *event, int flags)
  {
         struct hw_perf_event *hwc = &event->hw;
  
-       /* Don't read disabled counters! */
-       if (hwc->idx < 0)
+       if (!armpmu)
                 return;
  
-       armpmu_event_update(event, hwc, hwc->idx);
+       /*
+        * ARM pmu always has to update the counter, so ignore
+        * PERF_EF_UPDATE, see comments in armpmu_start().
+        */
+       if (!(hwc->state & PERF_HES_STOPPED)) {
+               armpmu->disable(hwc, hwc->idx);
+               barrier(); /* why? */
+               armpmu_event_update(event, hwc, hwc->idx);
+               hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+       }
  }
  
  static void
-armpmu_unthrottle(struct perf_event *event)
+armpmu_start(struct perf_event *event, int flags)
  {
         struct hw_perf_event *hwc = &event->hw;
  
+       if (!armpmu)
+               return;
+
+       /*
+        * ARM pmu always has to reprogram the period, so ignore
+        * PERF_EF_RELOAD, see the comment below.
+        */
+       if (flags & PERF_EF_RELOAD)
+               WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+
+       hwc->state = 0;
         /*
          * Set the period again. Some counters can't be stopped, so when we
-        * were throttled we simply disabled the IRQ source and the counter
+        * were stopped we simply disabled the IRQ source and the counter
          * may have been left counting. If we don't do this step then we may
          * get an interrupt too soon or *way* too late if the overflow has
          * happened since disabling.
@@ -269,14 +285,33 @@ armpmu_unthrottle(struct perf_event *event)
         armpmu->enable(hwc, hwc->idx);
  }
  
+static void
+armpmu_del(struct perf_event *event, int flags)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+       int idx = hwc->idx;
+
+       WARN_ON(idx < 0);
+
+       clear_bit(idx, cpuc->active_mask);
+       armpmu_stop(event, PERF_EF_UPDATE);
+       cpuc->events[idx] = NULL;
+       clear_bit(idx, cpuc->used_mask);
+
+       perf_event_update_userpage(event);
+}
+
  static int
-armpmu_enable(struct perf_event *event)
+armpmu_add(struct perf_event *event, int flags)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         struct hw_perf_event *hwc = &event->hw;
         int idx;
         int err = 0;
  
+       perf_pmu_disable(event->pmu);
+
         /* If we don't have a space for the counter then finish early. */
         idx = armpmu->get_event_idx(cpuc, hwc);
         if (idx < 0) {
@@ -293,25 +328,19 @@ armpmu_enable(struct perf_event *event)
         cpuc->events[idx] = event;
         set_bit(idx, cpuc->active_mask);
  
-       /* Set the period for the event. */
-       armpmu_event_set_period(event, hwc, idx);
-
-       /* Enable the event. */
-       armpmu->enable(hwc, idx);
+       hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+       if (flags & PERF_EF_START)
+               armpmu_start(event, PERF_EF_RELOAD);
  
         /* Propagate our changes to the userspace mapping. */
         perf_event_update_userpage(event);
  
  out:
+       perf_pmu_enable(event->pmu);
         return err;
  }
  
-static struct pmu pmu = {
-       .enable     = armpmu_enable,
-       .disable    = armpmu_disable,
-       .unthrottle = armpmu_unthrottle,
-       .read       = armpmu_read,
-};
+static struct pmu pmu;
  
  static int
  validate_event(struct cpu_hw_events *cpuc,
@@ -491,20 +520,29 @@ __hw_perf_event_init(struct perf_event *event)
         return err;
  }
  
-const struct pmu *
-hw_perf_event_init(struct perf_event *event)
+static int armpmu_event_init(struct perf_event *event)
  {
         int err = 0;
  
+       switch (event->attr.type) {
+       case PERF_TYPE_RAW:
+       case PERF_TYPE_HARDWARE:
+       case PERF_TYPE_HW_CACHE:
+               break;
+
+       default:
+               return -ENOENT;
+       }
+
         if (!armpmu)
-               return ERR_PTR(-ENODEV);
+               return -ENODEV;
  
         event->destroy = hw_perf_event_destroy;
  
         if (!atomic_inc_not_zero(&active_events)) {
-               if (atomic_read(&active_events) > perf_max_events) {
+               if (atomic_read(&active_events) > armpmu->num_events) {
                         atomic_dec(&active_events);
-                       return ERR_PTR(-ENOSPC);
+                       return -ENOSPC;
                 }
  
                 mutex_lock(&pmu_reserve_mutex);
@@ -518,17 +556,16 @@ hw_perf_event_init(struct perf_event *event)
         }
  
         if (err)
-               return ERR_PTR(err);
+               return err;
  
         err = __hw_perf_event_init(event);
         if (err)
                 hw_perf_event_destroy(event);
  
-       return err ? ERR_PTR(err) : &pmu;
+       return err;
  }
  
-void
-hw_perf_enable(void)
+static void armpmu_enable(struct pmu *pmu)
  {
         /* Enable all of the perf events on hardware. */
         int idx;
@@ -549,13 +586,23 @@ hw_perf_enable(void)
         armpmu->start();
  }
  
-void
-hw_perf_disable(void)
+static void armpmu_disable(struct pmu *pmu)
  {
         if (armpmu)
                 armpmu->stop();
  }
  
+static struct pmu pmu = {
+       .pmu_enable     = armpmu_enable,
+       .pmu_disable    = armpmu_disable,
+       .event_init     = armpmu_event_init,
+       .add            = armpmu_add,
+       .del            = armpmu_del,
+       .start          = armpmu_start,
+       .stop           = armpmu_stop,
+       .read           = armpmu_read,
+};
+
  /*
   * ARMv6 Performance counter handling code.
   *
@@ -1045,7 +1092,7 @@ armv6pmu_handle_irq(int irq_num,
          * platforms that can have the PMU interrupts raised as an NMI, this
          * will not work.
          */
-       perf_event_do_pending();
+       irq_work_run();
  
         return IRQ_HANDLED;
  }
@@ -2021,7 +2068,7 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev)
          * platforms that can have the PMU interrupts raised as an NMI, this
          * will not work.
          */
-       perf_event_do_pending();
+       irq_work_run();
  
         return IRQ_HANDLED;
  }
@@ -2389,7 +2436,7 @@ xscale1pmu_handle_irq(int irq_num, void *dev)
                         armpmu->disable(hwc, idx);
         }
  
-       perf_event_do_pending();
+       irq_work_run();
  
         /*
          * Re-enable the PMU.
@@ -2716,7 +2763,7 @@ xscale2pmu_handle_irq(int irq_num, void *dev)
                         armpmu->disable(hwc, idx);
         }
  
-       perf_event_do_pending();
+       irq_work_run();
  
         /*
          * Re-enable the PMU.
@@ -2933,14 +2980,12 @@ init_hw_perf_events(void)
                         armpmu = &armv6pmu;
                         memcpy(armpmu_perf_cache_map, armv6_perf_cache_map,
                                         sizeof(armv6_perf_cache_map));
-                       perf_max_events = armv6pmu.num_events;
                         break;
                 case 0xB020:    /* ARM11mpcore */
                         armpmu = &armv6mpcore_pmu;
                         memcpy(armpmu_perf_cache_map,
                                armv6mpcore_perf_cache_map,
                                sizeof(armv6mpcore_perf_cache_map));
-                       perf_max_events = armv6mpcore_pmu.num_events;
                         break;
                 case 0xC080:    /* Cortex-A8 */
                         armv7pmu.id = ARM_PERF_PMU_ID_CA8;
@@ -2952,7 +2997,6 @@ init_hw_perf_events(void)
                         /* Reset PMNC and read the nb of CNTx counters
                             supported */
                         armv7pmu.num_events = armv7_reset_read_pmnc();
-                       perf_max_events = armv7pmu.num_events;
                         break;
                 case 0xC090:    /* Cortex-A9 */
                         armv7pmu.id = ARM_PERF_PMU_ID_CA9;
@@ -2964,7 +3008,6 @@ init_hw_perf_events(void)
                         /* Reset PMNC and read the nb of CNTx counters
                             supported */
                         armv7pmu.num_events = armv7_reset_read_pmnc();
-                       perf_max_events = armv7pmu.num_events;
                         break;
                 }
         /* Intel CPUs [xscale]. */
@@ -2975,13 +3018,11 @@ init_hw_perf_events(void)
                         armpmu = &xscale1pmu;
                         memcpy(armpmu_perf_cache_map, xscale_perf_cache_map,
                                         sizeof(xscale_perf_cache_map));
-                       perf_max_events = xscale1pmu.num_events;
                         break;
                 case 2:
                         armpmu = &xscale2pmu;
                         memcpy(armpmu_perf_cache_map, xscale_perf_cache_map,
                                         sizeof(xscale_perf_cache_map));
-                       perf_max_events = xscale2pmu.num_events;
                         break;
                 }
         }
@@ -2991,9 +3032,10 @@ init_hw_perf_events(void)
                                 arm_pmu_names[armpmu->id], armpmu->num_events);
         } else {
                 pr_info("no hardware support available\n");
-               perf_max_events = -1;
         }
  
+       perf_pmu_register(&pmu);
+
         return 0;
  }
  arch_initcall(init_hw_perf_events);
@@ -3001,13 +3043,6 @@ arch_initcall(init_hw_perf_events);
  /*
   * Callchain handling code.
   */
-static inline void
-callchain_store(struct perf_callchain_entry *entry,
-               u64 ip)
-{
-       if (entry->nr < PERF_MAX_STACK_DEPTH)
-               entry->ip[entry->nr++] = ip;
-}
  
  /*
   * The registers we're interested in are at the end of the variable
@@ -3039,7 +3074,7 @@ user_backtrace(struct frame_tail *tail,
         if (__copy_from_user_inatomic(&buftail, tail, sizeof(buftail)))
                 return NULL;
  
-       callchain_store(entry, buftail.lr);
+       perf_callchain_store(entry, buftail.lr);
  
         /*
          * Frame pointers should strictly progress back up the stack
@@ -3051,16 +3086,11 @@ user_backtrace(struct frame_tail *tail,
         return buftail.fp - 1;
  }
  
-static void
-perf_callchain_user(struct pt_regs *regs,
-                   struct perf_callchain_entry *entry)
+void
+perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
  {
         struct frame_tail *tail;
  
-       callchain_store(entry, PERF_CONTEXT_USER);
-
-       if (!user_mode(regs))
-               regs = task_pt_regs(current);
  
         tail = (struct frame_tail *)regs->ARM_fp - 1;
  
@@ -3078,56 +3108,18 @@ callchain_trace(struct stackframe *fr,
                 void *data)
  {
         struct perf_callchain_entry *entry = data;
-       callchain_store(entry, fr->pc);
+       perf_callchain_store(entry, fr->pc);
         return 0;
  }
  
-static void
-perf_callchain_kernel(struct pt_regs *regs,
-                     struct perf_callchain_entry *entry)
+void
+perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
  {
         struct stackframe fr;
  
-       callchain_store(entry, PERF_CONTEXT_KERNEL);
         fr.fp = regs->ARM_fp;
         fr.sp = regs->ARM_sp;
         fr.lr = regs->ARM_lr;
         fr.pc = regs->ARM_pc;
         walk_stackframe(&fr, callchain_trace, entry);
  }
-
-static void
-perf_do_callchain(struct pt_regs *regs,
-                 struct perf_callchain_entry *entry)
-{
-       int is_user;
-
-       if (!regs)
-               return;
-
-       is_user = user_mode(regs);
-
-       if (!current || !current->pid)
-               return;
-
-       if (is_user && current->state != TASK_RUNNING)
-               return;
-
-       if (!is_user)
-               perf_callchain_kernel(regs, entry);
-
-       if (current->mm)
-               perf_callchain_user(regs, entry);
-}
-
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
-
-struct perf_callchain_entry *
-perf_callchain(struct pt_regs *regs)
-{
-       struct perf_callchain_entry *entry = &__get_cpu_var(pmc_irq_entry);
-
-       entry->nr = 0;
-       perf_do_callchain(regs, entry);
-       return entry;
-}
diff --git a/arch/arm/mach-bcmring/dma.c b/arch/arm/mach-bcmring/dma.c

index 29c0a911df262f6d70b5c369afa4788c44f0c07a..77eb35c89cd02161ee1678b049b6d80490423c09 100644 (file)
--- a/arch/arm/mach-bcmring/dma.c
+++ b/arch/arm/mach-bcmring/dma.c
@@ -691,7 +691,7 @@ int dma_init(void)
  
         memset(&gDMA, 0, sizeof(gDMA));
  
-       init_MUTEX_LOCKED(&gDMA.lock);
+       sema_init(&gDMA.lock, 0);
         init_waitqueue_head(&gDMA.freeChannelQ);
  
         /* Initialize the Hardware */
@@ -1574,7 +1574,7 @@ int dma_init_mem_map(DMA_MemMap_t *memMap)
  {
         memset(memMap, 0, sizeof(*memMap));
  
-       init_MUTEX(&memMap->lock);
+       sema_init(&memMap->lock, 1);
  
         return 0;
  }
diff --git a/arch/arm/oprofile/Makefile b/arch/arm/oprofile/Makefile

index e666eafed15295aa5feeedf60d374a704b2caa15..b2215c61cdf02a18df71849611007d97cfea4470 100644 (file)
--- a/arch/arm/oprofile/Makefile
+++ b/arch/arm/oprofile/Makefile
@@ -6,4 +6,8 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
                 oprofilefs.o oprofile_stats.o \
                 timer_int.o )
  
+ifeq ($(CONFIG_HW_PERF_EVENTS),y)
+DRIVER_OBJS += $(addprefix ../../../drivers/oprofile/, oprofile_perf.o)
+endif
+
  oprofile-y                             := $(DRIVER_OBJS) common.o
diff --git a/arch/arm/oprofile/common.c b/arch/arm/oprofile/common.c

index 72e09eb642dd7c7a00d47fce210bb1290e926fd9..8aa974491dfcd555a6962461c0d2aa5f3287b653 100644 (file)
--- a/arch/arm/oprofile/common.c
+++ b/arch/arm/oprofile/common.c
@@ -25,139 +25,10 @@
  #include <asm/ptrace.h>
  
  #ifdef CONFIG_HW_PERF_EVENTS
-/*
- * Per performance monitor configuration as set via oprofilefs.
- */
-struct op_counter_config {
-       unsigned long count;
-       unsigned long enabled;
-       unsigned long event;
-       unsigned long unit_mask;
-       unsigned long kernel;
-       unsigned long user;
-       struct perf_event_attr attr;
-};
-
-static int op_arm_enabled;
-static DEFINE_MUTEX(op_arm_mutex);
-
-static struct op_counter_config *counter_config;
-static struct perf_event **perf_events[nr_cpumask_bits];
-static int perf_num_counters;
-
-/*
- * Overflow callback for oprofile.
- */
-static void op_overflow_handler(struct perf_event *event, int unused,
-                       struct perf_sample_data *data, struct pt_regs *regs)
+char *op_name_from_perf_id(void)
  {
-       int id;
-       u32 cpu = smp_processor_id();
-
-       for (id = 0; id < perf_num_counters; ++id)
-               if (perf_events[cpu][id] == event)
-                       break;
-
-       if (id != perf_num_counters)
-               oprofile_add_sample(regs, id);
-       else
-               pr_warning("oprofile: ignoring spurious overflow "
-                               "on cpu %u\n", cpu);
-}
-
-/*
- * Called by op_arm_setup to create perf attributes to mirror the oprofile
- * settings in counter_config. Attributes are created as `pinned' events and
- * so are permanently scheduled on the PMU.
- */
-static void op_perf_setup(void)
-{
-       int i;
-       u32 size = sizeof(struct perf_event_attr);
-       struct perf_event_attr *attr;
-
-       for (i = 0; i < perf_num_counters; ++i) {
-               attr = &counter_config[i].attr;
-               memset(attr, 0, size);
-               attr->type              = PERF_TYPE_RAW;
-               attr->size              = size;
-               attr->config            = counter_config[i].event;
-               attr->sample_period     = counter_config[i].count;
-               attr->pinned            = 1;
-       }
-}
-
-static int op_create_counter(int cpu, int event)
-{
-       int ret = 0;
-       struct perf_event *pevent;
-
-       if (!counter_config[event].enabled || (perf_events[cpu][event] != NULL))
-               return ret;
-
-       pevent = perf_event_create_kernel_counter(&counter_config[event].attr,
-                                                 cpu, -1,
-                                                 op_overflow_handler);
-
-       if (IS_ERR(pevent)) {
-               ret = PTR_ERR(pevent);
-       } else if (pevent->state != PERF_EVENT_STATE_ACTIVE) {
-               perf_event_release_kernel(pevent);
-               pr_warning("oprofile: failed to enable event %d "
-                               "on CPU %d\n", event, cpu);
-               ret = -EBUSY;
-       } else {
-               perf_events[cpu][event] = pevent;
-       }
-
-       return ret;
-}
+       enum arm_perf_pmu_ids id = armpmu_get_pmu_id();
  
-static void op_destroy_counter(int cpu, int event)
-{
-       struct perf_event *pevent = perf_events[cpu][event];
-
-       if (pevent) {
-               perf_event_release_kernel(pevent);
-               perf_events[cpu][event] = NULL;
-       }
-}
-
-/*
- * Called by op_arm_start to create active perf events based on the
- * perviously configured attributes.
- */
-static int op_perf_start(void)
-{
-       int cpu, event, ret = 0;
-
-       for_each_online_cpu(cpu) {
-               for (event = 0; event < perf_num_counters; ++event) {
-                       ret = op_create_counter(cpu, event);
-                       if (ret)
-                               goto out;
-               }
-       }
-
-out:
-       return ret;
-}
-
-/*
- * Called by op_arm_stop at the end of a profiling run.
- */
-static void op_perf_stop(void)
-{
-       int cpu, event;
-
-       for_each_online_cpu(cpu)
-               for (event = 0; event < perf_num_counters; ++event)
-                       op_destroy_counter(cpu, event);
-}
-
-
-static char *op_name_from_perf_id(enum arm_perf_pmu_ids id)
-{
         switch (id) {
         case ARM_PERF_PMU_ID_XSCALE1:
                 return "arm/xscale1";
@@ -176,116 +47,6 @@ static char *op_name_from_perf_id(enum arm_perf_pmu_ids id)
         }
  }
  
-static int op_arm_create_files(struct super_block *sb, struct dentry *root)
-{
-       unsigned int i;
-
-       for (i = 0; i < perf_num_counters; i++) {
-               struct dentry *dir;
-               char buf[4];
-
-               snprintf(buf, sizeof buf, "%d", i);
-               dir = oprofilefs_mkdir(sb, root, buf);
-               oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled);
-               oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event);
-               oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count);
-               oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask);
-               oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel);
-               oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user);
-       }
-
-       return 0;
-}
-
-static int op_arm_setup(void)
-{
-       spin_lock(&oprofilefs_lock);
-       op_perf_setup();
-       spin_unlock(&oprofilefs_lock);
-       return 0;
-}
-
-static int op_arm_start(void)
-{
-       int ret = -EBUSY;
-
-       mutex_lock(&op_arm_mutex);
-       if (!op_arm_enabled) {
-               ret = 0;
-               op_perf_start();
-               op_arm_enabled = 1;
-       }
-       mutex_unlock(&op_arm_mutex);
-       return ret;
-}
-
-static void op_arm_stop(void)
-{
-       mutex_lock(&op_arm_mutex);
-       if (op_arm_enabled)
-               op_perf_stop();
-       op_arm_enabled = 0;
-       mutex_unlock(&op_arm_mutex);
-}
-
-#ifdef CONFIG_PM
-static int op_arm_suspend(struct platform_device *dev, pm_message_t state)
-{
-       mutex_lock(&op_arm_mutex);
-       if (op_arm_enabled)
-               op_perf_stop();
-       mutex_unlock(&op_arm_mutex);
-       return 0;
-}
-
-static int op_arm_resume(struct platform_device *dev)
-{
-       mutex_lock(&op_arm_mutex);
-       if (op_arm_enabled && op_perf_start())
-               op_arm_enabled = 0;
-       mutex_unlock(&op_arm_mutex);
-       return 0;
-}
-
-static struct platform_driver oprofile_driver = {
-       .driver         = {
-               .name           = "arm-oprofile",
-       },
-       .resume         = op_arm_resume,
-       .suspend        = op_arm_suspend,
-};
-
-static struct platform_device *oprofile_pdev;
-
-static int __init init_driverfs(void)
-{
-       int ret;
-
-       ret = platform_driver_register(&oprofile_driver);
-       if (ret)
-               goto out;
-
-       oprofile_pdev = platform_device_register_simple(
-                               oprofile_driver.driver.name, 0, NULL, 0);
-       if (IS_ERR(oprofile_pdev)) {
-               ret = PTR_ERR(oprofile_pdev);
-               platform_driver_unregister(&oprofile_driver);
-       }
-
-out:
-       return ret;
-}
-
-static void  exit_driverfs(void)
-{
-       platform_device_unregister(oprofile_pdev);
-       platform_driver_unregister(&oprofile_driver);
-}
-#else
-static int __init init_driverfs(void) { return 0; }
-#define exit_driverfs() do { } while (0)
-#endif /* CONFIG_PM */
-
  static int report_trace(struct stackframe *frame, void *d)
  {
         unsigned int *depth = d;
@@ -350,74 +111,14 @@ static void arm_backtrace(struct pt_regs * const regs, unsigned int depth)
  
  int __init oprofile_arch_init(struct oprofile_operations *ops)
  {
-       int cpu, ret = 0;
-
-       perf_num_counters = armpmu_get_max_events();
-
-       counter_config = kcalloc(perf_num_counters,
-                       sizeof(struct op_counter_config), GFP_KERNEL);
-
-       if (!counter_config) {
-               pr_info("oprofile: failed to allocate %d "
-                               "counters\n", perf_num_counters);
-               return -ENOMEM;
-       }
-
-       ret = init_driverfs();
-       if (ret) {
-               kfree(counter_config);
-               counter_config = NULL;
-               return ret;
-       }
-
-       for_each_possible_cpu(cpu) {
-               perf_events[cpu] = kcalloc(perf_num_counters,
-                               sizeof(struct perf_event *), GFP_KERNEL);
-               if (!perf_events[cpu]) {
-                       pr_info("oprofile: failed to allocate %d perf events "
-                                       "for cpu %d\n", perf_num_counters, cpu);
-                       while (--cpu >= 0)
-                               kfree(perf_events[cpu]);
-                       return -ENOMEM;
-               }
-       }
-
         ops->backtrace          = arm_backtrace;
-       ops->create_files       = op_arm_create_files;
-       ops->setup              = op_arm_setup;
-       ops->start              = op_arm_start;
-       ops->stop               = op_arm_stop;
-       ops->shutdown           = op_arm_stop;
-       ops->cpu_type           = op_name_from_perf_id(armpmu_get_pmu_id());
-
-       if (!ops->cpu_type)
-               ret = -ENODEV;
-       else
-               pr_info("oprofile: using %s\n", ops->cpu_type);
  
-       return ret;
+       return oprofile_perf_init(ops);
  }
  
-void oprofile_arch_exit(void)
+void __exit oprofile_arch_exit(void)
  {
-       int cpu, id;
-       struct perf_event *event;
-
-       if (*perf_events) {
-               for_each_possible_cpu(cpu) {
-                       for (id = 0; id < perf_num_counters; ++id) {
-                               event = perf_events[cpu][id];
-                               if (event != NULL)
-                                       perf_event_release_kernel(event);
-                       }
-                       kfree(perf_events[cpu]);
-               }
-       }
-
-       if (counter_config) {
-               kfree(counter_config);
-               exit_driverfs();
-       }
+       oprofile_perf_exit();
  }
  #else
  int __init oprofile_arch_init(struct oprofile_operations *ops)
@@ -425,5 +126,5 @@ int __init oprofile_arch_init(struct oprofile_operations *ops)
         pr_info("oprofile: hardware counters not available\n");
         return -ENODEV;
  }
-void oprofile_arch_exit(void) {}
+void __exit oprofile_arch_exit(void) {}
  #endif /* CONFIG_HW_PERF_EVENTS */
diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig

index 16399bd249930c1a9e2e06f7d0da247b288b6d34..0f2417df63230fb90114d96e7cb5f583d48fab34 100644 (file)
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -7,6 +7,7 @@ config FRV
         default y
         select HAVE_IDE
         select HAVE_ARCH_TRACEHOOK
+       select HAVE_IRQ_WORK
         select HAVE_PERF_EVENTS
  
  config ZONE_DMA
diff --git a/arch/frv/lib/Makefile b/arch/frv/lib/Makefile

index f4709756d0d9da2046c89fb12d5af54aa032492f..4ff2fb1e6b1694848eb688700e8be330a41a9c51 100644 (file)
--- a/arch/frv/lib/Makefile
+++ b/arch/frv/lib/Makefile
@@ -5,4 +5,4 @@
  lib-y := \
         __ashldi3.o __lshrdi3.o __muldi3.o __ashrdi3.o __negdi2.o __ucmpdi2.o \
         checksum.o memcpy.o memset.o atomic-ops.o atomic64-ops.o \
-       outsl_ns.o outsl_sw.o insl_ns.o insl_sw.o cache.o perf_event.o
+       outsl_ns.o outsl_sw.o insl_ns.o insl_sw.o cache.o
diff --git a/arch/frv/lib/perf_event.c b/arch/frv/lib/perf_event.c

deleted file mode 100644 (file)

index 9ac5acf..0000000
--- a/arch/frv/lib/perf_event.c
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Performance event handling
- *
- * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#include <linux/perf_event.h>
-
-/*
- * mark the performance event as pending
- */
-void set_perf_event_pending(void)
-{
-}
diff --git a/arch/ia64/include/asm/hardirq.h b/arch/ia64/include/asm/hardirq.h

index d514cd9edb49f45c9ac5871d407ec7ff65b0927e..8fb7d33a661f9e6de23ec01be699e4300ad4f363 100644 (file)
--- a/arch/ia64/include/asm/hardirq.h
+++ b/arch/ia64/include/asm/hardirq.h
@@ -6,12 +6,6 @@
   *     David Mosberger-Tang <davidm@hpl.hp.com>
   */
  
-
-#include <linux/threads.h>
-#include <linux/irq.h>
-
-#include <asm/processor.h>
-
  /*
   * No irq_cpustat_t for IA-64.  The data is held in the per-CPU data structure.
   */
@@ -20,6 +14,11 @@
  
  #define local_softirq_pending()                (local_cpu_data->softirq_pending)
  
+#include <linux/threads.h>
+#include <linux/irq.h>
+
+#include <asm/processor.h>
+
  extern void __iomem *ipi_base_addr;
  
  void ack_bad_irq(unsigned int irq);
diff --git a/arch/m32r/include/asm/elf.h b/arch/m32r/include/asm/elf.h

index 2f85412ef7302a0aedaf1b5e26f63719c69dc033..b8da7d0574d20635f489315ea8caf957d063be08 100644 (file)
--- a/arch/m32r/include/asm/elf.h
+++ b/arch/m32r/include/asm/elf.h
@@ -82,9 +82,9 @@ typedef elf_fpreg_t elf_fpregset_t;
   * These are used to set parameters in the core dumps.
   */
  #define ELF_CLASS      ELFCLASS32
-#if defined(__LITTLE_ENDIAN)
+#if defined(__LITTLE_ENDIAN__)
  #define ELF_DATA       ELFDATA2LSB
-#elif defined(__BIG_ENDIAN)
+#elif defined(__BIG_ENDIAN__)
  #define ELF_DATA       ELFDATA2MSB
  #else
  #error no endian defined
diff --git a/arch/m32r/kernel/.gitignore b/arch/m32r/kernel/.gitignore

new file mode 100644 (file)

index 0000000..c5f676c
--- /dev/null
+++ b/arch/m32r/kernel/.gitignore
@@ -0,0 +1 @@
+vmlinux.lds
diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c

index 7bbe38645ed5559395f85c5b5fce93eb3f4992e3..a08697f0886d7988b012fa727c777d8bbed7b3fb 100644 (file)
--- a/arch/m32r/kernel/signal.c
+++ b/arch/m32r/kernel/signal.c
@@ -28,6 +28,8 @@
  
  #define DEBUG_SIG 0
  
+#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
+
  asmlinkage int
  sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
                 unsigned long r2, unsigned long r3, unsigned long r4,
@@ -254,7 +256,7 @@ give_sigsegv:
  static int prev_insn(struct pt_regs *regs)
  {
         u16 inst;
-       if (get_user(&inst, (u16 __user *)(regs->bpc - 2)))
+       if (get_user(inst, (u16 __user *)(regs->bpc - 2)))
                 return -EFAULT;
         if ((inst & 0xfff0) == 0x10f0)  /* trap ? */
                 regs->bpc -= 2;
diff --git a/arch/mips/Kbuild b/arch/mips/Kbuild

index e322d65f33a41e0085e5d352410ca0b5c3f37e26..7dd65cfae83759562e43ab20bb03be071ae56ce5 100644 (file)
--- a/arch/mips/Kbuild
+++ b/arch/mips/Kbuild
@@ -7,6 +7,10 @@ subdir-ccflags-y := -Werror
  include arch/mips/Kbuild.platforms
  obj-y := $(platform-y)
  
+# make clean traverses $(obj-) without having included .config, so
+# everything ends up here
+obj- := $(platform-)
+
  # mips object files
  # The object files are linked as core-y files would be linked
  
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig

index 5526faabfc21433cf3e3fa679c9fcf802646c96c..4c9f402295dd3d9b11548ab47d26853761df62f1 100644 (file)
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -881,11 +881,15 @@ config NO_IOPORT
  config GENERIC_ISA_DMA
         bool
         select ZONE_DMA if GENERIC_ISA_DMA_SUPPORT_BROKEN=n
+       select ISA_DMA_API
  
  config GENERIC_ISA_DMA_SUPPORT_BROKEN
         bool
         select GENERIC_ISA_DMA
  
+config ISA_DMA_API
+       bool
+
  config GENERIC_GPIO
         bool
  
diff --git a/arch/mips/boot/compressed/Makefile b/arch/mips/boot/compressed/Makefile

index 5fd7f7a58b7eb9b02446e8ea2aa3e2e9b1e48e8f..5042d51b0512a087b02b85853aad2d2b38d9d7d6 100644 (file)
--- a/arch/mips/boot/compressed/Makefile
+++ b/arch/mips/boot/compressed/Makefile
@@ -105,4 +105,4 @@ OBJCOPYFLAGS_vmlinuz.srec := $(OBJCOPYFLAGS) -S -O srec
  vmlinuz.srec: vmlinuz
         $(call cmd,objcopy)
  
-clean-files := $(objtree)/vmlinuz.*
+clean-files := $(objtree)/vmlinuz $(objtree)/vmlinuz.{32,ecoff,bin,srec}
diff --git a/arch/mips/dec/Platform b/arch/mips/dec/Platform

index 3adbcbd95db1efd4dbb18fba91bc978153079d7f..cf55a6f4e720c4e831733ceab40d820fa136a2c8 100644 (file)
--- a/arch/mips/dec/Platform
+++ b/arch/mips/dec/Platform
@@ -1,7 +1,7 @@
  #
  # DECstation family
  #
-platform-$(CONFIG_MACH_DECSTATION)     = dec/
+platform-$(CONFIG_MACH_DECSTATION)     += dec/
  cflags-$(CONFIG_MACH_DECSTATION)       += \
                         -I$(srctree)/arch/mips/include/asm/mach-dec
  libs-$(CONFIG_MACH_DECSTATION)         += arch/mips/dec/prom/
diff --git a/arch/mips/include/asm/fcntl.h b/arch/mips/include/asm/fcntl.h

index e482fe90fe8850609ed05f79d4caa9423e31031a..75eddedcfc3ee31a5ba500089fb92215311f51eb 100644 (file)
--- a/arch/mips/include/asm/fcntl.h
+++ b/arch/mips/include/asm/fcntl.h
@@ -56,6 +56,7 @@
   */
  
  #ifdef CONFIG_32BIT
+#include <linux/types.h>
  
  struct flock {
         short   l_type;
diff --git a/arch/mips/jz4740/Platform b/arch/mips/jz4740/Platform

index 6a97230e3d05ee4a53478c2a4625c51d92b26bd2..ba91be9c21ef405f65e0ae87fafe92a66ac8b06f 100644 (file)
--- a/arch/mips/jz4740/Platform
+++ b/arch/mips/jz4740/Platform
@@ -1,3 +1,3 @@
-core-$(CONFIG_MACH_JZ4740)     += arch/mips/jz4740/
+platform-$(CONFIG_MACH_JZ4740) += jz4740/
  cflags-$(CONFIG_MACH_JZ4740)   += -I$(srctree)/arch/mips/include/asm/mach-jz4740
  load-$(CONFIG_MACH_JZ4740)     += 0xffffffff80010000
diff --git a/arch/mips/kernel/branch.c b/arch/mips/kernel/branch.c

index 0176ed015c895644bc72fc30bc661c2ad555b515..32103cc2a2576877592d91050b86c8944a2479d9 100644 (file)
--- a/arch/mips/kernel/branch.c
+++ b/arch/mips/kernel/branch.c
@@ -40,7 +40,6 @@ int __compute_return_epc(struct pt_regs *regs)
                 return -EFAULT;
         }
  
-       regs->regs[0] = 0;
         switch (insn.i_format.opcode) {
         /*
          * jr and jalr are in r_format format.
diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c

index 2340f11dc29cc8de689593c8b49315b613a6aae8..9a526ba6f25766f3ab6b58bc70b7f792c4011ff9 100644 (file)
--- a/arch/mips/kernel/mips-mt-fpaff.c
+++ b/arch/mips/kernel/mips-mt-fpaff.c
@@ -103,7 +103,7 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len,
         if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
                 goto out_unlock;
  
-       retval = security_task_setscheduler(p, 0, NULL);
+       retval = security_task_setscheduler(p)
         if (retval)
                 goto out_unlock;
  
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c

index c51b95ff86443e2fcb1eed618afe0861fd781eef..c8777333e19833667fe882110fe40d954fee5eeb 100644 (file)
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -536,7 +536,7 @@ asmlinkage void do_syscall_trace(struct pt_regs *regs, int entryexit)
  {
         /* do the secure computing check first */
         if (!entryexit)
-               secure_computing(regs->regs[0]);
+               secure_computing(regs->regs[2]);
  
         if (unlikely(current->audit_context) && entryexit)
                 audit_syscall_exit(AUDITSC_RESULT(regs->regs[2]),
@@ -565,7 +565,7 @@ asmlinkage void do_syscall_trace(struct pt_regs *regs, int entryexit)
  
  out:
         if (unlikely(current->audit_context) && !entryexit)
-               audit_syscall_entry(audit_arch(), regs->regs[0],
+               audit_syscall_entry(audit_arch(), regs->regs[2],
                                     regs->regs[4], regs->regs[5],
                                     regs->regs[6], regs->regs[7]);
  }
diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S

index 584415eef8c92ae9168790b4ef824470c2034f73..fbaabad0e6e28466aa1098d4f50c516d520b0c7d 100644 (file)
--- a/arch/mips/kernel/scall32-o32.S
+++ b/arch/mips/kernel/scall32-o32.S
@@ -63,9 +63,9 @@ stack_done:
         sw      t0, PT_R7(sp)           # set error flag
         beqz    t0, 1f
  
+       lw      t1, PT_R2(sp)           # syscall number
         negu    v0                      # error
-       sw      v0, PT_R0(sp)           # set flag for syscall
-                                       # restarting
+       sw      t1, PT_R0(sp)           # save it for syscall restarting
  1:     sw      v0, PT_R2(sp)           # result
  
  o32_syscall_exit:
@@ -104,9 +104,9 @@ syscall_trace_entry:
         sw      t0, PT_R7(sp)           # set error flag
         beqz    t0, 1f
  
+       lw      t1, PT_R2(sp)           # syscall number
         negu    v0                      # error
-       sw      v0, PT_R0(sp)           # set flag for syscall
-                                       # restarting
+       sw      t1, PT_R0(sp)           # save it for syscall restarting
  1:     sw      v0, PT_R2(sp)           # result
  
         j       syscall_exit
@@ -169,8 +169,7 @@ stackargs:
          * We probably should handle this case a bit more drastic.
          */
  bad_stack:
-       negu    v0                              # error
-       sw      v0, PT_R0(sp)
+       li      v0, EFAULT
         sw      v0, PT_R2(sp)
         li      t0, 1                           # set error flag
         sw      t0, PT_R7(sp)
diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S

index 5573f8e4e326910b3873007c2b216fb31fe566d4..3f4179283207b1cc21e7fc14d9fea4da0c38bb28 100644 (file)
--- a/arch/mips/kernel/scall64-64.S
+++ b/arch/mips/kernel/scall64-64.S
@@ -66,9 +66,9 @@ NESTED(handle_sys64, PT_SIZE, sp)
         sd      t0, PT_R7(sp)           # set error flag
         beqz    t0, 1f
  
+       ld      t1, PT_R2(sp)           # syscall number
         dnegu   v0                      # error
-       sd      v0, PT_R0(sp)           # set flag for syscall
-                                       # restarting
+       sd      t1, PT_R0(sp)           # save it for syscall restarting
  1:     sd      v0, PT_R2(sp)           # result
  
  n64_syscall_exit:
@@ -109,8 +109,9 @@ syscall_trace_entry:
         sd      t0, PT_R7(sp)           # set error flag
         beqz    t0, 1f
  
+       ld      t1, PT_R2(sp)           # syscall number
         dnegu   v0                      # error
-       sd      v0, PT_R0(sp)           # set flag for syscall restarting
+       sd      t1, PT_R0(sp)           # save it for syscall restarting
  1:     sd      v0, PT_R2(sp)           # result
  
         j       syscall_exit
diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S

index 1e38ec97672ed82d30b238ad7bbd6adbb2c10b9f..f08ece6d8acc7f3aa78ecbca23f801ae76727f6a 100644 (file)
--- a/arch/mips/kernel/scall64-n32.S
+++ b/arch/mips/kernel/scall64-n32.S
@@ -65,8 +65,9 @@ NESTED(handle_sysn32, PT_SIZE, sp)
         sd      t0, PT_R7(sp)           # set error flag
         beqz    t0, 1f
  
+       ld      t1, PT_R2(sp)           # syscall number
         dnegu   v0                      # error
-       sd      v0, PT_R0(sp)           # set flag for syscall restarting
+       sd      t1, PT_R0(sp)           # save it for syscall restarting
  1:     sd      v0, PT_R2(sp)           # result
  
         local_irq_disable               # make sure need_resched and
@@ -106,8 +107,9 @@ n32_syscall_trace_entry:
         sd      t0, PT_R7(sp)           # set error flag
         beqz    t0, 1f
  
+       ld      t1, PT_R2(sp)           # syscall number
         dnegu   v0                      # error
-       sd      v0, PT_R0(sp)           # set flag for syscall restarting
+       sd      t1, PT_R0(sp)           # save it for syscall restarting
  1:     sd      v0, PT_R2(sp)           # result
  
         j       syscall_exit
@@ -320,10 +322,10 @@ EXPORT(sysn32_call_table)
         PTR     sys_cacheflush
         PTR     sys_cachectl
         PTR     sys_sysmips
-       PTR     sys_io_setup                    /* 6200 */
+       PTR     compat_sys_io_setup                     /* 6200 */
         PTR     sys_io_destroy
-       PTR     sys_io_getevents
-       PTR     sys_io_submit
+       PTR     compat_sys_io_getevents
+       PTR     compat_sys_io_submit
         PTR     sys_io_cancel
         PTR     sys_exit_group                  /* 6205 */
         PTR     sys_lookup_dcookie
diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S

index 171979fc98e59a5169049d44001ad75383268684..78d768a3e19da78fc986e9170f73b240ee98c1c2 100644 (file)
--- a/arch/mips/kernel/scall64-o32.S
+++ b/arch/mips/kernel/scall64-o32.S
@@ -93,8 +93,9 @@ NESTED(handle_sys, PT_SIZE, sp)
         sd      t0, PT_R7(sp)           # set error flag
         beqz    t0, 1f
  
+       ld      t1, PT_R2(sp)           # syscall number
         dnegu   v0                      # error
-       sd      v0, PT_R0(sp)           # flag for syscall restarting
+       sd      t1, PT_R0(sp)           # save it for syscall restarting
  1:     sd      v0, PT_R2(sp)           # result
  
  o32_syscall_exit:
@@ -142,8 +143,9 @@ trace_a_syscall:
         sd      t0, PT_R7(sp)           # set error flag
         beqz    t0, 1f
  
+       ld      t1, PT_R2(sp)           # syscall number
         dnegu   v0                      # error
-       sd      v0, PT_R0(sp)           # set flag for syscall restarting
+       sd      t1, PT_R0(sp)           # save it for syscall restarting
  1:     sd      v0, PT_R2(sp)           # result
  
         j       syscall_exit
@@ -154,8 +156,7 @@ trace_a_syscall:
          * The stackpointer for a call with more than 4 arguments is bad.
          */
  bad_stack:
-       dnegu   v0                      # error
-       sd      v0, PT_R0(sp)
+       li      v0, EFAULT
         sd      v0, PT_R2(sp)
         li      t0, 1                   # set error flag
         sd      t0, PT_R7(sp)
@@ -444,10 +445,10 @@ sys_call_table:
         PTR     compat_sys_futex
         PTR     compat_sys_sched_setaffinity
         PTR     compat_sys_sched_getaffinity    /* 4240 */
-       PTR     sys_io_setup
+       PTR     compat_sys_io_setup
         PTR     sys_io_destroy
-       PTR     sys_io_getevents
-       PTR     sys_io_submit
+       PTR     compat_sys_io_getevents
+       PTR     compat_sys_io_submit
         PTR     sys_io_cancel                   /* 4245 */
         PTR     sys_exit_group
         PTR     sys32_lookup_dcookie
diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c

index 2099d5a4c4b78224f85ee5f9b175907be3d8f15c..5922342bca3991d4b7ab4a9d6f8483fb3e416779 100644 (file)
--- a/arch/mips/kernel/signal.c
+++ b/arch/mips/kernel/signal.c
@@ -390,7 +390,6 @@ asmlinkage void sys_rt_sigreturn(nabi_no_regargs struct pt_regs regs)
  {
         struct rt_sigframe __user *frame;
         sigset_t set;
-       stack_t st;
         int sig;
  
         frame = (struct rt_sigframe __user *) regs.regs[29];
@@ -411,11 +410,9 @@ asmlinkage void sys_rt_sigreturn(nabi_no_regargs struct pt_regs regs)
         else if (sig)
                 force_sig(sig, current);
  
-       if (__copy_from_user(&st, &frame->rs_uc.uc_stack, sizeof(st)))
-               goto badframe;
         /* It is more difficult to avoid calling this function than to
            call it and ignore errors.  */
-       do_sigaltstack((stack_t __user *)&st, NULL, regs.regs[29]);
+       do_sigaltstack(&frame->rs_uc.uc_stack, NULL, regs.regs[29]);
  
         /*
          * Don't let your children do this ...
@@ -550,23 +547,26 @@ static int handle_signal(unsigned long sig, siginfo_t *info,
         struct mips_abi *abi = current->thread.abi;
         void *vdso = current->mm->context.vdso;
  
-       switch(regs->regs[0]) {
-       case ERESTART_RESTARTBLOCK:
-       case ERESTARTNOHAND:
-               regs->regs[2] = EINTR;
-               break;
-       case ERESTARTSYS:
-               if (!(ka->sa.sa_flags & SA_RESTART)) {
+       if (regs->regs[0]) {
+               switch(regs->regs[2]) {
+               case ERESTART_RESTARTBLOCK:
+               case ERESTARTNOHAND:
                         regs->regs[2] = EINTR;
                         break;
+               case ERESTARTSYS:
+                       if (!(ka->sa.sa_flags & SA_RESTART)) {
+                               regs->regs[2] = EINTR;
+                               break;
+                       }
+               /* fallthrough */
+               case ERESTARTNOINTR:
+                       regs->regs[7] = regs->regs[26];
+                       regs->regs[2] = regs->regs[0];
+                       regs->cp0_epc -= 4;
                 }
-       /* fallthrough */
-       case ERESTARTNOINTR:            /* Userland will reload $v0.  */
-               regs->regs[7] = regs->regs[26];
-               regs->cp0_epc -= 8;
-       }
  
-       regs->regs[0] = 0;              /* Don't deal with this again.  */
+               regs->regs[0] = 0;              /* Don't deal with this again.  */
+       }
  
         if (sig_uses_siginfo(ka))
                 ret = abi->setup_rt_frame(vdso + abi->rt_signal_return_offset,
@@ -575,6 +575,9 @@ static int handle_signal(unsigned long sig, siginfo_t *info,
                 ret = abi->setup_frame(vdso + abi->signal_return_offset,
                                        ka, regs, sig, oldset);
  
+       if (ret)
+               return ret;
+
         spin_lock_irq(&current->sighand->siglock);
         sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
         if (!(ka->sa.sa_flags & SA_NODEFER))
@@ -622,17 +625,13 @@ static void do_signal(struct pt_regs *regs)
                 return;
         }
  
-       /*
-        * Who's code doesn't conform to the restartable syscall convention
-        * dies here!!!  The li instruction, a single machine instruction,
-        * must directly be followed by the syscall instruction.
-        */
         if (regs->regs[0]) {
                 if (regs->regs[2] == ERESTARTNOHAND ||
                     regs->regs[2] == ERESTARTSYS ||
                     regs->regs[2] == ERESTARTNOINTR) {
+                       regs->regs[2] = regs->regs[0];
                         regs->regs[7] = regs->regs[26];
-                       regs->cp0_epc -= 8;
+                       regs->cp0_epc -= 4;
                 }
                 if (regs->regs[2] == ERESTART_RESTARTBLOCK) {
                         regs->regs[2] = current->thread.abi->restart;
diff --git a/arch/mips/kernel/signal_n32.c b/arch/mips/kernel/signal_n32.c

index 2c5df818c65ae0395768264f1192059b792aaf02..ee24d814d5b91bb474ff3ff114e49f86618cdeae 100644 (file)
--- a/arch/mips/kernel/signal_n32.c
+++ b/arch/mips/kernel/signal_n32.c
@@ -109,6 +109,7 @@ asmlinkage int sysn32_rt_sigsuspend(nabi_no_regargs struct pt_regs regs)
  asmlinkage void sysn32_rt_sigreturn(nabi_no_regargs struct pt_regs regs)
  {
         struct rt_sigframe_n32 __user *frame;
+       mm_segment_t old_fs;
         sigset_t set;
         stack_t st;
         s32 sp;
@@ -143,7 +144,11 @@ asmlinkage void sysn32_rt_sigreturn(nabi_no_regargs struct pt_regs regs)
  
         /* It is more difficult to avoid calling this function than to
            call it and ignore errors.  */
+       old_fs = get_fs();
+       set_fs(KERNEL_DS);
         do_sigaltstack((stack_t __user *)&st, NULL, regs.regs[29]);
+       set_fs(old_fs);
+
  
         /*
          * Don't let your children do this ...
diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c

index 69b039ca8d8337e60ecead9e32fbe7bd64659a64..33d5a5ce4a29d56037a38abb346a99e0212c16c2 100644 (file)
--- a/arch/mips/kernel/unaligned.c
+++ b/arch/mips/kernel/unaligned.c
@@ -109,8 +109,6 @@ static void emulate_load_store_insn(struct pt_regs *regs,
         unsigned long value;
         unsigned int res;
  
-       regs->regs[0] = 0;
-
         /*
          * This load never faults.
          */
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig

index 907417d187e15039bb4bb6d96075cd950a045dbd..79a04a9394d5ad33b768d79ce1c2230b24e1c9bb 100644 (file)
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -16,6 +16,7 @@ config PARISC
         select RTC_DRV_GENERIC
         select INIT_ALL_POSSIBLE
         select BUG
+       select HAVE_IRQ_WORK
         select HAVE_PERF_EVENTS
         select GENERIC_ATOMIC64 if !64BIT
         help
diff --git a/arch/parisc/include/asm/perf_event.h b/arch/parisc/include/asm/perf_event.h

index cc146427d8f9acf2219aa14813d76e52835bb83f..1e0fd8ba6c033e5f277afc02075fc7de57b1adc6 100644 (file)
--- a/arch/parisc/include/asm/perf_event.h
+++ b/arch/parisc/include/asm/perf_event.h
@@ -1,7 +1,6 @@
  #ifndef __ASM_PARISC_PERF_EVENT_H
  #define __ASM_PARISC_PERF_EVENT_H
  
-/* parisc only supports software events through this interface. */
-static inline void set_perf_event_pending(void) { }
+/* Empty, just to avoid compiling error */
  
  #endif /* __ASM_PARISC_PERF_EVENT_H */
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig

index 631e5a0fb6abcf2e124f0dfc6d1ed95d9b9f4566..4b1e521d966f0facaf7d1b70be55ebaf83df3954 100644 (file)
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -138,6 +138,7 @@ config PPC
         select HAVE_OPROFILE
         select HAVE_SYSCALL_WRAPPERS if PPC64
         select GENERIC_ATOMIC64 if PPC32
+       select HAVE_IRQ_WORK
         select HAVE_PERF_EVENTS
         select HAVE_REGS_AND_STACK_ACCESS_API
         select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h

index 1ff6662f7faf28ffce6577ac40f0a237f2b866e8..9b287fdd8ea335352cb7034a600225b6161d6a30 100644 (file)
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -129,7 +129,7 @@ struct paca_struct {
         u8 soft_enabled;                /* irq soft-enable flag */
         u8 hard_enabled;                /* set if irqs are enabled in MSR */
         u8 io_sync;                     /* writel() needs spin_unlock sync */
-       u8 perf_event_pending;          /* PM interrupt while soft-disabled */
+       u8 irq_work_pending;            /* IRQ_WORK interrupt while soft-disable */
  
         /* Stuff for accurate time accounting */
         u64 user_time;                  /* accumulated usermode TB ticks */
diff --git a/arch/powerpc/kernel/perf_callchain.c b/arch/powerpc/kernel/perf_callchain.c

index 95ad9dad298e9d4773117b0406bc4a3378d77e5e..d05ae4204bbf3d3ddcc84266476b736a6790715c 100644 (file)
--- a/arch/powerpc/kernel/perf_callchain.c
+++ b/arch/powerpc/kernel/perf_callchain.c
@@ -23,18 +23,6 @@
  #include "ppc32.h"
  #endif
  
-/*
- * Store another value in a callchain_entry.
- */
-static inline void callchain_store(struct perf_callchain_entry *entry, u64 ip)
-{
-       unsigned int nr = entry->nr;
-
-       if (nr < PERF_MAX_STACK_DEPTH) {
-               entry->ip[nr] = ip;
-               entry->nr = nr + 1;
-       }
-}
  
  /*
   * Is sp valid as the address of the next kernel stack frame after prev_sp?
@@ -58,8 +46,8 @@ static int valid_next_sp(unsigned long sp, unsigned long prev_sp)
         return 0;
  }
  
-static void perf_callchain_kernel(struct pt_regs *regs,
-                                 struct perf_callchain_entry *entry)
+void
+perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
  {
         unsigned long sp, next_sp;
         unsigned long next_ip;
@@ -69,8 +57,7 @@ static void perf_callchain_kernel(struct pt_regs *regs,
  
         lr = regs->link;
         sp = regs->gpr[1];
-       callchain_store(entry, PERF_CONTEXT_KERNEL);
-       callchain_store(entry, regs->nip);
+       perf_callchain_store(entry, regs->nip);
  
         if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD))
                 return;
@@ -89,7 +76,7 @@ static void perf_callchain_kernel(struct pt_regs *regs,
                         next_ip = regs->nip;
                         lr = regs->link;
                         level = 0;
-                       callchain_store(entry, PERF_CONTEXT_KERNEL);
+                       perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
  
                 } else {
                         if (level == 0)
@@ -111,7 +98,7 @@ static void perf_callchain_kernel(struct pt_regs *regs,
                         ++level;
                 }
  
-               callchain_store(entry, next_ip);
+               perf_callchain_store(entry, next_ip);
                 if (!valid_next_sp(next_sp, sp))
                         return;
                 sp = next_sp;
@@ -233,8 +220,8 @@ static int sane_signal_64_frame(unsigned long sp)
                 puc == (unsigned long) &sf->uc;
  }
  
-static void perf_callchain_user_64(struct pt_regs *regs,
-                                  struct perf_callchain_entry *entry)
+static void perf_callchain_user_64(struct perf_callchain_entry *entry,
+                                  struct pt_regs *regs)
  {
         unsigned long sp, next_sp;
         unsigned long next_ip;
@@ -246,8 +233,7 @@ static void perf_callchain_user_64(struct pt_regs *regs,
         next_ip = regs->nip;
         lr = regs->link;
         sp = regs->gpr[1];
-       callchain_store(entry, PERF_CONTEXT_USER);
-       callchain_store(entry, next_ip);
+       perf_callchain_store(entry, next_ip);
  
         for (;;) {
                 fp = (unsigned long __user *) sp;
@@ -276,14 +262,14 @@ static void perf_callchain_user_64(struct pt_regs *regs,
                             read_user_stack_64(&uregs[PT_R1], &sp))
                                 return;
                         level = 0;
-                       callchain_store(entry, PERF_CONTEXT_USER);
-                       callchain_store(entry, next_ip);
+                       perf_callchain_store(entry, PERF_CONTEXT_USER);
+                       perf_callchain_store(entry, next_ip);
                         continue;
                 }
  
                 if (level == 0)
                         next_ip = lr;
-               callchain_store(entry, next_ip);
+               perf_callchain_store(entry, next_ip);
                 ++level;
                 sp = next_sp;
         }
@@ -315,8 +301,8 @@ static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
         return __get_user_inatomic(*ret, ptr);
  }
  
-static inline void perf_callchain_user_64(struct pt_regs *regs,
-                                         struct perf_callchain_entry *entry)
+static inline void perf_callchain_user_64(struct perf_callchain_entry *entry,
+                                         struct pt_regs *regs)
  {
  }
  
@@ -435,8 +421,8 @@ static unsigned int __user *signal_frame_32_regs(unsigned int sp,
         return mctx->mc_gregs;
  }
  
-static void perf_callchain_user_32(struct pt_regs *regs,
-                                  struct perf_callchain_entry *entry)
+static void perf_callchain_user_32(struct perf_callchain_entry *entry,
+                                  struct pt_regs *regs)
  {
         unsigned int sp, next_sp;
         unsigned int next_ip;
@@ -447,8 +433,7 @@ static void perf_callchain_user_32(struct pt_regs *regs,
         next_ip = regs->nip;
         lr = regs->link;
         sp = regs->gpr[1];
-       callchain_store(entry, PERF_CONTEXT_USER);
-       callchain_store(entry, next_ip);
+       perf_callchain_store(entry, next_ip);
  
         while (entry->nr < PERF_MAX_STACK_DEPTH) {
                 fp = (unsigned int __user *) (unsigned long) sp;
@@ -470,45 +455,24 @@ static void perf_callchain_user_32(struct pt_regs *regs,
                             read_user_stack_32(&uregs[PT_R1], &sp))
                                 return;
                         level = 0;
-                       callchain_store(entry, PERF_CONTEXT_USER);
-                       callchain_store(entry, next_ip);
+                       perf_callchain_store(entry, PERF_CONTEXT_USER);
+                       perf_callchain_store(entry, next_ip);
                         continue;
                 }
  
                 if (level == 0)
                         next_ip = lr;
-               callchain_store(entry, next_ip);
+               perf_callchain_store(entry, next_ip);
                 ++level;
                 sp = next_sp;
         }
  }
  
-/*
- * Since we can't get PMU interrupts inside a PMU interrupt handler,
- * we don't need separate irq and nmi entries here.
- */
-static DEFINE_PER_CPU(struct perf_callchain_entry, cpu_perf_callchain);
-
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+void
+perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
  {
-       struct perf_callchain_entry *entry = &__get_cpu_var(cpu_perf_callchain);
-
-       entry->nr = 0;
-
-       if (!user_mode(regs)) {
-               perf_callchain_kernel(regs, entry);
-               if (current->mm)
-                       regs = task_pt_regs(current);
-               else
-                       regs = NULL;
-       }
-
-       if (regs) {
-               if (current_is_64bit())
-                       perf_callchain_user_64(regs, entry);
-               else
-                       perf_callchain_user_32(regs, entry);
-       }
-
-       return entry;
+       if (current_is_64bit())
+               perf_callchain_user_64(entry, regs);
+       else
+               perf_callchain_user_32(entry, regs);
  }
diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c

index d301a30445e09a49cec4a3d4dcf2ea01529934b3..3129c855933c2a3857b0c4b3321b259b851279b8 100644 (file)
--- a/arch/powerpc/kernel/perf_event.c
+++ b/arch/powerpc/kernel/perf_event.c
@@ -402,6 +402,9 @@ static void power_pmu_read(struct perf_event *event)
  {
         s64 val, delta, prev;
  
+       if (event->hw.state & PERF_HES_STOPPED)
+               return;
+
         if (!event->hw.idx)
                 return;
         /*
@@ -517,7 +520,7 @@ static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0)
   * Disable all events to prevent PMU interrupts and to allow
   * events to be added or removed.
   */
-void hw_perf_disable(void)
+static void power_pmu_disable(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuhw;
         unsigned long flags;
@@ -565,7 +568,7 @@ void hw_perf_disable(void)
   * If we were previously disabled and events were added, then
   * put the new config on the PMU.
   */
-void hw_perf_enable(void)
+static void power_pmu_enable(struct pmu *pmu)
  {
         struct perf_event *event;
         struct cpu_hw_events *cpuhw;
@@ -672,6 +675,8 @@ void hw_perf_enable(void)
                 }
                 local64_set(&event->hw.prev_count, val);
                 event->hw.idx = idx;
+               if (event->hw.state & PERF_HES_STOPPED)
+                       val = 0;
                 write_pmc(idx, val);
                 perf_event_update_userpage(event);
         }
@@ -727,7 +732,7 @@ static int collect_events(struct perf_event *group, int max_count,
   * re-enable the PMU in order to get hw_perf_enable to do the
   * actual work of reconfiguring the PMU.
   */
-static int power_pmu_enable(struct perf_event *event)
+static int power_pmu_add(struct perf_event *event, int ef_flags)
  {
         struct cpu_hw_events *cpuhw;
         unsigned long flags;
@@ -735,7 +740,7 @@ static int power_pmu_enable(struct perf_event *event)
         int ret = -EAGAIN;
  
         local_irq_save(flags);
-       perf_disable();
+       perf_pmu_disable(event->pmu);
  
         /*
          * Add the event to the list (if there is room)
@@ -749,6 +754,9 @@ static int power_pmu_enable(struct perf_event *event)
         cpuhw->events[n0] = event->hw.config;
         cpuhw->flags[n0] = event->hw.event_base;
  
+       if (!(ef_flags & PERF_EF_START))
+               event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+
         /*
          * If group events scheduling transaction was started,
          * skip the schedulability test here, it will be peformed
@@ -769,7 +777,7 @@ nocheck:
  
         ret = 0;
   out:
-       perf_enable();
+       perf_pmu_enable(event->pmu);
         local_irq_restore(flags);
         return ret;
  }
@@ -777,14 +785,14 @@ nocheck:
  /*
   * Remove a event from the PMU.
   */
-static void power_pmu_disable(struct perf_event *event)
+static void power_pmu_del(struct perf_event *event, int ef_flags)
  {
         struct cpu_hw_events *cpuhw;
         long i;
         unsigned long flags;
  
         local_irq_save(flags);
-       perf_disable();
+       perf_pmu_disable(event->pmu);
  
         power_pmu_read(event);
  
@@ -821,34 +829,60 @@ static void power_pmu_disable(struct perf_event *event)
                 cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
         }
  
-       perf_enable();
+       perf_pmu_enable(event->pmu);
         local_irq_restore(flags);
  }
  
  /*
- * Re-enable interrupts on a event after they were throttled
- * because they were coming too fast.
+ * POWER-PMU does not support disabling individual counters, hence
+ * program their cycle counter to their max value and ignore the interrupts.
   */
-static void power_pmu_unthrottle(struct perf_event *event)
+
+static void power_pmu_start(struct perf_event *event, int ef_flags)
+{
+       unsigned long flags;
+       s64 left;
+
+       if (!event->hw.idx || !event->hw.sample_period)
+               return;
+
+       if (!(event->hw.state & PERF_HES_STOPPED))
+               return;
+
+       if (ef_flags & PERF_EF_RELOAD)
+               WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+
+       local_irq_save(flags);
+       perf_pmu_disable(event->pmu);
+
+       event->hw.state = 0;
+       left = local64_read(&event->hw.period_left);
+       write_pmc(event->hw.idx, left);
+
+       perf_event_update_userpage(event);
+       perf_pmu_enable(event->pmu);
+       local_irq_restore(flags);
+}
+
+static void power_pmu_stop(struct perf_event *event, int ef_flags)
  {
-       s64 val, left;
         unsigned long flags;
  
         if (!event->hw.idx || !event->hw.sample_period)
                 return;
+
+       if (event->hw.state & PERF_HES_STOPPED)
+               return;
+
         local_irq_save(flags);
-       perf_disable();
+       perf_pmu_disable(event->pmu);
+
         power_pmu_read(event);
-       left = event->hw.sample_period;
-       event->hw.last_period = left;
-       val = 0;
-       if (left < 0x80000000L)
-               val = 0x80000000L - left;
-       write_pmc(event->hw.idx, val);
-       local64_set(&event->hw.prev_count, val);
-       local64_set(&event->hw.period_left, left);
+       event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+       write_pmc(event->hw.idx, 0);
+
         perf_event_update_userpage(event);
-       perf_enable();
+       perf_pmu_enable(event->pmu);
         local_irq_restore(flags);
  }
  
@@ -857,10 +891,11 @@ static void power_pmu_unthrottle(struct perf_event *event)
   * Set the flag to make pmu::enable() not perform the
   * schedulability test, it will be performed at commit time
   */
-void power_pmu_start_txn(const struct pmu *pmu)
+void power_pmu_start_txn(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
  
+       perf_pmu_disable(pmu);
         cpuhw->group_flag |= PERF_EVENT_TXN;
         cpuhw->n_txn_start = cpuhw->n_events;
  }
@@ -870,11 +905,12 @@ void power_pmu_start_txn(const struct pmu *pmu)
   * Clear the flag and pmu::enable() will perform the
   * schedulability test.
   */
-void power_pmu_cancel_txn(const struct pmu *pmu)
+void power_pmu_cancel_txn(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
  
         cpuhw->group_flag &= ~PERF_EVENT_TXN;
+       perf_pmu_enable(pmu);
  }
  
  /*
@@ -882,7 +918,7 @@ void power_pmu_cancel_txn(const struct pmu *pmu)
   * Perform the group schedulability test as a whole
   * Return 0 if success
   */
-int power_pmu_commit_txn(const struct pmu *pmu)
+int power_pmu_commit_txn(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuhw;
         long i, n;
@@ -901,19 +937,10 @@ int power_pmu_commit_txn(const struct pmu *pmu)
                 cpuhw->event[i]->hw.config = cpuhw->events[i];
  
         cpuhw->group_flag &= ~PERF_EVENT_TXN;
+       perf_pmu_enable(pmu);
         return 0;
  }
  
-struct pmu power_pmu = {
-       .enable         = power_pmu_enable,
-       .disable        = power_pmu_disable,
-       .read           = power_pmu_read,
-       .unthrottle     = power_pmu_unthrottle,
-       .start_txn      = power_pmu_start_txn,
-       .cancel_txn     = power_pmu_cancel_txn,
-       .commit_txn     = power_pmu_commit_txn,
-};
-
  /*
   * Return 1 if we might be able to put event on a limited PMC,
   * or 0 if not.
@@ -1014,7 +1041,7 @@ static int hw_perf_cache_event(u64 config, u64 *eventp)
         return 0;
  }
  
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+static int power_pmu_event_init(struct perf_event *event)
  {
         u64 ev;
         unsigned long flags;
@@ -1026,25 +1053,27 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
         struct cpu_hw_events *cpuhw;
  
         if (!ppmu)
-               return ERR_PTR(-ENXIO);
+               return -ENOENT;
+
         switch (event->attr.type) {
         case PERF_TYPE_HARDWARE:
                 ev = event->attr.config;
                 if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
-                       return ERR_PTR(-EOPNOTSUPP);
+                       return -EOPNOTSUPP;
                 ev = ppmu->generic_events[ev];
                 break;
         case PERF_TYPE_HW_CACHE:
                 err = hw_perf_cache_event(event->attr.config, &ev);
                 if (err)
-                       return ERR_PTR(err);
+                       return err;
                 break;
         case PERF_TYPE_RAW:
                 ev = event->attr.config;
                 break;
         default:
-               return ERR_PTR(-EINVAL);
+               return -ENOENT;
         }
+
         event->hw.config_base = ev;
         event->hw.idx = 0;
  
@@ -1063,7 +1092,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
          * XXX we should check if the task is an idle task.
          */
         flags = 0;
-       if (event->ctx->task)
+       if (event->attach_state & PERF_ATTACH_TASK)
                 flags |= PPMU_ONLY_COUNT_RUN;
  
         /*
@@ -1081,7 +1110,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
                          */
                         ev = normal_pmc_alternative(ev, flags);
                         if (!ev)
-                               return ERR_PTR(-EINVAL);
+                               return -EINVAL;
                 }
         }
  
@@ -1095,19 +1124,19 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
                 n = collect_events(event->group_leader, ppmu->n_counter - 1,
                                    ctrs, events, cflags);
                 if (n < 0)
-                       return ERR_PTR(-EINVAL);
+                       return -EINVAL;
         }
         events[n] = ev;
         ctrs[n] = event;
         cflags[n] = flags;
         if (check_excludes(ctrs, cflags, n, 1))
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
  
         cpuhw = &get_cpu_var(cpu_hw_events);
         err = power_check_constraints(cpuhw, events, cflags, n + 1);
         put_cpu_var(cpu_hw_events);
         if (err)
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
  
         event->hw.config = events[n];
         event->hw.event_base = cflags[n];
@@ -1132,11 +1161,23 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
         }
         event->destroy = hw_perf_event_destroy;
  
-       if (err)
-               return ERR_PTR(err);
-       return &power_pmu;
+       return err;
  }
  
+struct pmu power_pmu = {
+       .pmu_enable     = power_pmu_enable,
+       .pmu_disable    = power_pmu_disable,
+       .event_init     = power_pmu_event_init,
+       .add            = power_pmu_add,
+       .del            = power_pmu_del,
+       .start          = power_pmu_start,
+       .stop           = power_pmu_stop,
+       .read           = power_pmu_read,
+       .start_txn      = power_pmu_start_txn,
+       .cancel_txn     = power_pmu_cancel_txn,
+       .commit_txn     = power_pmu_commit_txn,
+};
+
  /*
   * A counter has overflowed; update its count and record
   * things if requested.  Note that interrupts are hard-disabled
@@ -1149,6 +1190,11 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
         s64 prev, delta, left;
         int record = 0;
  
+       if (event->hw.state & PERF_HES_STOPPED) {
+               write_pmc(event->hw.idx, 0);
+               return;
+       }
+
         /* we don't have to worry about interrupts here */
         prev = local64_read(&event->hw.prev_count);
         delta = (val - prev) & 0xfffffffful;
@@ -1171,6 +1217,11 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
                         val = 0x80000000LL - left;
         }
  
+       write_pmc(event->hw.idx, val);
+       local64_set(&event->hw.prev_count, val);
+       local64_set(&event->hw.period_left, left);
+       perf_event_update_userpage(event);
+
         /*
          * Finally record data if requested.
          */
@@ -1183,23 +1234,9 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
                 if (event->attr.sample_type & PERF_SAMPLE_ADDR)
                         perf_get_data_addr(regs, &data.addr);
  
-               if (perf_event_overflow(event, nmi, &data, regs)) {
-                       /*
-                        * Interrupts are coming too fast - throttle them
-                        * by setting the event to 0, so it will be
-                        * at least 2^30 cycles until the next interrupt
-                        * (assuming each event counts at most 2 counts
-                        * per cycle).
-                        */
-                       val = 0;
-                       left = ~0ULL >> 1;
-               }
+               if (perf_event_overflow(event, nmi, &data, regs))
+                       power_pmu_stop(event, 0);
         }
-
-       write_pmc(event->hw.idx, val);
-       local64_set(&event->hw.prev_count, val);
-       local64_set(&event->hw.period_left, left);
-       perf_event_update_userpage(event);
  }
  
  /*
@@ -1342,6 +1379,7 @@ int register_power_pmu(struct power_pmu *pmu)
                 freeze_events_kernel = MMCR0_FCHV;
  #endif /* CONFIG_PPC64 */
  
+       perf_pmu_register(&power_pmu);
         perf_cpu_notifier(power_pmu_notifier);
  
         return 0;
diff --git a/arch/powerpc/kernel/perf_event_fsl_emb.c b/arch/powerpc/kernel/perf_event_fsl_emb.c

index 1ba45471ae436617e1ecbf3654a5064ef15d1af7..7ecca59ddf77fe20bd46b470d9392cdd16fd5ba9 100644 (file)
--- a/arch/powerpc/kernel/perf_event_fsl_emb.c
+++ b/arch/powerpc/kernel/perf_event_fsl_emb.c
@@ -156,6 +156,9 @@ static void fsl_emb_pmu_read(struct perf_event *event)
  {
         s64 val, delta, prev;
  
+       if (event->hw.state & PERF_HES_STOPPED)
+               return;
+
         /*
          * Performance monitor interrupts come even when interrupts
          * are soft-disabled, as long as interrupts are hard-enabled.
@@ -177,7 +180,7 @@ static void fsl_emb_pmu_read(struct perf_event *event)
   * Disable all events to prevent PMU interrupts and to allow
   * events to be added or removed.
   */
-void hw_perf_disable(void)
+static void fsl_emb_pmu_disable(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuhw;
         unsigned long flags;
@@ -216,7 +219,7 @@ void hw_perf_disable(void)
   * If we were previously disabled and events were added, then
   * put the new config on the PMU.
   */
-void hw_perf_enable(void)
+static void fsl_emb_pmu_enable(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuhw;
         unsigned long flags;
@@ -262,8 +265,8 @@ static int collect_events(struct perf_event *group, int max_count,
         return n;
  }
  
-/* perf must be disabled, context locked on entry */
-static int fsl_emb_pmu_enable(struct perf_event *event)
+/* context locked on entry */
+static int fsl_emb_pmu_add(struct perf_event *event, int flags)
  {
         struct cpu_hw_events *cpuhw;
         int ret = -EAGAIN;
@@ -271,6 +274,7 @@ static int fsl_emb_pmu_enable(struct perf_event *event)
         u64 val;
         int i;
  
+       perf_pmu_disable(event->pmu);
         cpuhw = &get_cpu_var(cpu_hw_events);
  
         if (event->hw.config & FSL_EMB_EVENT_RESTRICTED)
@@ -301,6 +305,12 @@ static int fsl_emb_pmu_enable(struct perf_event *event)
                         val = 0x80000000L - left;
         }
         local64_set(&event->hw.prev_count, val);
+
+       if (!(flags & PERF_EF_START)) {
+               event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+               val = 0;
+       }
+
         write_pmc(i, val);
         perf_event_update_userpage(event);
  
@@ -310,15 +320,17 @@ static int fsl_emb_pmu_enable(struct perf_event *event)
         ret = 0;
   out:
         put_cpu_var(cpu_hw_events);
+       perf_pmu_enable(event->pmu);
         return ret;
  }
  
-/* perf must be disabled, context locked on entry */
-static void fsl_emb_pmu_disable(struct perf_event *event)
+/* context locked on entry */
+static void fsl_emb_pmu_del(struct perf_event *event, int flags)
  {
         struct cpu_hw_events *cpuhw;
         int i = event->hw.idx;
  
+       perf_pmu_disable(event->pmu);
         if (i < 0)
                 goto out;
  
@@ -346,44 +358,57 @@ static void fsl_emb_pmu_disable(struct perf_event *event)
         cpuhw->n_events--;
  
   out:
+       perf_pmu_enable(event->pmu);
         put_cpu_var(cpu_hw_events);
  }
  
-/*
- * Re-enable interrupts on a event after they were throttled
- * because they were coming too fast.
- *
- * Context is locked on entry, but perf is not disabled.
- */
-static void fsl_emb_pmu_unthrottle(struct perf_event *event)
+static void fsl_emb_pmu_start(struct perf_event *event, int ef_flags)
  {
-       s64 val, left;
         unsigned long flags;
+       s64 left;
  
         if (event->hw.idx < 0 || !event->hw.sample_period)
                 return;
+
+       if (!(event->hw.state & PERF_HES_STOPPED))
+               return;
+
+       if (ef_flags & PERF_EF_RELOAD)
+               WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+
         local_irq_save(flags);
-       perf_disable();
-       fsl_emb_pmu_read(event);
-       left = event->hw.sample_period;
-       event->hw.last_period = left;
-       val = 0;
-       if (left < 0x80000000L)
-               val = 0x80000000L - left;
-       write_pmc(event->hw.idx, val);
-       local64_set(&event->hw.prev_count, val);
-       local64_set(&event->hw.period_left, left);
+       perf_pmu_disable(event->pmu);
+
+       event->hw.state = 0;
+       left = local64_read(&event->hw.period_left);
+       write_pmc(event->hw.idx, left);
+
         perf_event_update_userpage(event);
-       perf_enable();
+       perf_pmu_enable(event->pmu);
         local_irq_restore(flags);
  }
  
-static struct pmu fsl_emb_pmu = {
-       .enable         = fsl_emb_pmu_enable,
-       .disable        = fsl_emb_pmu_disable,
-       .read           = fsl_emb_pmu_read,
-       .unthrottle     = fsl_emb_pmu_unthrottle,
-};
+static void fsl_emb_pmu_stop(struct perf_event *event, int ef_flags)
+{
+       unsigned long flags;
+
+       if (event->hw.idx < 0 || !event->hw.sample_period)
+               return;
+
+       if (event->hw.state & PERF_HES_STOPPED)
+               return;
+
+       local_irq_save(flags);
+       perf_pmu_disable(event->pmu);
+
+       fsl_emb_pmu_read(event);
+       event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+       write_pmc(event->hw.idx, 0);
+
+       perf_event_update_userpage(event);
+       perf_pmu_enable(event->pmu);
+       local_irq_restore(flags);
+}
  
  /*
   * Release the PMU if this is the last perf_event.
@@ -428,7 +453,7 @@ static int hw_perf_cache_event(u64 config, u64 *eventp)
         return 0;
  }
  
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+static int fsl_emb_pmu_event_init(struct perf_event *event)
  {
         u64 ev;
         struct perf_event *events[MAX_HWEVENTS];
@@ -441,14 +466,14 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
         case PERF_TYPE_HARDWARE:
                 ev = event->attr.config;
                 if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
-                       return ERR_PTR(-EOPNOTSUPP);
+                       return -EOPNOTSUPP;
                 ev = ppmu->generic_events[ev];
                 break;
  
         case PERF_TYPE_HW_CACHE:
                 err = hw_perf_cache_event(event->attr.config, &ev);
                 if (err)
-                       return ERR_PTR(err);
+                       return err;
                 break;
  
         case PERF_TYPE_RAW:
@@ -456,12 +481,12 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
                 break;
  
         default:
-               return ERR_PTR(-EINVAL);
+               return -ENOENT;
         }
  
         event->hw.config = ppmu->xlate_event(ev);
         if (!(event->hw.config & FSL_EMB_EVENT_VALID))
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
  
         /*
          * If this is in a group, check if it can go on with all the
@@ -473,7 +498,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
                 n = collect_events(event->group_leader,
                                    ppmu->n_counter - 1, events);
                 if (n < 0)
-                       return ERR_PTR(-EINVAL);
+                       return -EINVAL;
         }
  
         if (event->hw.config & FSL_EMB_EVENT_RESTRICTED) {
@@ -484,7 +509,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
                 }
  
                 if (num_restricted >= ppmu->n_restricted)
-                       return ERR_PTR(-EINVAL);
+                       return -EINVAL;
         }
  
         event->hw.idx = -1;
@@ -497,7 +522,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
         if (event->attr.exclude_kernel)
                 event->hw.config_base |= PMLCA_FCS;
         if (event->attr.exclude_idle)
-               return ERR_PTR(-ENOTSUPP);
+               return -ENOTSUPP;
  
         event->hw.last_period = event->hw.sample_period;
         local64_set(&event->hw.period_left, event->hw.last_period);
@@ -523,11 +548,20 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
         }
         event->destroy = hw_perf_event_destroy;
  
-       if (err)
-               return ERR_PTR(err);
-       return &fsl_emb_pmu;
+       return err;
  }
  
+static struct pmu fsl_emb_pmu = {
+       .pmu_enable     = fsl_emb_pmu_enable,
+       .pmu_disable    = fsl_emb_pmu_disable,
+       .event_init     = fsl_emb_pmu_event_init,
+       .add            = fsl_emb_pmu_add,
+       .del            = fsl_emb_pmu_del,
+       .start          = fsl_emb_pmu_start,
+       .stop           = fsl_emb_pmu_stop,
+       .read           = fsl_emb_pmu_read,
+};
+
  /*
   * A counter has overflowed; update its count and record
   * things if requested.  Note that interrupts are hard-disabled
@@ -540,6 +574,11 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
         s64 prev, delta, left;
         int record = 0;
  
+       if (event->hw.state & PERF_HES_STOPPED) {
+               write_pmc(event->hw.idx, 0);
+               return;
+       }
+
         /* we don't have to worry about interrupts here */
         prev = local64_read(&event->hw.prev_count);
         delta = (val - prev) & 0xfffffffful;
@@ -562,6 +601,11 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
                         val = 0x80000000LL - left;
         }
  
+       write_pmc(event->hw.idx, val);
+       local64_set(&event->hw.prev_count, val);
+       local64_set(&event->hw.period_left, left);
+       perf_event_update_userpage(event);
+
         /*
          * Finally record data if requested.
          */
@@ -571,23 +615,9 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
                 perf_sample_data_init(&data, 0);
                 data.period = event->hw.last_period;
  
-               if (perf_event_overflow(event, nmi, &data, regs)) {
-                       /*
-                        * Interrupts are coming too fast - throttle them
-                        * by setting the event to 0, so it will be
-                        * at least 2^30 cycles until the next interrupt
-                        * (assuming each event counts at most 2 counts
-                        * per cycle).
-                        */
-                       val = 0;
-                       left = ~0ULL >> 1;
-               }
+               if (perf_event_overflow(event, nmi, &data, regs))
+                       fsl_emb_pmu_stop(event, 0);
         }
-
-       write_pmc(event->hw.idx, val);
-       local64_set(&event->hw.prev_count, val);
-       local64_set(&event->hw.period_left, left);
-       perf_event_update_userpage(event);
  }
  
  static void perf_event_interrupt(struct pt_regs *regs)
@@ -651,5 +681,7 @@ int register_fsl_emb_pmu(struct fsl_emb_pmu *pmu)
         pr_info("%s performance monitor hardware support registered\n",
                 pmu->name);
  
+       perf_pmu_register(&fsl_emb_pmu);
+
         return 0;
  }
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c

index 8533b3b83f5d0e35b50fb39eefb663e6d81975b0..54888eb10c3b4686f5b3d29a7166afc87bc76da0 100644 (file)
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -53,7 +53,7 @@
  #include <linux/posix-timers.h>
  #include <linux/irq.h>
  #include <linux/delay.h>
-#include <linux/perf_event.h>
+#include <linux/irq_work.h>
  #include <asm/trace.h>
  
  #include <asm/io.h>
@@ -493,60 +493,60 @@ void __init iSeries_time_init_early(void)
  }
  #endif /* CONFIG_PPC_ISERIES */
  
-#ifdef CONFIG_PERF_EVENTS
+#ifdef CONFIG_IRQ_WORK
  
  /*
   * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
   */
  #ifdef CONFIG_PPC64
-static inline unsigned long test_perf_event_pending(void)
+static inline unsigned long test_irq_work_pending(void)
  {
         unsigned long x;
  
         asm volatile("lbz %0,%1(13)"
                 : "=r" (x)
-               : "i" (offsetof(struct paca_struct, perf_event_pending)));
+               : "i" (offsetof(struct paca_struct, irq_work_pending)));
         return x;
  }
  
-static inline void set_perf_event_pending_flag(void)
+static inline void set_irq_work_pending_flag(void)
  {
         asm volatile("stb %0,%1(13)" : :
                 "r" (1),
-               "i" (offsetof(struct paca_struct, perf_event_pending)));
+               "i" (offsetof(struct paca_struct, irq_work_pending)));
  }
  
-static inline void clear_perf_event_pending(void)
+static inline void clear_irq_work_pending(void)
  {
         asm volatile("stb %0,%1(13)" : :
                 "r" (0),
-               "i" (offsetof(struct paca_struct, perf_event_pending)));
+               "i" (offsetof(struct paca_struct, irq_work_pending)));
  }
  
  #else /* 32-bit */
  
-DEFINE_PER_CPU(u8, perf_event_pending);
+DEFINE_PER_CPU(u8, irq_work_pending);
  
-#define set_perf_event_pending_flag()  __get_cpu_var(perf_event_pending) = 1
-#define test_perf_event_pending()      __get_cpu_var(perf_event_pending)
-#define clear_perf_event_pending()     __get_cpu_var(perf_event_pending) = 0
+#define set_irq_work_pending_flag()    __get_cpu_var(irq_work_pending) = 1
+#define test_irq_work_pending()                __get_cpu_var(irq_work_pending)
+#define clear_irq_work_pending()       __get_cpu_var(irq_work_pending) = 0
  
  #endif /* 32 vs 64 bit */
  
-void set_perf_event_pending(void)
+void set_irq_work_pending(void)
  {
         preempt_disable();
-       set_perf_event_pending_flag();
+       set_irq_work_pending_flag();
         set_dec(1);
         preempt_enable();
  }
  
-#else  /* CONFIG_PERF_EVENTS */
+#else  /* CONFIG_IRQ_WORK */
  
-#define test_perf_event_pending()      0
-#define clear_perf_event_pending()
+#define test_irq_work_pending()        0
+#define clear_irq_work_pending()
  
-#endif /* CONFIG_PERF_EVENTS */
+#endif /* CONFIG_IRQ_WORK */
  
  /*
   * For iSeries shared processors, we have to let the hypervisor
@@ -587,9 +587,9 @@ void timer_interrupt(struct pt_regs * regs)
  
         calculate_steal_time();
  
-       if (test_perf_event_pending()) {
-               clear_perf_event_pending();
-               perf_event_do_pending();
+       if (test_irq_work_pending()) {
+               clear_irq_work_pending();
+               irq_work_run();
         }
  
  #ifdef CONFIG_PPC_ISERIES
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig

index 74a2f1b607a473b6bf39ced372bf89b96e0db452..75976a14194770390154ca124d9e94f95eedecf4 100644 (file)
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -95,6 +95,7 @@ config S390
         select HAVE_KVM if 64BIT
         select HAVE_ARCH_TRACEHOOK
         select INIT_ALL_POSSIBLE
+       select HAVE_IRQ_WORK
         select HAVE_PERF_EVENTS
         select HAVE_KERNEL_GZIP
         select HAVE_KERNEL_BZIP2
diff --git a/arch/s390/include/asm/hardirq.h b/arch/s390/include/asm/hardirq.h

index 498bc38923856ddb1957ddd0ac934b7c7e510003..881d94590aeb4c84efb8e5c58fae68b73b70f6e5 100644 (file)
--- a/arch/s390/include/asm/hardirq.h
+++ b/arch/s390/include/asm/hardirq.h
@@ -12,10 +12,6 @@
  #ifndef __ASM_HARDIRQ_H
  #define __ASM_HARDIRQ_H
  
-#include <linux/threads.h>
-#include <linux/sched.h>
-#include <linux/cache.h>
-#include <linux/interrupt.h>
  #include <asm/lowcore.h>
  
  #define local_softirq_pending() (S390_lowcore.softirq_pending)
diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h

index 3840cbe77637fe1ffef8c5baceb2c82c8ed7759d..a75f168d2718578544914c8a00eb0e1245ef9c8f 100644 (file)
--- a/arch/s390/include/asm/perf_event.h
+++ b/arch/s390/include/asm/perf_event.h
@@ -4,7 +4,6 @@
   * Copyright 2009 Martin Schwidefsky, IBM Corporation.
   */
  
-static inline void set_perf_event_pending(void) {}
-static inline void clear_perf_event_pending(void) {}
+/* Empty, just to avoid compiling error */
  
  #define PERF_EVENT_INDEX_OFFSET 0
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig

index 33990fa95af013a64e081c4bfc411f58228f64cb..35b6879628a04feccd1dd2071846cb92563f5586 100644 (file)
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -16,6 +16,7 @@ config SUPERH
         select HAVE_ARCH_TRACEHOOK
         select HAVE_DMA_API_DEBUG
         select HAVE_DMA_ATTRS
+       select HAVE_IRQ_WORK
         select HAVE_PERF_EVENTS
         select PERF_USE_VMALLOC
         select HAVE_KERNEL_GZIP
@@ -249,6 +250,11 @@ config ARCH_SHMOBILE
         select PM
         select PM_RUNTIME
  
+config CPU_HAS_PMU
+       depends on CPU_SH4 || CPU_SH4A
+       default y
+       bool
+
  if SUPERH32
  
  choice
@@ -738,6 +744,14 @@ config GUSA_RB
           LLSC, this should be more efficient than the other alternative of
           disabling interrupts around the atomic sequence.
  
+config HW_PERF_EVENTS
+       bool "Enable hardware performance counter support for perf events"
+       depends on PERF_EVENTS && CPU_HAS_PMU
+       default y
+       help
+         Enable hardware performance counter support for perf events. If
+         disabled, perf events will use software events only.
+
  source "drivers/sh/Kconfig"
  
  endmenu
diff --git a/arch/sh/include/asm/perf_event.h b/arch/sh/include/asm/perf_event.h

index 3d0c9f36d15050bb049fe8cfdbe4e7ae73f08f6d..14308bed7ea510cb6b43429887ca727cf1f0510d 100644 (file)
--- a/arch/sh/include/asm/perf_event.h
+++ b/arch/sh/include/asm/perf_event.h
@@ -26,11 +26,4 @@ extern int register_sh_pmu(struct sh_pmu *);
  extern int reserve_pmc_hardware(void);
  extern void release_pmc_hardware(void);
  
-static inline void set_perf_event_pending(void)
-{
-       /* Nothing to see here, move along. */
-}
-
-#define PERF_EVENT_INDEX_OFFSET        0
-
  #endif /* __ASM_SH_PERF_EVENT_H */
diff --git a/arch/sh/kernel/perf_callchain.c b/arch/sh/kernel/perf_callchain.c

index a9dd3abde28e3f45bbd7d7654e8717c13aed8f34..d5ca1ef50fa9694a1942a8c304bd2c5aa69d9388 100644 (file)
--- a/arch/sh/kernel/perf_callchain.c
+++ b/arch/sh/kernel/perf_callchain.c
@@ -14,11 +14,6 @@
  #include <asm/unwinder.h>
  #include <asm/ptrace.h>
  
-static inline void callchain_store(struct perf_callchain_entry *entry, u64 ip)
-{
-       if (entry->nr < PERF_MAX_STACK_DEPTH)
-               entry->ip[entry->nr++] = ip;
-}
  
  static void callchain_warning(void *data, char *msg)
  {
@@ -39,7 +34,7 @@ static void callchain_address(void *data, unsigned long addr, int reliable)
         struct perf_callchain_entry *entry = data;
  
         if (reliable)
-               callchain_store(entry, addr);
+               perf_callchain_store(entry, addr);
  }
  
  static const struct stacktrace_ops callchain_ops = {
@@ -49,47 +44,10 @@ static const struct stacktrace_ops callchain_ops = {
         .address        = callchain_address,
  };
  
-static void
-perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
+void
+perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
  {
-       callchain_store(entry, PERF_CONTEXT_KERNEL);
-       callchain_store(entry, regs->pc);
+       perf_callchain_store(entry, regs->pc);
  
         unwind_stack(NULL, regs, NULL, &callchain_ops, entry);
  }
-
-static void
-perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
-       int is_user;
-
-       if (!regs)
-               return;
-
-       is_user = user_mode(regs);
-
-       if (is_user && current->state != TASK_RUNNING)
-               return;
-
-       /*
-        * Only the kernel side is implemented for now.
-        */
-       if (!is_user)
-               perf_callchain_kernel(regs, entry);
-}
-
-/*
- * No need for separate IRQ and NMI entries.
- */
-static DEFINE_PER_CPU(struct perf_callchain_entry, callchain);
-
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
-       struct perf_callchain_entry *entry = &__get_cpu_var(callchain);
-
-       entry->nr = 0;
-
-       perf_do_callchain(regs, entry);
-
-       return entry;
-}
diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c

index 7a3dc356725839f2cf8579491efd8d02ba11b483..5a4b33435650c8ea108668d8e7e30786a20bd335 100644 (file)
--- a/arch/sh/kernel/perf_event.c
+++ b/arch/sh/kernel/perf_event.c
@@ -59,6 +59,24 @@ static inline int sh_pmu_initialized(void)
         return !!sh_pmu;
  }
  
+const char *perf_pmu_name(void)
+{
+       if (!sh_pmu)
+               return NULL;
+
+       return sh_pmu->name;
+}
+EXPORT_SYMBOL_GPL(perf_pmu_name);
+
+int perf_num_counters(void)
+{
+       if (!sh_pmu)
+               return 0;
+
+       return sh_pmu->num_events;
+}
+EXPORT_SYMBOL_GPL(perf_num_counters);
+
  /*
   * Release the PMU if this is the last perf_event.
   */
@@ -206,50 +224,80 @@ again:
         local64_add(delta, &event->count);
  }
  
-static void sh_pmu_disable(struct perf_event *event)
+static void sh_pmu_stop(struct perf_event *event, int flags)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         struct hw_perf_event *hwc = &event->hw;
         int idx = hwc->idx;
  
-       clear_bit(idx, cpuc->active_mask);
-       sh_pmu->disable(hwc, idx);
+       if (!(event->hw.state & PERF_HES_STOPPED)) {
+               sh_pmu->disable(hwc, idx);
+               cpuc->events[idx] = NULL;
+               event->hw.state |= PERF_HES_STOPPED;
+       }
+
+       if ((flags & PERF_EF_UPDATE) && !(event->hw.state & PERF_HES_UPTODATE)) {
+               sh_perf_event_update(event, &event->hw, idx);
+               event->hw.state |= PERF_HES_UPTODATE;
+       }
+}
+
+static void sh_pmu_start(struct perf_event *event, int flags)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+       int idx = hwc->idx;
+
+       if (WARN_ON_ONCE(idx == -1))
+               return;
+
+       if (flags & PERF_EF_RELOAD)
+               WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
  
-       barrier();
+       cpuc->events[idx] = event;
+       event->hw.state = 0;
+       sh_pmu->enable(hwc, idx);
+}
  
-       sh_perf_event_update(event, &event->hw, idx);
+static void sh_pmu_del(struct perf_event *event, int flags)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
  
-       cpuc->events[idx] = NULL;
-       clear_bit(idx, cpuc->used_mask);
+       sh_pmu_stop(event, PERF_EF_UPDATE);
+       __clear_bit(event->hw.idx, cpuc->used_mask);
  
         perf_event_update_userpage(event);
  }
  
-static int sh_pmu_enable(struct perf_event *event)
+static int sh_pmu_add(struct perf_event *event, int flags)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         struct hw_perf_event *hwc = &event->hw;
         int idx = hwc->idx;
+       int ret = -EAGAIN;
+
+       perf_pmu_disable(event->pmu);
  
-       if (test_and_set_bit(idx, cpuc->used_mask)) {
+       if (__test_and_set_bit(idx, cpuc->used_mask)) {
                 idx = find_first_zero_bit(cpuc->used_mask, sh_pmu->num_events);
                 if (idx == sh_pmu->num_events)
-                       return -EAGAIN;
+                       goto out;
  
-               set_bit(idx, cpuc->used_mask);
+               __set_bit(idx, cpuc->used_mask);
                 hwc->idx = idx;
         }
  
         sh_pmu->disable(hwc, idx);
  
-       cpuc->events[idx] = event;
-       set_bit(idx, cpuc->active_mask);
-
-       sh_pmu->enable(hwc, idx);
+       event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+       if (flags & PERF_EF_START)
+               sh_pmu_start(event, PERF_EF_RELOAD);
  
         perf_event_update_userpage(event);
-
-       return 0;
+       ret = 0;
+out:
+       perf_pmu_enable(event->pmu);
+       return ret;
  }
  
  static void sh_pmu_read(struct perf_event *event)
@@ -257,24 +305,56 @@ static void sh_pmu_read(struct perf_event *event)
         sh_perf_event_update(event, &event->hw, event->hw.idx);
  }
  
-static const struct pmu pmu = {
-       .enable         = sh_pmu_enable,
-       .disable        = sh_pmu_disable,
-       .read           = sh_pmu_read,
-};
-
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+static int sh_pmu_event_init(struct perf_event *event)
  {
-       int err = __hw_perf_event_init(event);
+       int err;
+
+       switch (event->attr.type) {
+       case PERF_TYPE_RAW:
+       case PERF_TYPE_HW_CACHE:
+       case PERF_TYPE_HARDWARE:
+               err = __hw_perf_event_init(event);
+               break;
+
+       default:
+               return -ENOENT;
+       }
+
         if (unlikely(err)) {
                 if (event->destroy)
                         event->destroy(event);
-               return ERR_PTR(err);
         }
  
-       return &pmu;
+       return err;
+}
+
+static void sh_pmu_enable(struct pmu *pmu)
+{
+       if (!sh_pmu_initialized())
+               return;
+
+       sh_pmu->enable_all();
+}
+
+static void sh_pmu_disable(struct pmu *pmu)
+{
+       if (!sh_pmu_initialized())
+               return;
+
+       sh_pmu->disable_all();
  }
  
+static struct pmu pmu = {
+       .pmu_enable     = sh_pmu_enable,
+       .pmu_disable    = sh_pmu_disable,
+       .event_init     = sh_pmu_event_init,
+       .add            = sh_pmu_add,
+       .del            = sh_pmu_del,
+       .start          = sh_pmu_start,
+       .stop           = sh_pmu_stop,
+       .read           = sh_pmu_read,
+};
+
  static void sh_pmu_setup(int cpu)
  {
         struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);
@@ -299,32 +379,17 @@ sh_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
         return NOTIFY_OK;
  }
  
-void hw_perf_enable(void)
-{
-       if (!sh_pmu_initialized())
-               return;
-
-       sh_pmu->enable_all();
-}
-
-void hw_perf_disable(void)
-{
-       if (!sh_pmu_initialized())
-               return;
-
-       sh_pmu->disable_all();
-}
-
-int __cpuinit register_sh_pmu(struct sh_pmu *pmu)
+int __cpuinit register_sh_pmu(struct sh_pmu *_pmu)
  {
         if (sh_pmu)
                 return -EBUSY;
-       sh_pmu = pmu;
+       sh_pmu = _pmu;
  
-       pr_info("Performance Events: %s support registered\n", pmu->name);
+       pr_info("Performance Events: %s support registered\n", _pmu->name);
  
-       WARN_ON(pmu->num_events > MAX_HWEVENTS);
+       WARN_ON(_pmu->num_events > MAX_HWEVENTS);
  
+       perf_pmu_register(&pmu);
         perf_cpu_notifier(sh_pmu_notifier);
         return 0;
  }
diff --git a/arch/sh/oprofile/Makefile b/arch/sh/oprofile/Makefile

index 4886c5c1786c24dbacadccd492a9dbf5fa8bd075..e85aae73e3dcf12a4ef1bb06b344b901efd54c6f 100644 (file)
--- a/arch/sh/oprofile/Makefile
+++ b/arch/sh/oprofile/Makefile
@@ -6,4 +6,8 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
                 oprofilefs.o oprofile_stats.o \
                 timer_int.o )
  
+ifeq ($(CONFIG_HW_PERF_EVENTS),y)
+DRIVER_OBJS += $(addprefix ../../../drivers/oprofile/, oprofile_perf.o)
+endif
+
  oprofile-y     := $(DRIVER_OBJS) common.o backtrace.o
diff --git a/arch/sh/oprofile/common.c b/arch/sh/oprofile/common.c

index ac604937f3ee16fde018d6f5b0c094a189f4d2b5..e10d89376f9b79140adaa44f7fb63f0681604482 100644 (file)
--- a/arch/sh/oprofile/common.c
+++ b/arch/sh/oprofile/common.c
@@ -17,114 +17,45 @@
  #include <linux/init.h>
  #include <linux/errno.h>
  #include <linux/smp.h>
+#include <linux/perf_event.h>
  #include <asm/processor.h>
-#include "op_impl.h"
-
-static struct op_sh_model *model;
-
-static struct op_counter_config ctr[20];
  
+#ifdef CONFIG_HW_PERF_EVENTS
  extern void sh_backtrace(struct pt_regs * const regs, unsigned int depth);
  
-static int op_sh_setup(void)
-{
-       /* Pre-compute the values to stuff in the hardware registers.  */
-       model->reg_setup(ctr);
-
-       /* Configure the registers on all cpus.  */
-       on_each_cpu(model->cpu_setup, NULL, 1);
-
-        return 0;
-}
-
-static int op_sh_create_files(struct super_block *sb, struct dentry *root)
+char *op_name_from_perf_id(void)
  {
-       int i, ret = 0;
+       const char *pmu;
+       char buf[20];
+       int size;
  
-       for (i = 0; i < model->num_counters; i++) {
-               struct dentry *dir;
-               char buf[4];
+       pmu = perf_pmu_name();
+       if (!pmu)
+               return NULL;
  
-               snprintf(buf, sizeof(buf), "%d", i);
-               dir = oprofilefs_mkdir(sb, root, buf);
+       size = snprintf(buf, sizeof(buf), "sh/%s", pmu);
+       if (size > -1 && size < sizeof(buf))
+               return buf;
  
-               ret |= oprofilefs_create_ulong(sb, dir, "enabled", &ctr[i].enabled);
-               ret |= oprofilefs_create_ulong(sb, dir, "event", &ctr[i].event);
-               ret |= oprofilefs_create_ulong(sb, dir, "kernel", &ctr[i].kernel);
-               ret |= oprofilefs_create_ulong(sb, dir, "user", &ctr[i].user);
-
-               if (model->create_files)
-                       ret |= model->create_files(sb, dir);
-               else
-                       ret |= oprofilefs_create_ulong(sb, dir, "count", &ctr[i].count);
-
-               /* Dummy entries */
-               ret |= oprofilefs_create_ulong(sb, dir, "unit_mask", &ctr[i].unit_mask);
-       }
-
-       return ret;
+       return NULL;
  }
  
-static int op_sh_start(void)
+int __init oprofile_arch_init(struct oprofile_operations *ops)
  {
-       /* Enable performance monitoring for all counters.  */
-       on_each_cpu(model->cpu_start, NULL, 1);
+       ops->backtrace = sh_backtrace;
  
-       return 0;
+       return oprofile_perf_init(ops);
  }
  
-static void op_sh_stop(void)
+void __exit oprofile_arch_exit(void)
  {
-       /* Disable performance monitoring for all counters.  */
-       on_each_cpu(model->cpu_stop, NULL, 1);
+       oprofile_perf_exit();
  }
-
+#else
  int __init oprofile_arch_init(struct oprofile_operations *ops)
  {
-       struct op_sh_model *lmodel = NULL;
-       int ret;
-
-       /*
-        * Always assign the backtrace op. If the counter initialization
-        * fails, we fall back to the timer which will still make use of
-        * this.
-        */
-       ops->backtrace = sh_backtrace;
-
-       /*
-        * XXX
-        *
-        * All of the SH7750/SH-4A counters have been converted to perf,
-        * this infrastructure hook is left for other users until they've
-        * had a chance to convert over, at which point all of this
-        * will be deleted.
-        */
-
-       if (!lmodel)
-               return -ENODEV;
-       if (!(current_cpu_data.flags & CPU_HAS_PERF_COUNTER))
-               return -ENODEV;
-
-       ret = lmodel->init();
-       if (unlikely(ret != 0))
-               return ret;
-
-       model = lmodel;
-
-       ops->setup              = op_sh_setup;
-       ops->create_files       = op_sh_create_files;
-       ops->start              = op_sh_start;
-       ops->stop               = op_sh_stop;
-       ops->cpu_type           = lmodel->cpu_type;
-
-       printk(KERN_INFO "oprofile: using %s performance monitoring.\n",
-              lmodel->cpu_type);
-
-       return 0;
-}
-
-void oprofile_arch_exit(void)
-{
-       if (model && model->exit)
-               model->exit();
+       pr_info("oprofile: hardware counters not available\n");
+       return -ENODEV;
  }
+void __exit oprofile_arch_exit(void) {}
+#endif /* CONFIG_HW_PERF_EVENTS */
diff --git a/arch/sh/oprofile/op_impl.h b/arch/sh/oprofile/op_impl.h

deleted file mode 100644 (file)

index 1244479..0000000
--- a/arch/sh/oprofile/op_impl.h
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef __OP_IMPL_H
-#define __OP_IMPL_H
-
-/* Per-counter configuration as set via oprofilefs.  */
-struct op_counter_config {
-       unsigned long enabled;
-       unsigned long event;
-
-       unsigned long count;
-
-       /* Dummy values for userspace tool compliance */
-       unsigned long kernel;
-       unsigned long user;
-       unsigned long unit_mask;
-};
-
-/* Per-architecture configury and hooks.  */
-struct op_sh_model {
-       void (*reg_setup)(struct op_counter_config *);
-       int (*create_files)(struct super_block *sb, struct dentry *dir);
-       void (*cpu_setup)(void *dummy);
-       int (*init)(void);
-       void (*exit)(void);
-       void (*cpu_start)(void *args);
-       void (*cpu_stop)(void *args);
-       char *cpu_type;
-       unsigned char num_counters;
-};
-
-/* arch/sh/oprofile/common.c */
-extern void sh_backtrace(struct pt_regs * const regs, unsigned int depth);
-
-#endif /* __OP_IMPL_H */
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig

index 491e9d6de1912d56cea2b21f51bf8cfd7980e278..3e9d31401fb24e9dd1b6b3e67add09a4ea7f1468 100644 (file)
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -26,10 +26,12 @@ config SPARC
         select ARCH_WANT_OPTIONAL_GPIOLIB
         select RTC_CLASS
         select RTC_DRV_M48T59
+       select HAVE_IRQ_WORK
         select HAVE_PERF_EVENTS
         select PERF_USE_VMALLOC
         select HAVE_DMA_ATTRS
         select HAVE_DMA_API_DEBUG
+       select HAVE_ARCH_JUMP_LABEL
  
  config SPARC32
         def_bool !64BIT
@@ -53,6 +55,7 @@ config SPARC64
         select RTC_DRV_BQ4802
         select RTC_DRV_SUN4V
         select RTC_DRV_STARFIRE
+       select HAVE_IRQ_WORK
         select HAVE_PERF_EVENTS
         select PERF_USE_VMALLOC
  
diff --git a/arch/sparc/include/asm/jump_label.h b/arch/sparc/include/asm/jump_label.h

new file mode 100644 (file)

index 0000000..62e66d7
--- /dev/null
+++ b/arch/sparc/include/asm/jump_label.h
@@ -0,0 +1,32 @@
+#ifndef _ASM_SPARC_JUMP_LABEL_H
+#define _ASM_SPARC_JUMP_LABEL_H
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <asm/system.h>
+
+#define JUMP_LABEL_NOP_SIZE 4
+
+#define JUMP_LABEL(key, label)                                 \
+       do {                                                    \
+               asm goto("1:\n\t"                               \
+                        "nop\n\t"                              \
+                        "nop\n\t"                              \
+                        ".pushsection __jump_table,  \"a\"\n\t"\
+                        ".word 1b, %l[" #label "], %c0\n\t"    \
+                        ".popsection \n\t"                     \
+                        : :  "i" (key) :  : label);\
+       } while (0)
+
+#endif /* __KERNEL__ */
+
+typedef u32 jump_label_t;
+
+struct jump_entry {
+       jump_label_t code;
+       jump_label_t target;
+       jump_label_t key;
+};
+
+#endif
diff --git a/arch/sparc/include/asm/perf_event.h b/arch/sparc/include/asm/perf_event.h

index 727af70646cbddff92f36e6c47ad782c9018ea11..6e8bfa1786dab1f45d3dff5a1dcacc31a08d4844 100644 (file)
--- a/arch/sparc/include/asm/perf_event.h
+++ b/arch/sparc/include/asm/perf_event.h
@@ -1,10 +1,6 @@
  #ifndef __ASM_SPARC_PERF_EVENT_H
  #define __ASM_SPARC_PERF_EVENT_H
  
-extern void set_perf_event_pending(void);
-
-#define        PERF_EVENT_INDEX_OFFSET 0
-
  #ifdef CONFIG_PERF_EVENTS
  #include <asm/ptrace.h>
  
diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile

index 0c2dc1f24a9a74adb05299a89ee07f8ed2bc1eb9..599398fbbc7cb78fd2f8849400d9092a95ffa786 100644 (file)
--- a/arch/sparc/kernel/Makefile
+++ b/arch/sparc/kernel/Makefile
@@ -119,3 +119,5 @@ obj-$(CONFIG_COMPAT)    += $(audit--y)
  
  pc--$(CONFIG_PERF_EVENTS) := perf_event.o
  obj-$(CONFIG_SPARC64)  += $(pc--y)
+
+obj-$(CONFIG_SPARC64)  += jump_label.o
diff --git a/arch/sparc/kernel/jump_label.c b/arch/sparc/kernel/jump_label.c

new file mode 100644 (file)

index 0000000..ea2dafc
--- /dev/null
+++ b/arch/sparc/kernel/jump_label.c
@@ -0,0 +1,47 @@
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+
+#include <linux/jump_label.h>
+#include <linux/memory.h>
+
+#ifdef HAVE_JUMP_LABEL
+
+void arch_jump_label_transform(struct jump_entry *entry,
+                              enum jump_label_type type)
+{
+       u32 val;
+       u32 *insn = (u32 *) (unsigned long) entry->code;
+
+       if (type == JUMP_LABEL_ENABLE) {
+               s32 off = (s32)entry->target - (s32)entry->code;
+
+#ifdef CONFIG_SPARC64
+               /* ba,pt %xcc, . + (off << 2) */
+               val = 0x10680000 | ((u32) off >> 2);
+#else
+               /* ba . + (off << 2) */
+               val = 0x10800000 | ((u32) off >> 2);
+#endif
+       } else {
+               val = 0x01000000;
+       }
+
+       get_online_cpus();
+       mutex_lock(&text_mutex);
+       *insn = val;
+       flushi(insn);
+       mutex_unlock(&text_mutex);
+       put_online_cpus();
+}
+
+void arch_jump_label_text_poke_early(jump_label_t addr)
+{
+       u32 *insn_p = (u32 *) (unsigned long) addr;
+
+       *insn_p = 0x01000000;
+       flushi(insn_p);
+}
+
+#endif
diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c

index f848aadf54dc1c2c1feb537752fdf3410efbe9d8..ee3c7dde8d9fbd5af21f5dbf555628eff1c64b9c 100644 (file)
--- a/arch/sparc/kernel/module.c
+++ b/arch/sparc/kernel/module.c
@@ -18,6 +18,9 @@
  #include <asm/spitfire.h>
  
  #ifdef CONFIG_SPARC64
+
+#include <linux/jump_label.h>
+
  static void *module_map(unsigned long size)
  {
         struct vm_struct *area;
@@ -227,6 +230,9 @@ int module_finalize(const Elf_Ehdr *hdr,
                     const Elf_Shdr *sechdrs,
                     struct module *me)
  {
+       /* make jump label nops */
+       jump_label_apply_nops(me);
+
         /* Cheetah's I-cache is fully coherent.  */
         if (tlb_type == spitfire) {
                 unsigned long va;
diff --git a/arch/sparc/kernel/pcr.c b/arch/sparc/kernel/pcr.c

index c4a6a50b4849a64c0f98759921267fbe76c3cfe7..b87873c0e8ea5f72be8ffe784e1035911b05a68c 100644 (file)
--- a/arch/sparc/kernel/pcr.c
+++ b/arch/sparc/kernel/pcr.c
@@ -7,7 +7,7 @@
  #include <linux/init.h>
  #include <linux/irq.h>
  
-#include <linux/perf_event.h>
+#include <linux/irq_work.h>
  #include <linux/ftrace.h>
  
  #include <asm/pil.h>
@@ -43,14 +43,14 @@ void __irq_entry deferred_pcr_work_irq(int irq, struct pt_regs *regs)
  
         old_regs = set_irq_regs(regs);
         irq_enter();
-#ifdef CONFIG_PERF_EVENTS
-       perf_event_do_pending();
+#ifdef CONFIG_IRQ_WORK
+       irq_work_run();
  #endif
         irq_exit();
         set_irq_regs(old_regs);
  }
  
-void set_perf_event_pending(void)
+void arch_irq_work_raise(void)
  {
         set_softint(1 << PIL_DEFERRED_PCR_WORK);
  }
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c

index 6318e622cfb065d9da81e40b766c12dc0a6e2e7b..0d6deb55a2ae7e4189b5ab60aec81cd8df28adb6 100644 (file)
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -658,13 +658,16 @@ static u64 maybe_change_configuration(struct cpu_hw_events *cpuc, u64 pcr)
  
                 enc = perf_event_get_enc(cpuc->events[i]);
                 pcr &= ~mask_for_index(idx);
-               pcr |= event_encoding(enc, idx);
+               if (hwc->state & PERF_HES_STOPPED)
+                       pcr |= nop_for_index(idx);
+               else
+                       pcr |= event_encoding(enc, idx);
         }
  out:
         return pcr;
  }
  
-void hw_perf_enable(void)
+static void sparc_pmu_enable(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         u64 pcr;
@@ -691,7 +694,7 @@ void hw_perf_enable(void)
         pcr_ops->write(cpuc->pcr);
  }
  
-void hw_perf_disable(void)
+static void sparc_pmu_disable(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         u64 val;
@@ -710,19 +713,65 @@ void hw_perf_disable(void)
         pcr_ops->write(cpuc->pcr);
  }
  
-static void sparc_pmu_disable(struct perf_event *event)
+static int active_event_index(struct cpu_hw_events *cpuc,
+                             struct perf_event *event)
+{
+       int i;
+
+       for (i = 0; i < cpuc->n_events; i++) {
+               if (cpuc->event[i] == event)
+                       break;
+       }
+       BUG_ON(i == cpuc->n_events);
+       return cpuc->current_idx[i];
+}
+
+static void sparc_pmu_start(struct perf_event *event, int flags)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+       int idx = active_event_index(cpuc, event);
+
+       if (flags & PERF_EF_RELOAD) {
+               WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+               sparc_perf_event_set_period(event, &event->hw, idx);
+       }
+
+       event->hw.state = 0;
+
+       sparc_pmu_enable_event(cpuc, &event->hw, idx);
+}
+
+static void sparc_pmu_stop(struct perf_event *event, int flags)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+       int idx = active_event_index(cpuc, event);
+
+       if (!(event->hw.state & PERF_HES_STOPPED)) {
+               sparc_pmu_disable_event(cpuc, &event->hw, idx);
+               event->hw.state |= PERF_HES_STOPPED;
+       }
+
+       if (!(event->hw.state & PERF_HES_UPTODATE) && (flags & PERF_EF_UPDATE)) {
+               sparc_perf_event_update(event, &event->hw, idx);
+               event->hw.state |= PERF_HES_UPTODATE;
+       }
+}
+
+static void sparc_pmu_del(struct perf_event *event, int _flags)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-       struct hw_perf_event *hwc = &event->hw;
         unsigned long flags;
         int i;
  
         local_irq_save(flags);
-       perf_disable();
+       perf_pmu_disable(event->pmu);
  
         for (i = 0; i < cpuc->n_events; i++) {
                 if (event == cpuc->event[i]) {
-                       int idx = cpuc->current_idx[i];
+                       /* Absorb the final count and turn off the
+                        * event.
+                        */
+                       sparc_pmu_stop(event, PERF_EF_UPDATE);
  
                         /* Shift remaining entries down into
                          * the existing slot.
@@ -734,13 +783,6 @@ static void sparc_pmu_disable(struct perf_event *event)
                                         cpuc->current_idx[i];
                         }
  
-                       /* Absorb the final count and turn off the
-                        * event.
-                        */
-                       sparc_pmu_disable_event(cpuc, hwc, idx);
-                       barrier();
-                       sparc_perf_event_update(event, hwc, idx);
-
                         perf_event_update_userpage(event);
  
                         cpuc->n_events--;
@@ -748,23 +790,10 @@ static void sparc_pmu_disable(struct perf_event *event)
                 }
         }
  
-       perf_enable();
+       perf_pmu_enable(event->pmu);
         local_irq_restore(flags);
  }
  
-static int active_event_index(struct cpu_hw_events *cpuc,
-                             struct perf_event *event)
-{
-       int i;
-
-       for (i = 0; i < cpuc->n_events; i++) {
-               if (cpuc->event[i] == event)
-                       break;
-       }
-       BUG_ON(i == cpuc->n_events);
-       return cpuc->current_idx[i];
-}
-
  static void sparc_pmu_read(struct perf_event *event)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -774,15 +803,6 @@ static void sparc_pmu_read(struct perf_event *event)
         sparc_perf_event_update(event, hwc, idx);
  }
  
-static void sparc_pmu_unthrottle(struct perf_event *event)
-{
-       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-       int idx = active_event_index(cpuc, event);
-       struct hw_perf_event *hwc = &event->hw;
-
-       sparc_pmu_enable_event(cpuc, hwc, idx);
-}
-
  static atomic_t active_events = ATOMIC_INIT(0);
  static DEFINE_MUTEX(pmc_grab_mutex);
  
@@ -877,7 +897,7 @@ static int sparc_check_constraints(struct perf_event **evts,
         if (!n_ev)
                 return 0;
  
-       if (n_ev > perf_max_events)
+       if (n_ev > MAX_HWEVENTS)
                 return -1;
  
         msk0 = perf_event_get_msk(events[0]);
@@ -984,23 +1004,27 @@ static int collect_events(struct perf_event *group, int max_count,
         return n;
  }
  
-static int sparc_pmu_enable(struct perf_event *event)
+static int sparc_pmu_add(struct perf_event *event, int ef_flags)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         int n0, ret = -EAGAIN;
         unsigned long flags;
  
         local_irq_save(flags);
-       perf_disable();
+       perf_pmu_disable(event->pmu);
  
         n0 = cpuc->n_events;
-       if (n0 >= perf_max_events)
+       if (n0 >= MAX_HWEVENTS)
                 goto out;
  
         cpuc->event[n0] = event;
         cpuc->events[n0] = event->hw.event_base;
         cpuc->current_idx[n0] = PIC_NO_INDEX;
  
+       event->hw.state = PERF_HES_UPTODATE;
+       if (!(ef_flags & PERF_EF_START))
+               event->hw.state |= PERF_HES_STOPPED;
+
         /*
          * If group events scheduling transaction was started,
          * skip the schedulability test here, it will be peformed
@@ -1020,12 +1044,12 @@ nocheck:
  
         ret = 0;
  out:
-       perf_enable();
+       perf_pmu_enable(event->pmu);
         local_irq_restore(flags);
         return ret;
  }
  
-static int __hw_perf_event_init(struct perf_event *event)
+static int sparc_pmu_event_init(struct perf_event *event)
  {
         struct perf_event_attr *attr = &event->attr;
         struct perf_event *evts[MAX_HWEVENTS];
@@ -1038,22 +1062,33 @@ static int __hw_perf_event_init(struct perf_event *event)
         if (atomic_read(&nmi_active) < 0)
                 return -ENODEV;
  
-       pmap = NULL;
-       if (attr->type == PERF_TYPE_HARDWARE) {
+       switch (attr->type) {
+       case PERF_TYPE_HARDWARE:
                 if (attr->config >= sparc_pmu->max_events)
                         return -EINVAL;
                 pmap = sparc_pmu->event_map(attr->config);
-       } else if (attr->type == PERF_TYPE_HW_CACHE) {
+               break;
+
+       case PERF_TYPE_HW_CACHE:
                 pmap = sparc_map_cache_event(attr->config);
                 if (IS_ERR(pmap))
                         return PTR_ERR(pmap);
-       } else if (attr->type != PERF_TYPE_RAW)
-               return -EOPNOTSUPP;
+               break;
+
+       case PERF_TYPE_RAW:
+               pmap = NULL;
+               break;
+
+       default:
+               return -ENOENT;
+
+       }
  
         if (pmap) {
                 hwc->event_base = perf_event_encode(pmap);
         } else {
-               /* User gives us "(encoding << 16) | pic_mask" for
+               /*
+                * User gives us "(encoding << 16) | pic_mask" for
                  * PERF_TYPE_RAW events.
                  */
                 hwc->event_base = attr->config;
@@ -1071,7 +1106,7 @@ static int __hw_perf_event_init(struct perf_event *event)
         n = 0;
         if (event->group_leader != event) {
                 n = collect_events(event->group_leader,
-                                  perf_max_events - 1,
+                                  MAX_HWEVENTS - 1,
                                    evts, events, current_idx_dmy);
                 if (n < 0)
                         return -EINVAL;
@@ -1107,10 +1142,11 @@ static int __hw_perf_event_init(struct perf_event *event)
   * Set the flag to make pmu::enable() not perform the
   * schedulability test, it will be performed at commit time
   */
-static void sparc_pmu_start_txn(const struct pmu *pmu)
+static void sparc_pmu_start_txn(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
  
+       perf_pmu_disable(pmu);
         cpuhw->group_flag |= PERF_EVENT_TXN;
  }
  
@@ -1119,11 +1155,12 @@ static void sparc_pmu_start_txn(const struct pmu *pmu)
   * Clear the flag and pmu::enable() will perform the
   * schedulability test.
   */
-static void sparc_pmu_cancel_txn(const struct pmu *pmu)
+static void sparc_pmu_cancel_txn(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
  
         cpuhw->group_flag &= ~PERF_EVENT_TXN;
+       perf_pmu_enable(pmu);
  }
  
  /*
@@ -1131,7 +1168,7 @@ static void sparc_pmu_cancel_txn(const struct pmu *pmu)
   * Perform the group schedulability test as a whole
   * Return 0 if success
   */
-static int sparc_pmu_commit_txn(const struct pmu *pmu)
+static int sparc_pmu_commit_txn(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         int n;
@@ -1147,28 +1184,24 @@ static int sparc_pmu_commit_txn(const struct pmu *pmu)
                 return -EAGAIN;
  
         cpuc->group_flag &= ~PERF_EVENT_TXN;
+       perf_pmu_enable(pmu);
         return 0;
  }
  
-static const struct pmu pmu = {
-       .enable         = sparc_pmu_enable,
-       .disable        = sparc_pmu_disable,
+static struct pmu pmu = {
+       .pmu_enable     = sparc_pmu_enable,
+       .pmu_disable    = sparc_pmu_disable,
+       .event_init     = sparc_pmu_event_init,
+       .add            = sparc_pmu_add,
+       .del            = sparc_pmu_del,
+       .start          = sparc_pmu_start,
+       .stop           = sparc_pmu_stop,
         .read           = sparc_pmu_read,
-       .unthrottle     = sparc_pmu_unthrottle,
         .start_txn      = sparc_pmu_start_txn,
         .cancel_txn     = sparc_pmu_cancel_txn,
         .commit_txn     = sparc_pmu_commit_txn,
  };
  
-const struct pmu *hw_perf_event_init(struct perf_event *event)
-{
-       int err = __hw_perf_event_init(event);
-
-       if (err)
-               return ERR_PTR(err);
-       return &pmu;
-}
-
  void perf_event_print_debug(void)
  {
         unsigned long flags;
@@ -1244,7 +1277,7 @@ static int __kprobes perf_event_nmi_handler(struct notifier_block *self,
                         continue;
  
                 if (perf_event_overflow(event, 1, &data, regs))
-                       sparc_pmu_disable_event(cpuc, hwc, idx);
+                       sparc_pmu_stop(event, 0);
         }
  
         return NOTIFY_STOP;
@@ -1285,28 +1318,21 @@ void __init init_hw_perf_events(void)
  
         pr_cont("Supported PMU type is '%s'\n", sparc_pmu_type);
  
-       /* All sparc64 PMUs currently have 2 events.  */
-       perf_max_events = 2;
-
+       perf_pmu_register(&pmu);
         register_die_notifier(&perf_event_nmi_notifier);
  }
  
-static inline void callchain_store(struct perf_callchain_entry *entry, u64 ip)
-{
-       if (entry->nr < PERF_MAX_STACK_DEPTH)
-               entry->ip[entry->nr++] = ip;
-}
-
-static void perf_callchain_kernel(struct pt_regs *regs,
-                                 struct perf_callchain_entry *entry)
+void perf_callchain_kernel(struct perf_callchain_entry *entry,
+                          struct pt_regs *regs)
  {
         unsigned long ksp, fp;
  #ifdef CONFIG_FUNCTION_GRAPH_TRACER
         int graph = 0;
  #endif
  
-       callchain_store(entry, PERF_CONTEXT_KERNEL);
-       callchain_store(entry, regs->tpc);
+       stack_trace_flush();
+
+       perf_callchain_store(entry, regs->tpc);
  
         ksp = regs->u_regs[UREG_I6];
         fp = ksp + STACK_BIAS;
@@ -1330,13 +1356,13 @@ static void perf_callchain_kernel(struct pt_regs *regs,
                         pc = sf->callers_pc;
                         fp = (unsigned long)sf->fp + STACK_BIAS;
                 }
-               callchain_store(entry, pc);
+               perf_callchain_store(entry, pc);
  #ifdef CONFIG_FUNCTION_GRAPH_TRACER
                 if ((pc + 8UL) == (unsigned long) &return_to_handler) {
                         int index = current->curr_ret_stack;
                         if (current->ret_stack && index >= graph) {
                                 pc = current->ret_stack[index - graph].ret;
-                               callchain_store(entry, pc);
+                               perf_callchain_store(entry, pc);
                                 graph++;
                         }
                 }
@@ -1344,13 +1370,12 @@ static void perf_callchain_kernel(struct pt_regs *regs,
         } while (entry->nr < PERF_MAX_STACK_DEPTH);
  }
  
-static void perf_callchain_user_64(struct pt_regs *regs,
-                                  struct perf_callchain_entry *entry)
+static void perf_callchain_user_64(struct perf_callchain_entry *entry,
+                                  struct pt_regs *regs)
  {
         unsigned long ufp;
  
-       callchain_store(entry, PERF_CONTEXT_USER);
-       callchain_store(entry, regs->tpc);
+       perf_callchain_store(entry, regs->tpc);
  
         ufp = regs->u_regs[UREG_I6] + STACK_BIAS;
         do {
@@ -1363,17 +1388,16 @@ static void perf_callchain_user_64(struct pt_regs *regs,
  
                 pc = sf.callers_pc;
                 ufp = (unsigned long)sf.fp + STACK_BIAS;
-               callchain_store(entry, pc);
+               perf_callchain_store(entry, pc);
         } while (entry->nr < PERF_MAX_STACK_DEPTH);
  }
  
-static void perf_callchain_user_32(struct pt_regs *regs,
-                                  struct perf_callchain_entry *entry)
+static void perf_callchain_user_32(struct perf_callchain_entry *entry,
+                                  struct pt_regs *regs)
  {
         unsigned long ufp;
  
-       callchain_store(entry, PERF_CONTEXT_USER);
-       callchain_store(entry, regs->tpc);
+       perf_callchain_store(entry, regs->tpc);
  
         ufp = regs->u_regs[UREG_I6] & 0xffffffffUL;
         do {
@@ -1386,34 +1410,16 @@ static void perf_callchain_user_32(struct pt_regs *regs,
  
                 pc = sf.callers_pc;
                 ufp = (unsigned long)sf.fp;
-               callchain_store(entry, pc);
+               perf_callchain_store(entry, pc);
         } while (entry->nr < PERF_MAX_STACK_DEPTH);
  }
  
-/* Like powerpc we can't get PMU interrupts within the PMU handler,
- * so no need for separate NMI and IRQ chains as on x86.
- */
-static DEFINE_PER_CPU(struct perf_callchain_entry, callchain);
-
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+void
+perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
  {
-       struct perf_callchain_entry *entry = &__get_cpu_var(callchain);
-
-       entry->nr = 0;
-       if (!user_mode(regs)) {
-               stack_trace_flush();
-               perf_callchain_kernel(regs, entry);
-               if (current->mm)
-                       regs = task_pt_regs(current);
-               else
-                       regs = NULL;
-       }
-       if (regs) {
-               flushw_user();
-               if (test_thread_flag(TIF_32BIT))
-                       perf_callchain_user_32(regs, entry);
-               else
-                       perf_callchain_user_64(regs, entry);
-       }
-       return entry;
+       flushw_user();
+       if (test_thread_flag(TIF_32BIT))
+               perf_callchain_user_32(entry, regs);
+       else
+               perf_callchain_user_64(entry, regs);
  }
diff --git a/arch/um/drivers/hostaudio_kern.c b/arch/um/drivers/hostaudio_kern.c

index 0c46e398cd8f313d89a3ff07187916aa6021b93f..63c740a85b4cca0ed9333091593885cbdfface2e 100644 (file)
--- a/arch/um/drivers/hostaudio_kern.c
+++ b/arch/um/drivers/hostaudio_kern.c
@@ -40,6 +40,11 @@ static char *mixer = HOSTAUDIO_DEV_MIXER;
  "    This is used to specify the host mixer device to the hostaudio driver.\n"\
  "    The default is \"" HOSTAUDIO_DEV_MIXER "\".\n\n"
  
+module_param(dsp, charp, 0644);
+MODULE_PARM_DESC(dsp, DSP_HELP);
+module_param(mixer, charp, 0644);
+MODULE_PARM_DESC(mixer, MIXER_HELP);
+
  #ifndef MODULE
  static int set_dsp(char *name, int *add)
  {
@@ -56,15 +61,6 @@ static int set_mixer(char *name, int *add)
  }
  
  __uml_setup("mixer=", set_mixer, "mixer=<mixer device>\n" MIXER_HELP);
-
-#else /*MODULE*/
-
-module_param(dsp, charp, 0644);
-MODULE_PARM_DESC(dsp, DSP_HELP);
-
-module_param(mixer, charp, 0644);
-MODULE_PARM_DESC(mixer, MIXER_HELP);
-
  #endif
  
  /* /dev/dsp file operations */
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c

index 1bcd208c459f609ab3634f951e1c11b3b2e50038..9734994cba1e86c53f60dead72952f7feb3ab02d 100644 (file)
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -163,6 +163,7 @@ struct ubd {
         struct scatterlist sg[MAX_SG];
         struct request *request;
         int start_sg, end_sg;
+       sector_t rq_pos;
  };
  
  #define DEFAULT_COW { \
@@ -187,6 +188,7 @@ struct ubd {
         .request =              NULL, \
         .start_sg =             0, \
         .end_sg =               0, \
+       .rq_pos =               0, \
  }
  
  /* Protected by ubd_lock */
@@ -1228,7 +1230,6 @@ static void do_ubd_request(struct request_queue *q)
  {
         struct io_thread_req *io_req;
         struct request *req;
-       sector_t sector;
         int n;
  
         while(1){
@@ -1239,12 +1240,12 @@ static void do_ubd_request(struct request_queue *q)
                                 return;
  
                         dev->request = req;
+                       dev->rq_pos = blk_rq_pos(req);
                         dev->start_sg = 0;
                         dev->end_sg = blk_rq_map_sg(q, req, dev->sg);
                 }
  
                 req = dev->request;
-               sector = blk_rq_pos(req);
                 while(dev->start_sg < dev->end_sg){
                         struct scatterlist *sg = &dev->sg[dev->start_sg];
  
@@ -1256,10 +1257,9 @@ static void do_ubd_request(struct request_queue *q)
                                 return;
                         }
                         prepare_request(req, io_req,
-                                       (unsigned long long)sector << 9,
+                                       (unsigned long long)dev->rq_pos << 9,
                                         sg->offset, sg->length, sg_page(sg));
  
-                       sector += sg->length >> 9;
                         n = os_write_file(thread_fd, &io_req,
                                           sizeof(struct io_thread_req *));
                         if(n != sizeof(struct io_thread_req *)){
@@ -1272,6 +1272,7 @@ static void do_ubd_request(struct request_queue *q)
                                 return;
                         }
  
+                       dev->rq_pos += sg->length >> 9;
                         dev->start_sg++;
                 }
                 dev->end_sg = 0;
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig

index f4c70c246ffe39d37b5e15bda8ee6fe60a5a84a0..89b88e3a56e93a70ad6fbf83d7b605bf48dfbda8 100644 (file)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -25,6 +25,7 @@ config X86
         select HAVE_IDE
         select HAVE_OPROFILE
         select HAVE_PERF_EVENTS if (!M386 && !M486)
+       select HAVE_IRQ_WORK
         select HAVE_IOREMAP_PROT
         select HAVE_KPROBES
         select ARCH_WANT_OPTIONAL_GPIOLIB
@@ -33,6 +34,7 @@ config X86
         select HAVE_KRETPROBES
         select HAVE_OPTPROBES
         select HAVE_FTRACE_MCOUNT_RECORD
+       select HAVE_C_RECORDMCOUNT
         select HAVE_DYNAMIC_FTRACE
         select HAVE_FUNCTION_TRACER
         select HAVE_FUNCTION_GRAPH_TRACER
@@ -59,6 +61,8 @@ config X86
         select ANON_INODES
         select HAVE_ARCH_KMEMCHECK
         select HAVE_USER_RETURN_NOTIFIER
+       select HAVE_ARCH_JUMP_LABEL
+       select HAVE_TEXT_POKE_SMP
  
  config INSTRUCTION_DECODER
         def_bool (KPROBES || PERF_EVENTS)
@@ -2136,6 +2140,10 @@ config HAVE_ATOMIC_IOMAP
         def_bool y
         depends on X86_32
  
+config HAVE_TEXT_POKE_SMP
+       bool
+       select STOP_MACHINE if SMP
+
  source "net/Kconfig"
  
  source "drivers/Kconfig"
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c

index 0350311906ae731ca91e9ecedbc3e7acbaa668d0..2d93bdbc9ac026f2c0ef1e3fcd9c9a208b609787 100644 (file)
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -34,7 +34,7 @@
  #include <asm/ia32.h>
  
  #undef WARN_OLD
-#undef CORE_DUMP /* probably broken */
+#undef CORE_DUMP /* definitely broken */
  
  static int load_aout_binary(struct linux_binprm *, struct pt_regs *regs);
  static int load_aout_library(struct file *);
@@ -131,21 +131,15 @@ static void set_brk(unsigned long start, unsigned long end)
   * macros to write out all the necessary info.
   */
  
-static int dump_write(struct file *file, const void *addr, int nr)
-{
-       return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
-}
+#include <linux/coredump.h>
  
  #define DUMP_WRITE(addr, nr)                        \
         if (!dump_write(file, (void *)(addr), (nr))) \
                 goto end_coredump;
  
-#define DUMP_SEEK(offset)                                              \
-       if (file->f_op->llseek) {                                       \
-               if (file->f_op->llseek(file, (offset), 0) != (offset))  \
-                       goto end_coredump;                              \
-       } else                                                          \
-               file->f_pos = (offset)
+#define DUMP_SEEK(offset)              \
+       if (!dump_seek(file, offset))   \
+               goto end_coredump;
  
  #define START_DATA()   (u.u_tsize << PAGE_SHIFT)
  #define START_STACK(u) (u.start_stack)
@@ -217,12 +211,6 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file,
                 dump_size = dump.u_ssize << PAGE_SHIFT;
                 DUMP_WRITE(dump_start, dump_size);
         }
-       /*
-        * Finally dump the task struct.  Not be used by gdb, but
-        * could be useful
-        */
-       set_fs(KERNEL_DS);
-       DUMP_WRITE(current, sizeof(*current));
  end_coredump:
         set_fs(fs);
         return has_dumped;
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h

index bc6abb7bc7ee3084aa8b5d87ae19244715469990..76561d20ea2f27f0edfd0eee6d043b98c6aa6e90 100644 (file)
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -4,6 +4,7 @@
  #include <linux/types.h>
  #include <linux/stddef.h>
  #include <linux/stringify.h>
+#include <linux/jump_label.h>
  #include <asm/asm.h>
  
  /*
@@ -160,6 +161,8 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
  #define __parainstructions_end NULL
  #endif
  
+extern void *text_poke_early(void *addr, const void *opcode, size_t len);
+
  /*
   * Clear and restore the kernel write-protection flag on the local CPU.
   * Allows the kernel to edit read-only pages.
@@ -180,4 +183,12 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
  extern void *text_poke(void *addr, const void *opcode, size_t len);
  extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
  
+#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
+#define IDEAL_NOP_SIZE_5 5
+extern unsigned char ideal_nop5[IDEAL_NOP_SIZE_5];
+extern void arch_init_ideal_nop5(void);
+#else
+static inline void arch_init_ideal_nop5(void) {}
+#endif
+
  #endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h

index 5af2982133b5435b492372c447eef8c849b0eac4..f16a2caca1e0346ce62b47b1d7e0c866c8ddcfcf 100644 (file)
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -1,5 +1,5 @@
  /*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
   * Author: Joerg Roedel <joerg.roedel@amd.com>
   *         Leo Duran <leo.duran@amd.com>
   *
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h

index cb030374b90aeb526ad7f985527def42643bc1a9..916bc8111a01fa5de3a5df9f35ce3b8b60113342 100644 (file)
--- a/arch/x86/include/asm/amd_iommu_proto.h
+++ b/arch/x86/include/asm/amd_iommu_proto.h
@@ -1,5 +1,5 @@
  /*
- * Copyright (C) 2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2009-2010 Advanced Micro Devices, Inc.
   * Author: Joerg Roedel <joerg.roedel@amd.com>
   *
   * This program is free software; you can redistribute it and/or modify it
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h

index 08616180deaf5b1e94bfe9ad0079cec7e9c619a3..e3509fc303bf5a54069e5b9b6b6edbc9ec390a49 100644 (file)
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -1,5 +1,5 @@
  /*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
   * Author: Joerg Roedel <joerg.roedel@amd.com>
   *         Leo Duran <leo.duran@amd.com>
   *
@@ -416,13 +416,22 @@ struct amd_iommu {
         struct dma_ops_domain *default_dom;
  
         /*
-        * This array is required to work around a potential BIOS bug.
-        * The BIOS may miss to restore parts of the PCI configuration
-        * space when the system resumes from S3. The result is that the
-        * IOMMU does not execute commands anymore which leads to system
-        * failure.
+        * We can't rely on the BIOS to restore all values on reinit, so we
+        * need to stash them
          */
-       u32 cache_cfg[4];
+
+       /* The iommu BAR */
+       u32 stored_addr_lo;
+       u32 stored_addr_hi;
+
+       /*
+        * Each iommu has 6 l1s, each of which is documented as having 0x12
+        * registers
+        */
+       u32 stored_l1[6][0x12];
+
+       /* The l2 indirect registers */
+       u32 stored_l2[0x83];
  };
  
  /*
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h

index 8e8ec663a98fab4b771123fa1bfd6f7aef2f5931..b8e96a18676b872e751c4635921fb479f615c74e 100644 (file)
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -49,8 +49,8 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
  BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
  BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
  
-#ifdef CONFIG_PERF_EVENTS
-BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
+#ifdef CONFIG_IRQ_WORK
+BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR)
  #endif
  
  #ifdef CONFIG_X86_THERMAL_VECTOR
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h

index 4ac5b0f33fc1017b3760eb01b1a4ed7f7155dbce..bf357f9b25f0a69ac70b3fd73ea943437fc43e9b 100644 (file)
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -17,6 +17,7 @@ extern int fix_aperture;
  #define GARTEN         (1<<0)
  #define DISGARTCPU     (1<<4)
  #define DISGARTIO      (1<<5)
+#define DISTLBWALKPRB  (1<<6)
  
  /* GART cache control register bits. */
  #define INVGART                (1<<0)
@@ -27,7 +28,6 @@ extern int fix_aperture;
  #define AMD64_GARTAPERTUREBASE 0x94
  #define AMD64_GARTTABLEBASE    0x98
  #define AMD64_GARTCACHECTL     0x9c
-#define AMD64_GARTEN           (1<<0)
  
  #ifdef CONFIG_GART_IOMMU
  extern int gart_iommu_aperture;
@@ -57,6 +57,19 @@ static inline void gart_iommu_hole_init(void)
  
  extern int agp_amd64_init(void);
  
+static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order)
+{
+       u32 ctl;
+
+       /*
+        * Don't enable translation but enable GART IO and CPU accesses.
+        * Also, set DISTLBWALKPRB since GART tables memory is UC.
+        */
+       ctl = DISTLBWALKPRB | order << 1;
+
+       pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
+}
+
  static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
  {
         u32 tmp, ctl;
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h

index aeab29aee617240fbf479d2945572879be4525ec..55e4de613f0ee72471fd3cfaf28dee21150b276c 100644 (file)
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -14,7 +14,7 @@ typedef struct {
  #endif
         unsigned int x86_platform_ipis; /* arch dependent */
         unsigned int apic_perf_irqs;
-       unsigned int apic_pending_irqs;
+       unsigned int apic_irq_work_irqs;
  #ifdef CONFIG_SMP
         unsigned int irq_resched_count;
         unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h

index 46c0fe05f230112b5aa5e176d4292e56526f8279..3a54a1ca1a0234c0b19897f423bcde64be1f9dce 100644 (file)
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,7 +29,7 @@
  extern void apic_timer_interrupt(void);
  extern void x86_platform_ipi(void);
  extern void error_interrupt(void);
-extern void perf_pending_interrupt(void);
+extern void irq_work_interrupt(void);
  
  extern void spurious_interrupt(void);
  extern void thermal_interrupt(void);
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h

index e2ca3009255706910d1ddc407232f4d09f011f24..6af0894dafb445cdaedceb1630249c47e955633f 100644 (file)
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -114,9 +114,9 @@
  #define X86_PLATFORM_IPI_VECTOR                0xed
  
  /*
- * Performance monitoring pending work vector:
+ * IRQ work vector:
   */
-#define LOCAL_PENDING_VECTOR           0xec
+#define IRQ_WORK_VECTOR                        0xec
  
  #define UV_BAU_MESSAGE                 0xea
  
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h

new file mode 100644 (file)

index 0000000..f52d42e
--- /dev/null
+++ b/arch/x86/include/asm/jump_label.h
@@ -0,0 +1,37 @@
+#ifndef _ASM_X86_JUMP_LABEL_H
+#define _ASM_X86_JUMP_LABEL_H
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <asm/nops.h>
+
+#define JUMP_LABEL_NOP_SIZE 5
+
+# define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t"
+
+# define JUMP_LABEL(key, label)                                        \
+       do {                                                    \
+               asm goto("1:"                                   \
+                       JUMP_LABEL_INITIAL_NOP                  \
+                       ".pushsection __jump_table,  \"a\" \n\t"\
+                       _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \
+                       ".popsection \n\t"                      \
+                       : :  "i" (key) :  : label);             \
+       } while (0)
+
+#endif /* __KERNEL__ */
+
+#ifdef CONFIG_X86_64
+typedef u64 jump_label_t;
+#else
+typedef u32 jump_label_t;
+#endif
+
+struct jump_entry {
+       jump_label_t code;
+       jump_label_t target;
+       jump_label_t key;
+};
+
+#endif
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h

index 502e53f999cf28a25cc2b00f1766bd1ffb2f9ed2..c52e2eb40a1e254339658481621be634b427a8f0 100644 (file)
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -652,20 +652,6 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
         return (struct kvm_mmu_page *)page_private(page);
  }
  
-static inline u16 kvm_read_fs(void)
-{
-       u16 seg;
-       asm("mov %%fs, %0" : "=g"(seg));
-       return seg;
-}
-
-static inline u16 kvm_read_gs(void)
-{
-       u16 seg;
-       asm("mov %%gs, %0" : "=g"(seg));
-       return seg;
-}
-
  static inline u16 kvm_read_ldt(void)
  {
         u16 ldt;
@@ -673,16 +659,6 @@ static inline u16 kvm_read_ldt(void)
         return ldt;
  }
  
-static inline void kvm_load_fs(u16 sel)
-{
-       asm("mov %0, %%fs" : : "rm"(sel));
-}
-
-static inline void kvm_load_gs(u16 sel)
-{
-       asm("mov %0, %%gs" : : "rm"(sel));
-}
-
  static inline void kvm_load_ldt(u16 sel)
  {
         asm("lldt %0" : : "rm"(sel));
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h

index def500776b16a3b63d34da569021722e4d82f18a..a70cd216be5d729db1f364340f911d632819f18d 100644 (file)
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -36,19 +36,6 @@
  #define P4_ESCR_EMASK(v)       ((v) << P4_ESCR_EVENTMASK_SHIFT)
  #define P4_ESCR_TAG(v)         ((v) << P4_ESCR_TAG_SHIFT)
  
-/* Non HT mask */
-#define P4_ESCR_MASK                   \
-       (P4_ESCR_EVENT_MASK     |       \
-       P4_ESCR_EVENTMASK_MASK  |       \
-       P4_ESCR_TAG_MASK        |       \
-       P4_ESCR_TAG_ENABLE      |       \
-       P4_ESCR_T0_OS           |       \
-       P4_ESCR_T0_USR)
-
-/* HT mask */
-#define P4_ESCR_MASK_HT                        \
-       (P4_ESCR_MASK | P4_ESCR_T1_OS | P4_ESCR_T1_USR)
-
  #define P4_CCCR_OVF                    0x80000000U
  #define P4_CCCR_CASCADE                        0x40000000U
  #define P4_CCCR_OVF_PMI_T0             0x04000000U
@@ -70,23 +57,6 @@
  #define P4_CCCR_THRESHOLD(v)           ((v) << P4_CCCR_THRESHOLD_SHIFT)
  #define P4_CCCR_ESEL(v)                        ((v) << P4_CCCR_ESCR_SELECT_SHIFT)
  
-/* Non HT mask */
-#define P4_CCCR_MASK                           \
-       (P4_CCCR_OVF                    |       \
-       P4_CCCR_CASCADE                 |       \
-       P4_CCCR_OVF_PMI_T0              |       \
-       P4_CCCR_FORCE_OVF               |       \
-       P4_CCCR_EDGE                    |       \
-       P4_CCCR_THRESHOLD_MASK          |       \
-       P4_CCCR_COMPLEMENT              |       \
-       P4_CCCR_COMPARE                 |       \
-       P4_CCCR_ESCR_SELECT_MASK        |       \
-       P4_CCCR_ENABLE)
-
-/* HT mask */
-#define P4_CCCR_MASK_HT                                \
-       (P4_CCCR_MASK | P4_CCCR_OVF_PMI_T1 | P4_CCCR_THREAD_ANY)
-
  #define P4_GEN_ESCR_EMASK(class, name, bit)    \
         class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT)
  #define P4_ESCR_EMASK_BIT(class, name)         class##__##name
@@ -127,6 +97,28 @@
  #define P4_CONFIG_HT_SHIFT             63
  #define P4_CONFIG_HT                   (1ULL << P4_CONFIG_HT_SHIFT)
  
+/*
+ * The bits we allow to pass for RAW events
+ */
+#define P4_CONFIG_MASK_ESCR            \
+       P4_ESCR_EVENT_MASK      |       \
+       P4_ESCR_EVENTMASK_MASK  |       \
+       P4_ESCR_TAG_MASK        |       \
+       P4_ESCR_TAG_ENABLE
+
+#define P4_CONFIG_MASK_CCCR            \
+       P4_CCCR_EDGE            |       \
+       P4_CCCR_THRESHOLD_MASK  |       \
+       P4_CCCR_COMPLEMENT      |       \
+       P4_CCCR_COMPARE         |       \
+       P4_CCCR_THREAD_ANY      |       \
+       P4_CCCR_RESERVED
+
+/* some dangerous bits are reserved for kernel internals */
+#define P4_CONFIG_MASK                                   \
+       (p4_config_pack_escr(P4_CONFIG_MASK_ESCR))      | \
+       (p4_config_pack_cccr(P4_CONFIG_MASK_CCCR))
+
  static inline bool p4_is_event_cascaded(u64 config)
  {
         u32 cccr = p4_config_unpack_cccr(config);
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile

index fedf32a8c3ecdd7a1aaed41b2884d2c9b10bf033..7490bf8d14593dbcf9c7e7a71507927459c97f6a 100644 (file)
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -34,7 +34,8 @@ GCOV_PROFILE_paravirt.o               := n
  obj-y                  := process_$(BITS).o signal.o entry_$(BITS).o
  obj-y                  += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
  obj-y                  += time.o ioport.o ldt.o dumpstack.o
-obj-y                  += setup.o x86_init.o i8259.o irqinit.o
+obj-y                  += setup.o x86_init.o i8259.o irqinit.o jump_label.o
+obj-$(CONFIG_IRQ_WORK)  += irq_work.o
  obj-$(CONFIG_X86_VISWS)        += visws_quirks.o
  obj-$(CONFIG_X86_32)   += probe_roms_32.o
  obj-$(CONFIG_X86_32)   += sys_i386_32.o i386_ksyms_32.o
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c

index f65ab8b014c4f42421d77200243c39c7fd252165..a36bb90aef5383d68bcf4af0b0c33749d572163a 100644 (file)
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -195,7 +195,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
  
  extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
  extern s32 __smp_locks[], __smp_locks_end[];
-static void *text_poke_early(void *addr, const void *opcode, size_t len);
+void *text_poke_early(void *addr, const void *opcode, size_t len);
  
  /* Replace instructions with better alternatives for this CPU type.
     This runs before SMP is initialized to avoid SMP problems with
@@ -522,7 +522,7 @@ void __init alternative_instructions(void)
   * instructions. And on the local CPU you need to be protected again NMI or MCE
   * handlers seeing an inconsistent instruction while you patch.
   */
-static void *__init_or_module text_poke_early(void *addr, const void *opcode,
+void *__init_or_module text_poke_early(void *addr, const void *opcode,
                                               size_t len)
  {
         unsigned long flags;
@@ -637,7 +637,72 @@ void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
         tpp.len = len;
         atomic_set(&stop_machine_first, 1);
         wrote_text = 0;
-       stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
+       /* Use __stop_machine() because the caller already got online_cpus. */
+       __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
         return addr;
  }
  
+#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
+
+unsigned char ideal_nop5[IDEAL_NOP_SIZE_5];
+
+void __init arch_init_ideal_nop5(void)
+{
+       extern const unsigned char ftrace_test_p6nop[];
+       extern const unsigned char ftrace_test_nop5[];
+       extern const unsigned char ftrace_test_jmp[];
+       int faulted = 0;
+
+       /*
+        * There is no good nop for all x86 archs.
+        * We will default to using the P6_NOP5, but first we
+        * will test to make sure that the nop will actually
+        * work on this CPU. If it faults, we will then
+        * go to a lesser efficient 5 byte nop. If that fails
+        * we then just use a jmp as our nop. This isn't the most
+        * efficient nop, but we can not use a multi part nop
+        * since we would then risk being preempted in the middle
+        * of that nop, and if we enabled tracing then, it might
+        * cause a system crash.
+        *
+        * TODO: check the cpuid to determine the best nop.
+        */
+       asm volatile (
+               "ftrace_test_jmp:"
+               "jmp ftrace_test_p6nop\n"
+               "nop\n"
+               "nop\n"
+               "nop\n"  /* 2 byte jmp + 3 bytes */
+               "ftrace_test_p6nop:"
+               P6_NOP5
+               "jmp 1f\n"
+               "ftrace_test_nop5:"
+               ".byte 0x66,0x66,0x66,0x66,0x90\n"
+               "1:"
+               ".section .fixup, \"ax\"\n"
+               "2:     movl $1, %0\n"
+               "       jmp ftrace_test_nop5\n"
+               "3:     movl $2, %0\n"
+               "       jmp 1b\n"
+               ".previous\n"
+               _ASM_EXTABLE(ftrace_test_p6nop, 2b)
+               _ASM_EXTABLE(ftrace_test_nop5, 3b)
+               : "=r"(faulted) : "0" (faulted));
+
+       switch (faulted) {
+       case 0:
+               pr_info("converting mcount calls to 0f 1f 44 00 00\n");
+               memcpy(ideal_nop5, ftrace_test_p6nop, IDEAL_NOP_SIZE_5);
+               break;
+       case 1:
+               pr_info("converting mcount calls to 66 66 66 66 90\n");
+               memcpy(ideal_nop5, ftrace_test_nop5, IDEAL_NOP_SIZE_5);
+               break;
+       case 2:
+               pr_info("converting mcount calls to jmp . + 5\n");
+               memcpy(ideal_nop5, ftrace_test_jmp, IDEAL_NOP_SIZE_5);
+               break;
+       }
+
+}
+#endif
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c

index 679b6450382b9c8bafa2d71e2677fcd5ae075b95..d2fdb0826df25654fa94cc7ee307c23e235c5c91 100644 (file)
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1,5 +1,5 @@
  /*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
   * Author: Joerg Roedel <joerg.roedel@amd.com>
   *         Leo Duran <leo.duran@amd.com>
   *
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c

index 5a170cbbbed86aa1376eb19336a5329aacc96e91..3cb482e123de75f9044e0f843d23fb4348e513a7 100644 (file)
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1,5 +1,5 @@
  /*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
   * Author: Joerg Roedel <joerg.roedel@amd.com>
   *         Leo Duran <leo.duran@amd.com>
   *
@@ -194,6 +194,39 @@ static inline unsigned long tbl_size(int entry_size)
         return 1UL << shift;
  }
  
+/* Access to l1 and l2 indexed register spaces */
+
+static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
+{
+       u32 val;
+
+       pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
+       pci_read_config_dword(iommu->dev, 0xfc, &val);
+       return val;
+}
+
+static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
+{
+       pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31));
+       pci_write_config_dword(iommu->dev, 0xfc, val);
+       pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
+}
+
+static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
+{
+       u32 val;
+
+       pci_write_config_dword(iommu->dev, 0xf0, address);
+       pci_read_config_dword(iommu->dev, 0xf4, &val);
+       return val;
+}
+
+static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
+{
+       pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8));
+       pci_write_config_dword(iommu->dev, 0xf4, val);
+}
+
  /****************************************************************************
   *
   * AMD IOMMU MMIO register space handling functions
@@ -619,6 +652,7 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
  {
         int cap_ptr = iommu->cap_ptr;
         u32 range, misc;
+       int i, j;
  
         pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
                               &iommu->cap);
@@ -633,12 +667,29 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
                                         MMIO_GET_LD(range));
         iommu->evt_msi_num = MMIO_MSI_NUM(misc);
  
-       if (is_rd890_iommu(iommu->dev)) {
-               pci_read_config_dword(iommu->dev, 0xf0, &iommu->cache_cfg[0]);
-               pci_read_config_dword(iommu->dev, 0xf4, &iommu->cache_cfg[1]);
-               pci_read_config_dword(iommu->dev, 0xf8, &iommu->cache_cfg[2]);
-               pci_read_config_dword(iommu->dev, 0xfc, &iommu->cache_cfg[3]);
-       }
+       if (!is_rd890_iommu(iommu->dev))
+               return;
+
+       /*
+        * Some rd890 systems may not be fully reconfigured by the BIOS, so
+        * it's necessary for us to store this information so it can be
+        * reprogrammed on resume
+        */
+
+       pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
+                             &iommu->stored_addr_lo);
+       pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
+                             &iommu->stored_addr_hi);
+
+       /* Low bit locks writes to configuration space */
+       iommu->stored_addr_lo &= ~1;
+
+       for (i = 0; i < 6; i++)
+               for (j = 0; j < 0x12; j++)
+                       iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
+
+       for (i = 0; i < 0x83; i++)
+               iommu->stored_l2[i] = iommu_read_l2(iommu, i);
  }
  
  /*
@@ -1127,14 +1178,53 @@ static void iommu_init_flags(struct amd_iommu *iommu)
         iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
  }
  
-static void iommu_apply_quirks(struct amd_iommu *iommu)
+static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
  {
-       if (is_rd890_iommu(iommu->dev)) {
-               pci_write_config_dword(iommu->dev, 0xf0, iommu->cache_cfg[0]);
-               pci_write_config_dword(iommu->dev, 0xf4, iommu->cache_cfg[1]);
-               pci_write_config_dword(iommu->dev, 0xf8, iommu->cache_cfg[2]);
-               pci_write_config_dword(iommu->dev, 0xfc, iommu->cache_cfg[3]);
-       }
+       int i, j;
+       u32 ioc_feature_control;
+       struct pci_dev *pdev = NULL;
+
+       /* RD890 BIOSes may not have completely reconfigured the iommu */
+       if (!is_rd890_iommu(iommu->dev))
+               return;
+
+       /*
+        * First, we need to ensure that the iommu is enabled. This is
+        * controlled by a register in the northbridge
+        */
+       pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0));
+
+       if (!pdev)
+               return;
+
+       /* Select Northbridge indirect register 0x75 and enable writing */
+       pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
+       pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
+
+       /* Enable the iommu */
+       if (!(ioc_feature_control & 0x1))
+               pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
+
+       pci_dev_put(pdev);
+
+       /* Restore the iommu BAR */
+       pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
+                              iommu->stored_addr_lo);
+       pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
+                              iommu->stored_addr_hi);
+
+       /* Restore the l1 indirect regs for each of the 6 l1s */
+       for (i = 0; i < 6; i++)
+               for (j = 0; j < 0x12; j++)
+                       iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
+
+       /* Restore the l2 indirect regs */
+       for (i = 0; i < 0x83; i++)
+               iommu_write_l2(iommu, i, iommu->stored_l2[i]);
+
+       /* Lock PCI setup registers */
+       pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
+                              iommu->stored_addr_lo | 1);
  }
  
  /*
@@ -1147,7 +1237,6 @@ static void enable_iommus(void)
  
         for_each_iommu(iommu) {
                 iommu_disable(iommu);
-               iommu_apply_quirks(iommu);
                 iommu_init_flags(iommu);
                 iommu_set_device_table(iommu);
                 iommu_enable_command_buffer(iommu);
@@ -1173,6 +1262,11 @@ static void disable_iommus(void)
  
  static int amd_iommu_resume(struct sys_device *dev)
  {
+       struct amd_iommu *iommu;
+
+       for_each_iommu(iommu)
+               iommu_apply_resume_quirks(iommu);
+
         /* re-load the hardware */
         enable_iommus();
  
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c

index a2e0caf26e172c8f7b18231e4f4a072161ed66fe..c9cb173684485b47c23b52ba5409bdc1f987aab4 100644 (file)
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -307,7 +307,7 @@ void __init early_gart_iommu_check(void)
                                 continue;
  
                         ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
-                       aper_enabled = ctl & AMD64_GARTEN;
+                       aper_enabled = ctl & GARTEN;
                         aper_order = (ctl >> 1) & 7;
                         aper_size = (32 * 1024 * 1024) << aper_order;
                         aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
@@ -362,7 +362,7 @@ void __init early_gart_iommu_check(void)
                                 continue;
  
                         ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
-                       ctl &= ~AMD64_GARTEN;
+                       ctl &= ~GARTEN;
                         write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
                 }
         }
@@ -505,8 +505,13 @@ out:
  
         /* Fix up the north bridges */
         for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
-               int bus;
-               int dev_base, dev_limit;
+               int bus, dev_base, dev_limit;
+
+               /*
+                * Don't enable translation yet but enable GART IO and CPU
+                * accesses and set DISTLBWALKPRB since GART table memory is UC.
+                */
+               u32 ctl = DISTLBWALKPRB | aper_order << 1;
  
                 bus = bus_dev_ranges[i].bus;
                 dev_base = bus_dev_ranges[i].dev_base;
@@ -515,10 +520,7 @@ out:
                         if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
                                 continue;
  
-                       /* Don't enable translation yet. That is done later.
-                          Assume this BIOS didn't initialise the GART so
-                          just overwrite all previous bits */
-                       write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1);
+                       write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
                         write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
                 }
         }
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c

index 03a5b0385ad6c4402d2192b42aa14e9bfaccc80e..fe73c1844a9a5b7c9a0c902bef0b9f993207acee 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -531,7 +531,7 @@ static int x86_pmu_hw_config(struct perf_event *event)
  /*
   * Setup the hardware configuration for a given attr_type
   */
-static int __hw_perf_event_init(struct perf_event *event)
+static int __x86_pmu_event_init(struct perf_event *event)
  {
         int err;
  
@@ -584,7 +584,7 @@ static void x86_pmu_disable_all(void)
         }
  }
  
-void hw_perf_disable(void)
+static void x86_pmu_disable(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
  
@@ -619,7 +619,7 @@ static void x86_pmu_enable_all(int added)
         }
  }
  
-static const struct pmu pmu;
+static struct pmu pmu;
  
  static inline int is_x86_event(struct perf_event *event)
  {
@@ -801,10 +801,10 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc,
                 hwc->last_tag == cpuc->tags[i];
  }
  
-static int x86_pmu_start(struct perf_event *event);
-static void x86_pmu_stop(struct perf_event *event);
+static void x86_pmu_start(struct perf_event *event, int flags);
+static void x86_pmu_stop(struct perf_event *event, int flags);
  
-void hw_perf_enable(void)
+static void x86_pmu_enable(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         struct perf_event *event;
@@ -840,7 +840,14 @@ void hw_perf_enable(void)
                             match_prev_assignment(hwc, cpuc, i))
                                 continue;
  
-                       x86_pmu_stop(event);
+                       /*
+                        * Ensure we don't accidentally enable a stopped
+                        * counter simply because we rescheduled.
+                        */
+                       if (hwc->state & PERF_HES_STOPPED)
+                               hwc->state |= PERF_HES_ARCH;
+
+                       x86_pmu_stop(event, PERF_EF_UPDATE);
                 }
  
                 for (i = 0; i < cpuc->n_events; i++) {
@@ -852,7 +859,10 @@ void hw_perf_enable(void)
                         else if (i < n_running)
                                 continue;
  
-                       x86_pmu_start(event);
+                       if (hwc->state & PERF_HES_ARCH)
+                               continue;
+
+                       x86_pmu_start(event, PERF_EF_RELOAD);
                 }
                 cpuc->n_added = 0;
                 perf_events_lapic_init();
@@ -953,15 +963,12 @@ static void x86_pmu_enable_event(struct perf_event *event)
  }
  
  /*
- * activate a single event
+ * Add a single event to the PMU.
   *
   * The event is added to the group of enabled events
   * but only if it can be scehduled with existing events.
- *
- * Called with PMU disabled. If successful and return value 1,
- * then guaranteed to call perf_enable() and hw_perf_enable()
   */
-static int x86_pmu_enable(struct perf_event *event)
+static int x86_pmu_add(struct perf_event *event, int flags)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         struct hw_perf_event *hwc;
@@ -970,58 +977,67 @@ static int x86_pmu_enable(struct perf_event *event)
  
         hwc = &event->hw;
  
+       perf_pmu_disable(event->pmu);
         n0 = cpuc->n_events;
-       n = collect_events(cpuc, event, false);
-       if (n < 0)
-               return n;
+       ret = n = collect_events(cpuc, event, false);
+       if (ret < 0)
+               goto out;
+
+       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+       if (!(flags & PERF_EF_START))
+               hwc->state |= PERF_HES_ARCH;
  
         /*
          * If group events scheduling transaction was started,
          * skip the schedulability test here, it will be peformed
-        * at commit time(->commit_txn) as a whole
+        * at commit time (->commit_txn) as a whole
          */
         if (cpuc->group_flag & PERF_EVENT_TXN)
-               goto out;
+               goto done_collect;
  
         ret = x86_pmu.schedule_events(cpuc, n, assign);
         if (ret)
-               return ret;
+               goto out;
         /*
          * copy new assignment, now we know it is possible
          * will be used by hw_perf_enable()
          */
         memcpy(cpuc->assign, assign, n*sizeof(int));
  
-out:
+done_collect:
         cpuc->n_events = n;
         cpuc->n_added += n - n0;
         cpuc->n_txn += n - n0;
  
-       return 0;
+       ret = 0;
+out:
+       perf_pmu_enable(event->pmu);
+       return ret;
  }
  
-static int x86_pmu_start(struct perf_event *event)
+static void x86_pmu_start(struct perf_event *event, int flags)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         int idx = event->hw.idx;
  
-       if (idx == -1)
-               return -EAGAIN;
+       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+               return;
+
+       if (WARN_ON_ONCE(idx == -1))
+               return;
+
+       if (flags & PERF_EF_RELOAD) {
+               WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+               x86_perf_event_set_period(event);
+       }
+
+       event->hw.state = 0;
  
-       x86_perf_event_set_period(event);
         cpuc->events[idx] = event;
         __set_bit(idx, cpuc->active_mask);
         __set_bit(idx, cpuc->running);
         x86_pmu.enable(event);
         perf_event_update_userpage(event);
-
-       return 0;
-}
-
-static void x86_pmu_unthrottle(struct perf_event *event)
-{
-       int ret = x86_pmu_start(event);
-       WARN_ON_ONCE(ret);
  }
  
  void perf_event_print_debug(void)
@@ -1078,27 +1094,29 @@ void perf_event_print_debug(void)
         local_irq_restore(flags);
  }
  
-static void x86_pmu_stop(struct perf_event *event)
+static void x86_pmu_stop(struct perf_event *event, int flags)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         struct hw_perf_event *hwc = &event->hw;
-       int idx = hwc->idx;
  
-       if (!__test_and_clear_bit(idx, cpuc->active_mask))
-               return;
-
-       x86_pmu.disable(event);
-
-       /*
-        * Drain the remaining delta count out of a event
-        * that we are disabling:
-        */
-       x86_perf_event_update(event);
+       if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
+               x86_pmu.disable(event);
+               cpuc->events[hwc->idx] = NULL;
+               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+               hwc->state |= PERF_HES_STOPPED;
+       }
  
-       cpuc->events[idx] = NULL;
+       if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+               /*
+                * Drain the remaining delta count out of a event
+                * that we are disabling:
+                */
+               x86_perf_event_update(event);
+               hwc->state |= PERF_HES_UPTODATE;
+       }
  }
  
-static void x86_pmu_disable(struct perf_event *event)
+static void x86_pmu_del(struct perf_event *event, int flags)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         int i;
@@ -1111,7 +1129,7 @@ static void x86_pmu_disable(struct perf_event *event)
         if (cpuc->group_flag & PERF_EVENT_TXN)
                 return;
  
-       x86_pmu_stop(event);
+       x86_pmu_stop(event, PERF_EF_UPDATE);
  
         for (i = 0; i < cpuc->n_events; i++) {
                 if (event == cpuc->event_list[i]) {
@@ -1134,7 +1152,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
         struct perf_sample_data data;
         struct cpu_hw_events *cpuc;
         struct perf_event *event;
-       struct hw_perf_event *hwc;
         int idx, handled = 0;
         u64 val;
  
@@ -1155,7 +1172,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
                 }
  
                 event = cpuc->events[idx];
-               hwc = &event->hw;
  
                 val = x86_perf_event_update(event);
                 if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
@@ -1171,7 +1187,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
                         continue;
  
                 if (perf_event_overflow(event, 1, &data, regs))
-                       x86_pmu_stop(event);
+                       x86_pmu_stop(event, 0);
         }
  
         if (handled)
@@ -1180,25 +1196,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
         return handled;
  }
  
-void smp_perf_pending_interrupt(struct pt_regs *regs)
-{
-       irq_enter();
-       ack_APIC_irq();
-       inc_irq_stat(apic_pending_irqs);
-       perf_event_do_pending();
-       irq_exit();
-}
-
-void set_perf_event_pending(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
-       if (!x86_pmu.apic || !x86_pmu_initialized())
-               return;
-
-       apic->send_IPI_self(LOCAL_PENDING_VECTOR);
-#endif
-}
-
  void perf_events_lapic_init(void)
  {
         if (!x86_pmu.apic || !x86_pmu_initialized())
@@ -1388,7 +1385,6 @@ void __init init_hw_perf_events(void)
                 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
         }
         x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
-       perf_max_events = x86_pmu.num_counters;
  
         if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
                 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
@@ -1424,6 +1420,7 @@ void __init init_hw_perf_events(void)
         pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
         pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
  
+       perf_pmu_register(&pmu);
         perf_cpu_notifier(x86_pmu_notifier);
  }
  
@@ -1437,10 +1434,11 @@ static inline void x86_pmu_read(struct perf_event *event)
   * Set the flag to make pmu::enable() not perform the
   * schedulability test, it will be performed at commit time
   */
-static void x86_pmu_start_txn(const struct pmu *pmu)
+static void x86_pmu_start_txn(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
  
+       perf_pmu_disable(pmu);
         cpuc->group_flag |= PERF_EVENT_TXN;
         cpuc->n_txn = 0;
  }
@@ -1450,7 +1448,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
   * Clear the flag and pmu::enable() will perform the
   * schedulability test.
   */
-static void x86_pmu_cancel_txn(const struct pmu *pmu)
+static void x86_pmu_cancel_txn(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
  
@@ -1460,6 +1458,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
          */
         cpuc->n_added -= cpuc->n_txn;
         cpuc->n_events -= cpuc->n_txn;
+       perf_pmu_enable(pmu);
  }
  
  /*
@@ -1467,7 +1466,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
   * Perform the group schedulability test as a whole
   * Return 0 if success
   */
-static int x86_pmu_commit_txn(const struct pmu *pmu)
+static int x86_pmu_commit_txn(struct pmu *pmu)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         int assign[X86_PMC_IDX_MAX];
@@ -1489,22 +1488,10 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
         memcpy(cpuc->assign, assign, n*sizeof(int));
  
         cpuc->group_flag &= ~PERF_EVENT_TXN;
-
+       perf_pmu_enable(pmu);
         return 0;
  }
  
-static const struct pmu pmu = {
-       .enable         = x86_pmu_enable,
-       .disable        = x86_pmu_disable,
-       .start          = x86_pmu_start,
-       .stop           = x86_pmu_stop,
-       .read           = x86_pmu_read,
-       .unthrottle     = x86_pmu_unthrottle,
-       .start_txn      = x86_pmu_start_txn,
-       .cancel_txn     = x86_pmu_cancel_txn,
-       .commit_txn     = x86_pmu_commit_txn,
-};
-
  /*
   * validate that we can schedule this event
   */
@@ -1579,12 +1566,22 @@ out:
         return ret;
  }
  
-const struct pmu *hw_perf_event_init(struct perf_event *event)
+int x86_pmu_event_init(struct perf_event *event)
  {
-       const struct pmu *tmp;
+       struct pmu *tmp;
         int err;
  
-       err = __hw_perf_event_init(event);
+       switch (event->attr.type) {
+       case PERF_TYPE_RAW:
+       case PERF_TYPE_HARDWARE:
+       case PERF_TYPE_HW_CACHE:
+               break;
+
+       default:
+               return -ENOENT;
+       }
+
+       err = __x86_pmu_event_init(event);
         if (!err) {
                 /*
                  * we temporarily connect event to its pmu
@@ -1604,26 +1601,31 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
         if (err) {
                 if (event->destroy)
                         event->destroy(event);
-               return ERR_PTR(err);
         }
  
-       return &pmu;
+       return err;
  }
  
-/*
- * callchain support
- */
+static struct pmu pmu = {
+       .pmu_enable     = x86_pmu_enable,
+       .pmu_disable    = x86_pmu_disable,
  
-static inline
-void callchain_store(struct perf_callchain_entry *entry, u64 ip)
-{
-       if (entry->nr < PERF_MAX_STACK_DEPTH)
-               entry->ip[entry->nr++] = ip;
-}
+       .event_init     = x86_pmu_event_init,
  
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
-static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
+       .add            = x86_pmu_add,
+       .del            = x86_pmu_del,
+       .start          = x86_pmu_start,
+       .stop           = x86_pmu_stop,
+       .read           = x86_pmu_read,
  
+       .start_txn      = x86_pmu_start_txn,
+       .cancel_txn     = x86_pmu_cancel_txn,
+       .commit_txn     = x86_pmu_commit_txn,
+};
+
+/*
+ * callchain support
+ */
  
  static void
  backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
@@ -1645,7 +1647,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
  {
         struct perf_callchain_entry *entry = data;
  
-       callchain_store(entry, addr);
+       perf_callchain_store(entry, addr);
  }
  
  static const struct stacktrace_ops backtrace_ops = {
@@ -1656,11 +1658,15 @@ static const struct stacktrace_ops backtrace_ops = {
         .walk_stack             = print_context_stack_bp,
  };
  
-static void
-perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
+void
+perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
  {
-       callchain_store(entry, PERF_CONTEXT_KERNEL);
-       callchain_store(entry, regs->ip);
+       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+               /* TODO: We don't support guest os callchain now */
+               return;
+       }
+
+       perf_callchain_store(entry, regs->ip);
  
         dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
  }
@@ -1689,7 +1695,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
                 if (fp < compat_ptr(regs->sp))
                         break;
  
-               callchain_store(entry, frame.return_address);
+               perf_callchain_store(entry, frame.return_address);
                 fp = compat_ptr(frame.next_frame);
         }
         return 1;
@@ -1702,19 +1708,20 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
  }
  #endif
  
-static void
-perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
+void
+perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
  {
         struct stack_frame frame;
         const void __user *fp;
  
-       if (!user_mode(regs))
-               regs = task_pt_regs(current);
+       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+               /* TODO: We don't support guest os callchain now */
+               return;
+       }
  
         fp = (void __user *)regs->bp;
  
-       callchain_store(entry, PERF_CONTEXT_USER);
-       callchain_store(entry, regs->ip);
+       perf_callchain_store(entry, regs->ip);
  
         if (perf_callchain_user32(regs, entry))
                 return;
@@ -1731,52 +1738,11 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
                 if ((unsigned long)fp < regs->sp)
                         break;
  
-               callchain_store(entry, frame.return_address);
+               perf_callchain_store(entry, frame.return_address);
                 fp = frame.next_frame;
         }
  }
  
-static void
-perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
-       int is_user;
-
-       if (!regs)
-               return;
-
-       is_user = user_mode(regs);
-
-       if (is_user && current->state != TASK_RUNNING)
-               return;
-
-       if (!is_user)
-               perf_callchain_kernel(regs, entry);
-
-       if (current->mm)
-               perf_callchain_user(regs, entry);
-}
-
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
-       struct perf_callchain_entry *entry;
-
-       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-               /* TODO: We don't support guest os callchain now */
-               return NULL;
-       }
-
-       if (in_nmi())
-               entry = &__get_cpu_var(pmc_nmi_entry);
-       else
-               entry = &__get_cpu_var(pmc_irq_entry);
-
-       entry->nr = 0;
-
-       perf_do_callchain(regs, entry);
-
-       return entry;
-}
-
  unsigned long perf_instruction_pointer(struct pt_regs *regs)
  {
         unsigned long ip;
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c

index c2897b7b4a3b1c7a6cfe4fe7e70b3d7378432d4c..46d58448c3aff9039fdf0b909d069d00d80ef75e 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -52,7 +52,7 @@ static __initconst const u64 amd_hw_cache_event_ids
   [ C(DTLB) ] = {
         [ C(OP_READ) ] = {
                 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
-               [ C(RESULT_MISS)   ] = 0x0046, /* L1 DTLB and L2 DLTB Miss   */
+               [ C(RESULT_MISS)   ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */
         },
         [ C(OP_WRITE) ] = {
                 [ C(RESULT_ACCESS) ] = 0,
@@ -66,7 +66,7 @@ static __initconst const u64 amd_hw_cache_event_ids
   [ C(ITLB) ] = {
         [ C(OP_READ) ] = {
                 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes        */
-               [ C(RESULT_MISS)   ] = 0x0085, /* Instr. fetch ITLB misses   */
+               [ C(RESULT_MISS)   ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */
         },
         [ C(OP_WRITE) ] = {
                 [ C(RESULT_ACCESS) ] = -1,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c

index ee05c90012d269e66de9ffb6a5952d1434317c1e..c8f5c088cad11ae3f245e1e7374bb43c915170d6 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -713,18 +713,18 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
         struct cpu_hw_events *cpuc;
         int bit, loops;
         u64 status;
-       int handled = 0;
+       int handled;
  
         perf_sample_data_init(&data, 0);
  
         cpuc = &__get_cpu_var(cpu_hw_events);
  
         intel_pmu_disable_all();
-       intel_pmu_drain_bts_buffer();
+       handled = intel_pmu_drain_bts_buffer();
         status = intel_pmu_get_status();
         if (!status) {
                 intel_pmu_enable_all(0);
-               return 0;
+               return handled;
         }
  
         loops = 0;
@@ -763,7 +763,7 @@ again:
                 data.period = event->hw.last_period;
  
                 if (perf_event_overflow(event, 1, &data, regs))
-                       x86_pmu_stop(event);
+                       x86_pmu_stop(event, 0);
         }
  
         /*
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c

index 18018d1311cdf3fa7f550b2fd7894c278505afe6..4977f9c400e5738cb668efc937c69cde22a2772d 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -214,7 +214,7 @@ static void intel_pmu_disable_bts(void)
         update_debugctlmsr(debugctlmsr);
  }
  
-static void intel_pmu_drain_bts_buffer(void)
+static int intel_pmu_drain_bts_buffer(void)
  {
         struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
         struct debug_store *ds = cpuc->ds;
@@ -231,16 +231,16 @@ static void intel_pmu_drain_bts_buffer(void)
         struct pt_regs regs;
  
         if (!event)
-               return;
+               return 0;
  
         if (!ds)
-               return;
+               return 0;
  
         at  = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
         top = (struct bts_record *)(unsigned long)ds->bts_index;
  
         if (top <= at)
-               return;
+               return 0;
  
         ds->bts_index = ds->bts_buffer_base;
  
@@ -256,7 +256,7 @@ static void intel_pmu_drain_bts_buffer(void)
         perf_prepare_sample(&header, &data, event, &regs);
  
         if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
-               return;
+               return 1;
  
         for (; at < top; at++) {
                 data.ip         = at->from;
@@ -270,6 +270,7 @@ static void intel_pmu_drain_bts_buffer(void)
         /* There's new data available. */
         event->hw.interrupts++;
         event->pending_kill = POLL_IN;
+       return 1;
  }
  
  /*
@@ -491,7 +492,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
                 regs.flags &= ~PERF_EFLAGS_EXACT;
  
         if (perf_event_overflow(event, 1, &data, &regs))
-               x86_pmu_stop(event);
+               x86_pmu_stop(event, 0);
  }
  
  static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c

index 2490151739921e074085a5d4a39a6cfb783f7187..81400b93e69483e18bedf1c541753e2f915910cd 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -18,6 +18,8 @@
  struct p4_event_bind {
         unsigned int opcode;                    /* Event code and ESCR selector */
         unsigned int escr_msr[2];               /* ESCR MSR for this event */
+       unsigned int escr_emask;                /* valid ESCR EventMask bits */
+       unsigned int shared;                    /* event is shared across threads */
         char cntr[2][P4_CNTR_LIMIT];            /* counter index (offset), -1 on abscence */
  };
  
@@ -66,231 +68,435 @@ static struct p4_event_bind p4_event_bind_map[] = {
         [P4_EVENT_TC_DELIVER_MODE] = {
                 .opcode         = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
                 .escr_msr       = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID),
+               .shared         = 1,
                 .cntr           = { {4, 5, -1}, {6, 7, -1} },
         },
         [P4_EVENT_BPU_FETCH_REQUEST] = {
                 .opcode         = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
                 .escr_msr       = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS),
                 .cntr           = { {0, -1, -1}, {2, -1, -1} },
         },
         [P4_EVENT_ITLB_REFERENCE] = {
                 .opcode         = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
                 .escr_msr       = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK),
                 .cntr           = { {0, -1, -1}, {2, -1, -1} },
         },
         [P4_EVENT_MEMORY_CANCEL] = {
                 .opcode         = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
                 .escr_msr       = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF),
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_MEMORY_COMPLETE] = {
                 .opcode         = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
                 .escr_msr       = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC),
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_LOAD_PORT_REPLAY] = {
                 .opcode         = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
                 .escr_msr       = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD),
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_STORE_PORT_REPLAY] = {
                 .opcode         = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
                 .escr_msr       = { MSR_P4_SAAT_ESCR0 ,  MSR_P4_SAAT_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST),
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_MOB_LOAD_REPLAY] = {
                 .opcode         = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
                 .escr_msr       = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR),
                 .cntr           = { {0, -1, -1}, {2, -1, -1} },
         },
         [P4_EVENT_PAGE_WALK_TYPE] = {
                 .opcode         = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
                 .escr_msr       = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS),
+               .shared         = 1,
                 .cntr           = { {0, -1, -1}, {2, -1, -1} },
         },
         [P4_EVENT_BSQ_CACHE_REFERENCE] = {
                 .opcode         = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
                 .escr_msr       = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS),
                 .cntr           = { {0, -1, -1}, {2, -1, -1} },
         },
         [P4_EVENT_IOQ_ALLOCATION] = {
                 .opcode         = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
                 .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN)                 |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER)               |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH),
                 .cntr           = { {0, -1, -1}, {2, -1, -1} },
         },
         [P4_EVENT_IOQ_ACTIVE_ENTRIES] = {       /* shared ESCR */
                 .opcode         = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
                 .escr_msr       = { MSR_P4_FSB_ESCR1,  MSR_P4_FSB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ)        |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC)          |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC)          |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT)          |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP)          |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB)          |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH),
                 .cntr           = { {2, -1, -1}, {3, -1, -1} },
         },
         [P4_EVENT_FSB_DATA_ACTIVITY] = {
                 .opcode         = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
                 .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER),
+               .shared         = 1,
                 .cntr           = { {0, -1, -1}, {2, -1, -1} },
         },
         [P4_EVENT_BSQ_ALLOCATION] = {           /* shared ESCR, broken CCCR1 */
                 .opcode         = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
                 .escr_msr       = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE)      |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE)      |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE)        |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE)        |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2),
                 .cntr           = { {0, -1, -1}, {1, -1, -1} },
         },
         [P4_EVENT_BSQ_ACTIVE_ENTRIES] = {       /* shared ESCR */
                 .opcode         = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
                 .escr_msr       = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0)        |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1)        |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE)     |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE)  |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE)  |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE)    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE)    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1)       |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2),
                 .cntr           = { {2, -1, -1}, {3, -1, -1} },
         },
         [P4_EVENT_SSE_INPUT_ASSIST] = {
                 .opcode         = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
                 .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL),
+               .shared         = 1,
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_PACKED_SP_UOP] = {
                 .opcode         = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
                 .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL),
+               .shared         = 1,
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_PACKED_DP_UOP] = {
                 .opcode         = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
                 .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL),
+               .shared         = 1,
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_SCALAR_SP_UOP] = {
                 .opcode         = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
                 .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL),
+               .shared         = 1,
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_SCALAR_DP_UOP] = {
                 .opcode         = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
                 .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL),
+               .shared         = 1,
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_64BIT_MMX_UOP] = {
                 .opcode         = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
                 .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL),
+               .shared         = 1,
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_128BIT_MMX_UOP] = {
                 .opcode         = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
                 .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL),
+               .shared         = 1,
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_X87_FP_UOP] = {
                 .opcode         = P4_OPCODE(P4_EVENT_X87_FP_UOP),
                 .escr_msr       = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL),
+               .shared         = 1,
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_TC_MISC] = {
                 .opcode         = P4_OPCODE(P4_EVENT_TC_MISC),
                 .escr_msr       = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH),
                 .cntr           = { {4, 5, -1}, {6, 7, -1} },
         },
         [P4_EVENT_GLOBAL_POWER_EVENTS] = {
                 .opcode         = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
                 .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING),
                 .cntr           = { {0, -1, -1}, {2, -1, -1} },
         },
         [P4_EVENT_TC_MS_XFER] = {
                 .opcode         = P4_OPCODE(P4_EVENT_TC_MS_XFER),
                 .escr_msr       = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC),
                 .cntr           = { {4, 5, -1}, {6, 7, -1} },
         },
         [P4_EVENT_UOP_QUEUE_WRITES] = {
                 .opcode         = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
                 .escr_msr       = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD)     |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER)   |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM),
                 .cntr           = { {4, 5, -1}, {6, 7, -1} },
         },
         [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
                 .opcode         = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
                 .escr_msr       = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL)    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT),
                 .cntr           = { {4, 5, -1}, {6, 7, -1} },
         },
         [P4_EVENT_RETIRED_BRANCH_TYPE] = {
                 .opcode         = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
                 .escr_msr       = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL)    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN)         |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT),
                 .cntr           = { {4, 5, -1}, {6, 7, -1} },
         },
         [P4_EVENT_RESOURCE_STALL] = {
                 .opcode         = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
                 .escr_msr       = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL),
                 .cntr           = { {12, 13, 16}, {14, 15, 17} },
         },
         [P4_EVENT_WC_BUFFER] = {
                 .opcode         = P4_OPCODE(P4_EVENT_WC_BUFFER),
                 .escr_msr       = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS)               |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS),
+               .shared         = 1,
                 .cntr           = { {8, 9, -1}, {10, 11, -1} },
         },
         [P4_EVENT_B2B_CYCLES] = {
                 .opcode         = P4_OPCODE(P4_EVENT_B2B_CYCLES),
                 .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     = 0,
                 .cntr           = { {0, -1, -1}, {2, -1, -1} },
         },
         [P4_EVENT_BNR] = {
                 .opcode         = P4_OPCODE(P4_EVENT_BNR),
                 .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     = 0,
                 .cntr           = { {0, -1, -1}, {2, -1, -1} },
         },
         [P4_EVENT_SNOOP] = {
                 .opcode         = P4_OPCODE(P4_EVENT_SNOOP),
                 .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     = 0,
                 .cntr           = { {0, -1, -1}, {2, -1, -1} },
         },
         [P4_EVENT_RESPONSE] = {
                 .opcode         = P4_OPCODE(P4_EVENT_RESPONSE),
                 .escr_msr       = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+               .escr_emask     = 0,
                 .cntr           = { {0, -1, -1}, {2, -1, -1} },
         },
         [P4_EVENT_FRONT_END_EVENT] = {
                 .opcode         = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
                 .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS),
                 .cntr           = { {12, 13, 16}, {14, 15, 17} },
         },
         [P4_EVENT_EXECUTION_EVENT] = {
                 .opcode         = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
                 .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3),
                 .cntr           = { {12, 13, 16}, {14, 15, 17} },
         },
         [P4_EVENT_REPLAY_EVENT] = {
                 .opcode         = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
                 .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS),
                 .cntr           = { {12, 13, 16}, {14, 15, 17} },
         },
         [P4_EVENT_INSTR_RETIRED] = {
                 .opcode         = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
                 .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG)           |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)            |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG),
                 .cntr           = { {12, 13, 16}, {14, 15, 17} },
         },
         [P4_EVENT_UOPS_RETIRED] = {
                 .opcode         = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
                 .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS),
                 .cntr           = { {12, 13, 16}, {14, 15, 17} },
         },
         [P4_EVENT_UOP_TYPE] = {
                 .opcode         = P4_OPCODE(P4_EVENT_UOP_TYPE),
                 .escr_msr       = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS)                  |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES),
                 .cntr           = { {12, 13, 16}, {14, 15, 17} },
         },
         [P4_EVENT_BRANCH_RETIRED] = {
                 .opcode         = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
                 .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM),
                 .cntr           = { {12, 13, 16}, {14, 15, 17} },
         },
         [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
                 .opcode         = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
                 .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+               .escr_emask     =
+               P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
                 .cntr           = { {12, 13, 16}, {14, 15, 17} },
         },
         [P4_EVENT_X87_ASSIST] = {
                 .opcode         = P4_OPCODE(P4_EVENT_X87_ASSIST),
                 .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU)                    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO)                    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO)                    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU)                    |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA),
                 .cntr           = { {12, 13, 16}, {14, 15, 17} },
         },
         [P4_EVENT_MACHINE_CLEAR] = {
                 .opcode         = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
                 .escr_msr       = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR)                |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR)              |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR),
                 .cntr           = { {12, 13, 16}, {14, 15, 17} },
         },
         [P4_EVENT_INSTR_COMPLETED] = {
                 .opcode         = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
                 .escr_msr       = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+               .escr_emask     =
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS)             |
+                       P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS),
                 .cntr           = { {12, 13, 16}, {14, 15, 17} },
         },
  };
@@ -428,29 +634,73 @@ static u64 p4_pmu_event_map(int hw_event)
         return config;
  }
  
+/* check cpu model specifics */
+static bool p4_event_match_cpu_model(unsigned int event_idx)
+{
+       /* INSTR_COMPLETED event only exist for model 3, 4, 6 (Prescott) */
+       if (event_idx == P4_EVENT_INSTR_COMPLETED) {
+               if (boot_cpu_data.x86_model != 3 &&
+                       boot_cpu_data.x86_model != 4 &&
+                       boot_cpu_data.x86_model != 6)
+                       return false;
+       }
+
+       /*
+        * For info
+        * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2
+        */
+
+       return true;
+}
+
  static int p4_validate_raw_event(struct perf_event *event)
  {
-       unsigned int v;
+       unsigned int v, emask;
  
-       /* user data may have out-of-bound event index */
+       /* User data may have out-of-bound event index */
         v = p4_config_unpack_event(event->attr.config);
-       if (v >= ARRAY_SIZE(p4_event_bind_map)) {
-               pr_warning("P4 PMU: Unknown event code: %d\n", v);
+       if (v >= ARRAY_SIZE(p4_event_bind_map))
+               return -EINVAL;
+
+       /* It may be unsupported: */
+       if (!p4_event_match_cpu_model(v))
                 return -EINVAL;
+
+       /*
+        * NOTE: P4_CCCR_THREAD_ANY has not the same meaning as
+        * in Architectural Performance Monitoring, it means not
+        * on _which_ logical cpu to count but rather _when_, ie it
+        * depends on logical cpu state -- count event if one cpu active,
+        * none, both or any, so we just allow user to pass any value
+        * desired.
+        *
+        * In turn we always set Tx_OS/Tx_USR bits bound to logical
+        * cpu without their propagation to another cpu
+        */
+
+       /*
+        * if an event is shared accross the logical threads
+        * the user needs special permissions to be able to use it
+        */
+       if (p4_event_bind_map[v].shared) {
+               if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+                       return -EACCES;
         }
  
+       /* ESCR EventMask bits may be invalid */
+       emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK;
+       if (emask & ~p4_event_bind_map[v].escr_emask)
+               return -EINVAL;
+
         /*
-        * it may have some screwed PEBS bits
+        * it may have some invalid PEBS bits
          */
-       if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) {
-               pr_warning("P4 PMU: PEBS are not supported yet\n");
+       if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE))
                 return -EINVAL;
-       }
+
         v = p4_config_unpack_metric(event->attr.config);
-       if (v >= ARRAY_SIZE(p4_pebs_bind_map)) {
-               pr_warning("P4 PMU: Unknown metric code: %d\n", v);
+       if (v >= ARRAY_SIZE(p4_pebs_bind_map))
                 return -EINVAL;
-       }
  
         return 0;
  }
@@ -478,27 +728,21 @@ static int p4_hw_config(struct perf_event *event)
  
         if (event->attr.type == PERF_TYPE_RAW) {
  
+               /*
+                * Clear bits we reserve to be managed by kernel itself
+                * and never allowed from a user space
+                */
+                event->attr.config &= P4_CONFIG_MASK;
+
                 rc = p4_validate_raw_event(event);
                 if (rc)
                         goto out;
  
                 /*
-                * We don't control raw events so it's up to the caller
-                * to pass sane values (and we don't count the thread number
-                * on HT machine but allow HT-compatible specifics to be
-                * passed on)
-                *
                  * Note that for RAW events we allow user to use P4_CCCR_RESERVED
                  * bits since we keep additional info here (for cache events and etc)
-                *
-                * XXX: HT wide things should check perf_paranoid_cpu() &&
-                *      CAP_SYS_ADMIN
                  */
-               event->hw.config |= event->attr.config &
-                       (p4_config_pack_escr(P4_ESCR_MASK_HT) |
-                        p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED));
-
-               event->hw.config &= ~P4_CCCR_FORCE_OVF;
+               event->hw.config |= event->attr.config;
         }
  
         rc = x86_setup_perfctr(event);
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S

index 17be5ec7cbbad332973b6b46a79cdb3db2832f74..c375c79065f8f6c73172ab16704ece9ae78ea177 100644 (file)
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1023,9 +1023,9 @@ apicinterrupt ERROR_APIC_VECTOR \
  apicinterrupt SPURIOUS_APIC_VECTOR \
         spurious_interrupt smp_spurious_interrupt
  
-#ifdef CONFIG_PERF_EVENTS
-apicinterrupt LOCAL_PENDING_VECTOR \
-       perf_pending_interrupt smp_perf_pending_interrupt
+#ifdef CONFIG_IRQ_WORK
+apicinterrupt IRQ_WORK_VECTOR \
+       irq_work_interrupt smp_irq_work_interrupt
  #endif
  
  /*
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c

index cd37469b54eeed3fc479d2c931d8d2ed933ea411..3afb33f14d2d2c86a3c961d87aaae531d2631ac8 100644 (file)
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -257,14 +257,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code)
         return mod_code_status;
  }
  
-
-
-
-static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
-
  static unsigned char *ftrace_nop_replace(void)
  {
-       return ftrace_nop;
+       return ideal_nop5;
  }
  
  static int
@@ -338,62 +333,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
  
  int __init ftrace_dyn_arch_init(void *data)
  {
-       extern const unsigned char ftrace_test_p6nop[];
-       extern const unsigned char ftrace_test_nop5[];
-       extern const unsigned char ftrace_test_jmp[];
-       int faulted = 0;
-
-       /*
-        * There is no good nop for all x86 archs.
-        * We will default to using the P6_NOP5, but first we
-        * will test to make sure that the nop will actually
-        * work on this CPU. If it faults, we will then
-        * go to a lesser efficient 5 byte nop. If that fails
-        * we then just use a jmp as our nop. This isn't the most
-        * efficient nop, but we can not use a multi part nop
-        * since we would then risk being preempted in the middle
-        * of that nop, and if we enabled tracing then, it might
-        * cause a system crash.
-        *
-        * TODO: check the cpuid to determine the best nop.
-        */
-       asm volatile (
-               "ftrace_test_jmp:"
-               "jmp ftrace_test_p6nop\n"
-               "nop\n"
-               "nop\n"
-               "nop\n"  /* 2 byte jmp + 3 bytes */
-               "ftrace_test_p6nop:"
-               P6_NOP5
-               "jmp 1f\n"
-               "ftrace_test_nop5:"
-               ".byte 0x66,0x66,0x66,0x66,0x90\n"
-               "1:"
-               ".section .fixup, \"ax\"\n"
-               "2:     movl $1, %0\n"
-               "       jmp ftrace_test_nop5\n"
-               "3:     movl $2, %0\n"
-               "       jmp 1b\n"
-               ".previous\n"
-               _ASM_EXTABLE(ftrace_test_p6nop, 2b)
-               _ASM_EXTABLE(ftrace_test_nop5, 3b)
-               : "=r"(faulted) : "0" (faulted));
-
-       switch (faulted) {
-       case 0:
-               pr_info("converting mcount calls to 0f 1f 44 00 00\n");
-               memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
-               break;
-       case 1:
-               pr_info("converting mcount calls to 66 66 66 66 90\n");
-               memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
-               break;
-       case 2:
-               pr_info("converting mcount calls to jmp . + 5\n");
-               memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
-               break;
-       }
-
         /* The return code is retured via data */
         *(unsigned long *)data = 0;
  
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c

index 91fd0c70a18abddc28eff75e287c70dd5d37af7d..44edb03fc9ec6d046f608c0cf1416cc034bab2fe 100644 (file)
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -67,10 +67,10 @@ static int show_other_interrupts(struct seq_file *p, int prec)
         for_each_online_cpu(j)
                 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
         seq_printf(p, "  Performance monitoring interrupts\n");
-       seq_printf(p, "%*s: ", prec, "PND");
+       seq_printf(p, "%*s: ", prec, "IWI");
         for_each_online_cpu(j)
-               seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
-       seq_printf(p, "  Performance pending work\n");
+               seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
+       seq_printf(p, "  IRQ work interrupts\n");
  #endif
         if (x86_platform_ipi_callback) {
                 seq_printf(p, "%*s: ", prec, "PLT");
@@ -185,7 +185,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
         sum += irq_stats(cpu)->apic_timer_irqs;
         sum += irq_stats(cpu)->irq_spurious_count;
         sum += irq_stats(cpu)->apic_perf_irqs;
-       sum += irq_stats(cpu)->apic_pending_irqs;
+       sum += irq_stats(cpu)->apic_irq_work_irqs;
  #endif
         if (x86_platform_ipi_callback)
                 sum += irq_stats(cpu)->x86_platform_ipis;
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c

new file mode 100644 (file)

index 0000000..ca8f703
--- /dev/null
+++ b/arch/x86/kernel/irq_work.c
@@ -0,0 +1,30 @@
+/*
+ * x86 specific code for irq_work
+ *
+ * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/irq_work.h>
+#include <linux/hardirq.h>
+#include <asm/apic.h>
+
+void smp_irq_work_interrupt(struct pt_regs *regs)
+{
+       irq_enter();
+       ack_APIC_irq();
+       inc_irq_stat(apic_irq_work_irqs);
+       irq_work_run();
+       irq_exit();
+}
+
+void arch_irq_work_raise(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+       if (!cpu_has_apic)
+               return;
+
+       apic->send_IPI_self(IRQ_WORK_VECTOR);
+       apic_wait_icr_idle();
+#endif
+}
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c

index 990ae7cfc5783f131df476506bc9341574a466c2..713969b9266b587c824e37be65d7467280a5da2e 100644 (file)
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -224,9 +224,9 @@ static void __init apic_intr_init(void)
         alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
         alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
  
-       /* Performance monitoring interrupts: */
-# ifdef CONFIG_PERF_EVENTS
-       alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
+       /* IRQ work interrupts: */
+# ifdef CONFIG_IRQ_WORK
+       alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt);
  # endif
  
  #endif
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c

new file mode 100644 (file)

index 0000000..961b6b3
--- /dev/null
+++ b/arch/x86/kernel/jump_label.c
@@ -0,0 +1,50 @@
+/*
+ * jump label x86 support
+ *
+ * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
+ *
+ */
+#include <linux/jump_label.h>
+#include <linux/memory.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/cpu.h>
+#include <asm/kprobes.h>
+#include <asm/alternative.h>
+
+#ifdef HAVE_JUMP_LABEL
+
+union jump_code_union {
+       char code[JUMP_LABEL_NOP_SIZE];
+       struct {
+               char jump;
+               int offset;
+       } __attribute__((packed));
+};
+
+void arch_jump_label_transform(struct jump_entry *entry,
+                              enum jump_label_type type)
+{
+       union jump_code_union code;
+
+       if (type == JUMP_LABEL_ENABLE) {
+               code.jump = 0xe9;
+               code.offset = entry->target -
+                               (entry->code + JUMP_LABEL_NOP_SIZE);
+       } else
+               memcpy(&code, ideal_nop5, JUMP_LABEL_NOP_SIZE);
+       get_online_cpus();
+       mutex_lock(&text_mutex);
+       text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
+       mutex_unlock(&text_mutex);
+       put_online_cpus();
+}
+
+void arch_jump_label_text_poke_early(jump_label_t addr)
+{
+       text_poke_early((void *)addr, ideal_nop5, JUMP_LABEL_NOP_SIZE);
+}
+
+#endif
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c

index 770ebfb349e93efe3367cf0c6caff93b61b8b884..1cbd54c0df99189548a3a03f40fbb75a1703475a 100644 (file)
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -230,9 +230,6 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
         return 0;
  }
  
-/* Dummy buffers for kallsyms_lookup */
-static char __dummy_buf[KSYM_NAME_LEN];
-
  /* Check if paddr is at an instruction boundary */
  static int __kprobes can_probe(unsigned long paddr)
  {
@@ -241,7 +238,7 @@ static int __kprobes can_probe(unsigned long paddr)
         struct insn insn;
         kprobe_opcode_t buf[MAX_INSN_SIZE];
  
-       if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf))
+       if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
                 return 0;
  
         /* Decode instructions */
@@ -1129,7 +1126,7 @@ static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
         *(unsigned long *)addr = val;
  }
  
-void __kprobes kprobes_optinsn_template_holder(void)
+static void __used __kprobes kprobes_optinsn_template_holder(void)
  {
         asm volatile (
                         ".global optprobe_template_entry\n"
@@ -1221,7 +1218,8 @@ static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
         }
         /* Check whether the address range is reserved */
         if (ftrace_text_reserved(src, src + len - 1) ||
-           alternatives_text_reserved(src, src + len - 1))
+           alternatives_text_reserved(src, src + len - 1) ||
+           jump_label_text_reserved(src, src + len - 1))
                 return -EBUSY;
  
         return len;
@@ -1269,11 +1267,9 @@ static int __kprobes can_optimize(unsigned long paddr)
         unsigned long addr, size = 0, offset = 0;
         struct insn insn;
         kprobe_opcode_t buf[MAX_INSN_SIZE];
-       /* Dummy buffers for lookup_symbol_attrs */
-       static char __dummy_buf[KSYM_NAME_LEN];
  
         /* Lookup symbol including addr */
-       if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf))
+       if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
                 return 0;
  
         /* Check there is enough space for a relative jump. */
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c

index 1c355c550960ab75279c644b52269cd5a368239a..8f295609173524cdf06a20c6c0ad4fbe1de12263 100644 (file)
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -239,6 +239,9 @@ int module_finalize(const Elf_Ehdr *hdr,
                 apply_paravirt(pseg, pseg + para->sh_size);
         }
  
+       /* make jump label nops */
+       jump_label_apply_nops(me);
+
         return 0;
  }
  
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c

index 0f7f130caa6778d6077868e6e768b88f1439fc24..6015ee13e22be402cf4ef6f09ac7837db470c88b 100644 (file)
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -601,7 +601,7 @@ static void gart_fixup_northbridges(struct sys_device *dev)
                  * Don't enable translations just yet.  That is the next
                  * step.  Restore the pre-suspend aperture settings.
                  */
-               pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1);
+               gart_set_size_and_enable(dev, aperture_order);
                 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
         }
  }
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c

index c3a4fbb2b996d00277d6523cb76b74e2c5944621..00e167870f714ab0799838865eec35bd13ce5601 100644 (file)
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -112,6 +112,7 @@
  #include <asm/numa_64.h>
  #endif
  #include <asm/mce.h>
+#include <asm/alternative.h>
  
  /*
   * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -726,6 +727,7 @@ void __init setup_arch(char **cmdline_p)
  {
         int acpi = 0;
         int k8 = 0;
+       unsigned long flags;
  
  #ifdef CONFIG_X86_32
         memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
@@ -1071,6 +1073,10 @@ void __init setup_arch(char **cmdline_p)
         x86_init.oem.banner();
  
         mcheck_init();
+
+       local_irq_save(flags);
+       arch_init_ideal_nop5();
+       local_irq_restore(flags);
  }
  
  #ifdef CONFIG_X86_32
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c

index 81ed28cb36e692eb4413c294408d5498bee876e8..8a3f9f64f86f9e7fee5bc5112bf50a04fbe37b15 100644 (file)
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3163,8 +3163,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
         sync_lapic_to_cr8(vcpu);
  
         save_host_msrs(vcpu);
-       fs_selector = kvm_read_fs();
-       gs_selector = kvm_read_gs();
+       savesegment(fs, fs_selector);
+       savesegment(gs, gs_selector);
         ldt_selector = kvm_read_ldt();
         svm->vmcb->save.cr2 = vcpu->arch.cr2;
         /* required for live migration with NPT */
@@ -3251,10 +3251,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
         vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
         vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
  
-       kvm_load_fs(fs_selector);
-       kvm_load_gs(gs_selector);
-       kvm_load_ldt(ldt_selector);
         load_host_msrs(vcpu);
+       loadsegment(fs, fs_selector);
+#ifdef CONFIG_X86_64
+       load_gs_index(gs_selector);
+       wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
+#else
+       loadsegment(gs, gs_selector);
+#endif
+       kvm_load_ldt(ldt_selector);
  
         reload_tss(vcpu);
  
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c

index 49b25eee25acc075538a411fc24c23a326f02fd4..7bddfab120139435d1d10885520cc5293fefcc07 100644 (file)
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -803,7 +803,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
          */
         vmx->host_state.ldt_sel = kvm_read_ldt();
         vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
-       vmx->host_state.fs_sel = kvm_read_fs();
+       savesegment(fs, vmx->host_state.fs_sel);
         if (!(vmx->host_state.fs_sel & 7)) {
                 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
                 vmx->host_state.fs_reload_needed = 0;
@@ -811,7 +811,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
                 vmcs_write16(HOST_FS_SELECTOR, 0);
                 vmx->host_state.fs_reload_needed = 1;
         }
-       vmx->host_state.gs_sel = kvm_read_gs();
+       savesegment(gs, vmx->host_state.gs_sel);
         if (!(vmx->host_state.gs_sel & 7))
                 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
         else {
@@ -841,27 +841,21 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
  
  static void __vmx_load_host_state(struct vcpu_vmx *vmx)
  {
-       unsigned long flags;
-
         if (!vmx->host_state.loaded)
                 return;
  
         ++vmx->vcpu.stat.host_state_reload;
         vmx->host_state.loaded = 0;
         if (vmx->host_state.fs_reload_needed)
-               kvm_load_fs(vmx->host_state.fs_sel);
+               loadsegment(fs, vmx->host_state.fs_sel);
         if (vmx->host_state.gs_ldt_reload_needed) {
                 kvm_load_ldt(vmx->host_state.ldt_sel);
-               /*
-                * If we have to reload gs, we must take care to
-                * preserve our gs base.
-                */
-               local_irq_save(flags);
-               kvm_load_gs(vmx->host_state.gs_sel);
  #ifdef CONFIG_X86_64
-               wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
+               load_gs_index(vmx->host_state.gs_sel);
+               wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
+#else
+               loadsegment(gs, vmx->host_state.gs_sel);
  #endif
-               local_irq_restore(flags);
         }
         reload_tss();
  #ifdef CONFIG_X86_64
@@ -2589,8 +2583,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
         vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
         vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
-       vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs());    /* 22.2.4 */
-       vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs());    /* 22.2.4 */
+       vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
+       vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
         vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
  #ifdef CONFIG_X86_64
         rdmsrl(MSR_FS_BASE, a);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c

index 4c4508e8a2043015c1cce3eb49a46c0c435e9297..a24c6cfdccc47da8a12f16b2cd4a09caaf825b9a 100644 (file)
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -251,6 +251,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
         if (!(address >= VMALLOC_START && address < VMALLOC_END))
                 return -1;
  
+       WARN_ON_ONCE(in_nmi());
+
         /*
          * Synchronize this task's top level page-table
          * with the 'reference' page table.
@@ -369,6 +371,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
         if (!(address >= VMALLOC_START && address < VMALLOC_END))
                 return -1;
  
+       WARN_ON_ONCE(in_nmi());
+
         /*
          * Copy kernel mappings over when needed. This can also
          * happen within a race in page table update. In the later
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c

index b3b531a4f8e587e3560b2087e9c483b2cb1b0f93..d87dd6d042d64309fedac698dd1a5b28a8448594 100644 (file)
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -631,6 +631,8 @@ bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
         if (!pte)
                 return false;
  
+       WARN_ON_ONCE(in_nmi());
+
         if (error_code & 2)
                 kmemcheck_access(regs, address, KMEMCHECK_WRITE);
         else
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c

index 3855096c59b81910fe827c76151e84bbac836770..2d49d4e19a3619c0be2c7d17a892b8aea582048f 100644 (file)
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -14,6 +14,7 @@
  #include <asm/ptrace.h>
  #include <asm/uaccess.h>
  #include <asm/stacktrace.h>
+#include <linux/compat.h>
  
  static void backtrace_warning_symbol(void *data, char *msg,
                                      unsigned long symbol)
@@ -48,14 +49,12 @@ static struct stacktrace_ops backtrace_ops = {
         .walk_stack     = print_context_stack,
  };
  
-struct frame_head {
-       struct frame_head *bp;
-       unsigned long ret;
-} __attribute__((packed));
-
-static struct frame_head *dump_user_backtrace(struct frame_head *head)
+#ifdef CONFIG_COMPAT
+static struct stack_frame_ia32 *
+dump_user_backtrace_32(struct stack_frame_ia32 *head)
  {
-       struct frame_head bufhead[2];
+       struct stack_frame_ia32 bufhead[2];
+       struct stack_frame_ia32 *fp;
  
         /* Also check accessibility of one struct frame_head beyond */
         if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
@@ -63,20 +62,66 @@ static struct frame_head *dump_user_backtrace(struct frame_head *head)
         if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
                 return NULL;
  
-       oprofile_add_trace(bufhead[0].ret);
+       fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame);
+
+       oprofile_add_trace(bufhead[0].return_address);
+
+       /* frame pointers should strictly progress back up the stack
+       * (towards higher addresses) */
+       if (head >= fp)
+               return NULL;
+
+       return fp;
+}
+
+static inline int
+x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
+{
+       struct stack_frame_ia32 *head;
+
+       /* User process is 32-bit */
+       if (!current || !test_thread_flag(TIF_IA32))
+               return 0;
+
+       head = (struct stack_frame_ia32 *) regs->bp;
+       while (depth-- && head)
+               head = dump_user_backtrace_32(head);
+
+       return 1;
+}
+
+#else
+static inline int
+x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
+{
+       return 0;
+}
+#endif /* CONFIG_COMPAT */
+
+static struct stack_frame *dump_user_backtrace(struct stack_frame *head)
+{
+       struct stack_frame bufhead[2];
+
+       /* Also check accessibility of one struct stack_frame beyond */
+       if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
+               return NULL;
+       if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
+               return NULL;
+
+       oprofile_add_trace(bufhead[0].return_address);
  
         /* frame pointers should strictly progress back up the stack
          * (towards higher addresses) */
-       if (head >= bufhead[0].bp)
+       if (head >= bufhead[0].next_frame)
                 return NULL;
  
-       return bufhead[0].bp;
+       return bufhead[0].next_frame;
  }
  
  void
  x86_backtrace(struct pt_regs * const regs, unsigned int depth)
  {
-       struct frame_head *head = (struct frame_head *)frame_pointer(regs);
+       struct stack_frame *head = (struct stack_frame *)frame_pointer(regs);
  
         if (!user_mode_vm(regs)) {
                 unsigned long stack = kernel_stack_pointer(regs);
@@ -86,6 +131,9 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
                 return;
         }
  
+       if (x86_backtrace_32(regs, depth))
+               return;
+
         while (depth-- && head)
                 head = dump_user_backtrace(head);
  }
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c

index f1575c9a2572074cad464ae2f9c51ee88e63bce3..bd1489c3ce09b7416c400701f7fb407eae673615 100644 (file)
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -695,9 +695,6 @@ static int __init ppro_init(char **cpu_type)
         return 1;
  }
  
-/* in order to get sysfs right */
-static int using_nmi;
-
  int __init op_nmi_init(struct oprofile_operations *ops)
  {
         __u8 vendor = boot_cpu_data.x86_vendor;
@@ -705,8 +702,6 @@ int __init op_nmi_init(struct oprofile_operations *ops)
         char *cpu_type = NULL;
         int ret = 0;
  
-       using_nmi = 0;
-
         if (!cpu_has_apic)
                 return -ENODEV;
  
@@ -790,13 +785,11 @@ int __init op_nmi_init(struct oprofile_operations *ops)
         if (ret)
                 return ret;
  
-       using_nmi = 1;
         printk(KERN_INFO "oprofile: using NMI interrupt.\n");
         return 0;
  }
  
  void op_nmi_exit(void)
  {
-       if (using_nmi)
-               exit_sysfs();
+       exit_sysfs();
  }
diff --git a/block/bsg.c b/block/bsg.c

index 82d58829ba591eeef13a3ed4eca96b8a019c4d85..0c00870553a3fca2c58c325b780b92689fc677a6 100644 (file)
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -426,7 +426,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
         /*
          * fill in all the output members
          */
-       hdr->device_status = status_byte(rq->errors);
+       hdr->device_status = rq->errors & 0xff;
         hdr->transport_status = host_byte(rq->errors);
         hdr->driver_status = driver_byte(rq->errors);
         hdr->info = 0;
diff --git a/drivers/atm/iphase.c b/drivers/atm/iphase.c

index ee9ddeb53417c7da782d252f7d9e4f8113ee44b4..8cb0347dec2848e4d6c33f5276b9ba986a9f3fac 100644 (file)
--- a/drivers/atm/iphase.c
+++ b/drivers/atm/iphase.c
@@ -3156,7 +3156,6 @@ static int __devinit ia_init_one(struct pci_dev *pdev,
  {  
         struct atm_dev *dev;  
         IADEV *iadev;  
-        unsigned long flags;
         int ret;
  
         iadev = kzalloc(sizeof(*iadev), GFP_KERNEL);
@@ -3188,19 +3187,14 @@ static int __devinit ia_init_one(struct pci_dev *pdev,
         ia_dev[iadev_count] = iadev;
         _ia_dev[iadev_count] = dev;
         iadev_count++;
-       spin_lock_init(&iadev->misc_lock);
-       /* First fixes first. I don't want to think about this now. */
-       spin_lock_irqsave(&iadev->misc_lock, flags); 
         if (ia_init(dev) || ia_start(dev)) {  
                 IF_INIT(printk("IA register failed!\n");)
                 iadev_count--;
                 ia_dev[iadev_count] = NULL;
                 _ia_dev[iadev_count] = NULL;
-               spin_unlock_irqrestore(&iadev->misc_lock, flags); 
                 ret = -EINVAL;
                 goto err_out_deregister_dev;
         }
-       spin_unlock_irqrestore(&iadev->misc_lock, flags); 
         IF_EVENT(printk("iadev_count = %d\n", iadev_count);)
  
         iadev->next_board = ia_boards;  
diff --git a/drivers/atm/iphase.h b/drivers/atm/iphase.h

index b2cd20f549cb4d474edb0a2a9a2f419d40d5072b..077735e0e04bfdd1d12f048ac8006f8a598832f8 100644 (file)
--- a/drivers/atm/iphase.h
+++ b/drivers/atm/iphase.h
@@ -1022,7 +1022,7 @@ typedef struct iadev_t {
         struct dle_q rx_dle_q;  
         struct free_desc_q *rx_free_desc_qhead;  
         struct sk_buff_head rx_dma_q;  
-        spinlock_t rx_lock, misc_lock;
+       spinlock_t rx_lock;
         struct atm_vcc **rx_open;       /* list of all open VCs */  
          u16 num_rx_desc, rx_buf_sz, rxing;
          u32 rx_pkt_ram, rx_tmp_cnt;
diff --git a/drivers/atm/solos-pci.c b/drivers/atm/solos-pci.c

index f916ddf63938a03444c2ad6e4d8a6b63c46212aa..f46138ab38b6c310fffc589681f727840a05c646 100644 (file)
--- a/drivers/atm/solos-pci.c
+++ b/drivers/atm/solos-pci.c
@@ -444,6 +444,7 @@ static ssize_t console_show(struct device *dev, struct device_attribute *attr,
         struct atm_dev *atmdev = container_of(dev, struct atm_dev, class_dev);
         struct solos_card *card = atmdev->dev_data;
         struct sk_buff *skb;
+       unsigned int len;
  
         spin_lock(&card->cli_queue_lock);
         skb = skb_dequeue(&card->cli_queue[SOLOS_CHAN(atmdev)]);
@@ -451,11 +452,12 @@ static ssize_t console_show(struct device *dev, struct device_attribute *attr,
         if(skb == NULL)
                 return sprintf(buf, "No data.\n");
  
-       memcpy(buf, skb->data, skb->len);
-       dev_dbg(&card->dev->dev, "len: %d\n", skb->len);
+       len = skb->len;
+       memcpy(buf, skb->data, len);
+       dev_dbg(&card->dev->dev, "len: %d\n", len);
  
         kfree_skb(skb);
-       return skb->len;
+       return len;
  }
  
  static int send_command(struct solos_card *card, int dev, const char *buf, size_t size)
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig

index de277689da6153fac725e54e50f752e7a6ee5978..4b9359a6f6ca45c4cd5f8803b5e7cdb9ba496678 100644 (file)
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -488,4 +488,21 @@ config BLK_DEV_HD
  
           If unsure, say N.
  
+config BLK_DEV_RBD
+       tristate "Rados block device (RBD)"
+       depends on INET && EXPERIMENTAL && BLOCK
+       select CEPH_LIB
+       select LIBCRC32C
+       select CRYPTO_AES
+       select CRYPTO
+       default n
+       help
+         Say Y here if you want include the Rados block device, which stripes
+         a block device over objects stored in the Ceph distributed object
+         store.
+
+         More information at http://ceph.newdream.net/.
+
+         If unsure, say N.
+
  endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile

index aff5ac925c34332f235e523164f7f85cd76b6751..d7f463d6312d6494c3ef795c5b96ff11fa0ec801 100644 (file)
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -37,5 +37,6 @@ obj-$(CONFIG_BLK_DEV_HD)      += hd.o
  
  obj-$(CONFIG_XEN_BLKDEV_FRONTEND)      += xen-blkfront.o
  obj-$(CONFIG_BLK_DEV_DRBD)     += drbd/
+obj-$(CONFIG_BLK_DEV_RBD)     += rbd.o
  
  swim_mod-objs  := swim.o swim_asm.o
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c

index e9da874d04192b125561f4b71d8ba21ce55fadcc..03688c2da319c007f4923c4ffd989e4f9666b755 100644 (file)
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -113,7 +113,7 @@ static void ps3disk_scatter_gather(struct ps3_storage_device *dev,
                         memcpy(buf, dev->bounce_buf+offset, size);
                 offset += size;
                 flush_kernel_dcache_page(bvec->bv_page);
-               bvec_kunmap_irq(bvec, &flags);
+               bvec_kunmap_irq(buf, &flags);
                 i++;
         }
  }
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c

new file mode 100644 (file)

index 0000000..6ec9d53
--- /dev/null
+++ b/drivers/block/rbd.c
@@ -0,0 +1,1841 @@
+/*
+   rbd.c -- Export ceph rados objects as a Linux block device
+
+
+   based on drivers/block/osdblk.c:
+
+   Copyright 2009 Red Hat, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+
+   Instructions for use
+   --------------------
+
+   1) Map a Linux block device to an existing rbd image.
+
+      Usage: <mon ip addr> <options> <pool name> <rbd image name> [snap name]
+
+      $ echo "192.168.0.1 name=admin rbd foo" > /sys/class/rbd/add
+
+      The snapshot name can be "-" or omitted to map the image read/write.
+
+   2) List all active blkdev<->object mappings.
+
+      In this example, we have performed step #1 twice, creating two blkdevs,
+      mapped to two separate rados objects in the rados rbd pool
+
+      $ cat /sys/class/rbd/list
+      #id     major   client_name     pool    name    snap    KB
+      0       254     client4143      rbd     foo     -      1024000
+
+      The columns, in order, are:
+      - blkdev unique id
+      - blkdev assigned major
+      - rados client id
+      - rados pool name
+      - rados block device name
+      - mapped snapshot ("-" if none)
+      - device size in KB
+
+
+   3) Create a snapshot.
+
+      Usage: <blkdev id> <snapname>
+
+      $ echo "0 mysnap" > /sys/class/rbd/snap_create
+
+
+   4) Listing a snapshot.
+
+      $ cat /sys/class/rbd/snaps_list
+      #id     snap    KB
+      0       -       1024000 (*)
+      0       foo     1024000
+
+      The columns, in order, are:
+      - blkdev unique id
+      - snapshot name, '-' means none (active read/write version)
+      - size of device at time of snapshot
+      - the (*) indicates this is the active version
+
+   5) Rollback to snapshot.
+
+      Usage: <blkdev id> <snapname>
+
+      $ echo "0 mysnap" > /sys/class/rbd/snap_rollback
+
+
+   6) Mapping an image using snapshot.
+
+      A snapshot mapping is read-only. This is being done by passing
+      snap=<snapname> to the options when adding a device.
+
+      $ echo "192.168.0.1 name=admin,snap=mysnap rbd foo" > /sys/class/rbd/add
+
+
+   7) Remove an active blkdev<->rbd image mapping.
+
+      In this example, we remove the mapping with blkdev unique id 1.
+
+      $ echo 1 > /sys/class/rbd/remove
+
+
+   NOTE:  The actual creation and deletion of rados objects is outside the scope
+   of this driver.
+
+ */
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osd_client.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/decode.h>
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+
+#include "rbd_types.h"
+
+#define DRV_NAME "rbd"
+#define DRV_NAME_LONG "rbd (rados block device)"
+
+#define RBD_MINORS_PER_MAJOR   256             /* max minors per blkdev */
+
+#define RBD_MAX_MD_NAME_LEN    (96 + sizeof(RBD_SUFFIX))
+#define RBD_MAX_POOL_NAME_LEN  64
+#define RBD_MAX_SNAP_NAME_LEN  32
+#define RBD_MAX_OPT_LEN                1024
+
+#define RBD_SNAP_HEAD_NAME     "-"
+
+#define DEV_NAME_LEN           32
+
+/*
+ * block device image metadata (in-memory version)
+ */
+struct rbd_image_header {
+       u64 image_size;
+       char block_name[32];
+       __u8 obj_order;
+       __u8 crypt_type;
+       __u8 comp_type;
+       struct rw_semaphore snap_rwsem;
+       struct ceph_snap_context *snapc;
+       size_t snap_names_len;
+       u64 snap_seq;
+       u32 total_snaps;
+
+       char *snap_names;
+       u64 *snap_sizes;
+};
+
+/*
+ * an instance of the client.  multiple devices may share a client.
+ */
+struct rbd_client {
+       struct ceph_client      *client;
+       struct kref             kref;
+       struct list_head        node;
+};
+
+/*
+ * a single io request
+ */
+struct rbd_request {
+       struct request          *rq;            /* blk layer request */
+       struct bio              *bio;           /* cloned bio */
+       struct page             **pages;        /* list of used pages */
+       u64                     len;
+};
+
+/*
+ * a single device
+ */
+struct rbd_device {
+       int                     id;             /* blkdev unique id */
+
+       int                     major;          /* blkdev assigned major */
+       struct gendisk          *disk;          /* blkdev's gendisk and rq */
+       struct request_queue    *q;
+
+       struct ceph_client      *client;
+       struct rbd_client       *rbd_client;
+
+       char                    name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
+
+       spinlock_t              lock;           /* queue lock */
+
+       struct rbd_image_header header;
+       char                    obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
+       int                     obj_len;
+       char                    obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
+       char                    pool_name[RBD_MAX_POOL_NAME_LEN];
+       int                     poolid;
+
+       char                    snap_name[RBD_MAX_SNAP_NAME_LEN];
+       u32 cur_snap;   /* index+1 of current snapshot within snap context
+                          0 - for the head */
+       int read_only;
+
+       struct list_head        node;
+};
+
+static spinlock_t node_lock;      /* protects client get/put */
+
+static struct class *class_rbd;          /* /sys/class/rbd */
+static DEFINE_MUTEX(ctl_mutex);          /* Serialize open/close/setup/teardown */
+static LIST_HEAD(rbd_dev_list);    /* devices */
+static LIST_HEAD(rbd_client_list);      /* clients */
+
+
+static int rbd_open(struct block_device *bdev, fmode_t mode)
+{
+       struct gendisk *disk = bdev->bd_disk;
+       struct rbd_device *rbd_dev = disk->private_data;
+
+       set_device_ro(bdev, rbd_dev->read_only);
+
+       if ((mode & FMODE_WRITE) && rbd_dev->read_only)
+               return -EROFS;
+
+       return 0;
+}
+
+static const struct block_device_operations rbd_bd_ops = {
+       .owner                  = THIS_MODULE,
+       .open                   = rbd_open,
+};
+
+/*
+ * Initialize an rbd client instance.
+ * We own *opt.
+ */
+static struct rbd_client *rbd_client_create(struct ceph_options *opt)
+{
+       struct rbd_client *rbdc;
+       int ret = -ENOMEM;
+
+       dout("rbd_client_create\n");
+       rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
+       if (!rbdc)
+               goto out_opt;
+
+       kref_init(&rbdc->kref);
+       INIT_LIST_HEAD(&rbdc->node);
+
+       rbdc->client = ceph_create_client(opt, rbdc);
+       if (IS_ERR(rbdc->client))
+               goto out_rbdc;
+       opt = NULL; /* Now rbdc->client is responsible for opt */
+
+       ret = ceph_open_session(rbdc->client);
+       if (ret < 0)
+               goto out_err;
+
+       spin_lock(&node_lock);
+       list_add_tail(&rbdc->node, &rbd_client_list);
+       spin_unlock(&node_lock);
+
+       dout("rbd_client_create created %p\n", rbdc);
+       return rbdc;
+
+out_err:
+       ceph_destroy_client(rbdc->client);
+out_rbdc:
+       kfree(rbdc);
+out_opt:
+       if (opt)
+               ceph_destroy_options(opt);
+       return ERR_PTR(ret);
+}
+
+/*
+ * Find a ceph client with specific addr and configuration.
+ */
+static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
+{
+       struct rbd_client *client_node;
+
+       if (opt->flags & CEPH_OPT_NOSHARE)
+               return NULL;
+
+       list_for_each_entry(client_node, &rbd_client_list, node)
+               if (ceph_compare_options(opt, client_node->client) == 0)
+                       return client_node;
+       return NULL;
+}
+
+/*
+ * Get a ceph client with specific addr and configuration, if one does
+ * not exist create it.
+ */
+static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
+                         char *options)
+{
+       struct rbd_client *rbdc;
+       struct ceph_options *opt;
+       int ret;
+
+       ret = ceph_parse_options(&opt, options, mon_addr,
+                                mon_addr + strlen(mon_addr), NULL, NULL);
+       if (ret < 0)
+               return ret;
+
+       spin_lock(&node_lock);
+       rbdc = __rbd_client_find(opt);
+       if (rbdc) {
+               ceph_destroy_options(opt);
+
+               /* using an existing client */
+               kref_get(&rbdc->kref);
+               rbd_dev->rbd_client = rbdc;
+               rbd_dev->client = rbdc->client;
+               spin_unlock(&node_lock);
+               return 0;
+       }
+       spin_unlock(&node_lock);
+
+       rbdc = rbd_client_create(opt);
+       if (IS_ERR(rbdc))
+               return PTR_ERR(rbdc);
+
+       rbd_dev->rbd_client = rbdc;
+       rbd_dev->client = rbdc->client;
+       return 0;
+}
+
+/*
+ * Destroy ceph client
+ */
+static void rbd_client_release(struct kref *kref)
+{
+       struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
+
+       dout("rbd_release_client %p\n", rbdc);
+       spin_lock(&node_lock);
+       list_del(&rbdc->node);
+       spin_unlock(&node_lock);
+
+       ceph_destroy_client(rbdc->client);
+       kfree(rbdc);
+}
+
+/*
+ * Drop reference to ceph client node. If it's not referenced anymore, release
+ * it.
+ */
+static void rbd_put_client(struct rbd_device *rbd_dev)
+{
+       kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
+       rbd_dev->rbd_client = NULL;
+       rbd_dev->client = NULL;
+}
+
+
+/*
+ * Create a new header structure, translate header format from the on-disk
+ * header.
+ */
+static int rbd_header_from_disk(struct rbd_image_header *header,
+                                struct rbd_image_header_ondisk *ondisk,
+                                int allocated_snaps,
+                                gfp_t gfp_flags)
+{
+       int i;
+       u32 snap_count = le32_to_cpu(ondisk->snap_count);
+       int ret = -ENOMEM;
+
+       init_rwsem(&header->snap_rwsem);
+
+       header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
+       header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
+                               snap_count *
+                                sizeof(struct rbd_image_snap_ondisk),
+                               gfp_flags);
+       if (!header->snapc)
+               return -ENOMEM;
+       if (snap_count) {
+               header->snap_names = kmalloc(header->snap_names_len,
+                                            GFP_KERNEL);
+               if (!header->snap_names)
+                       goto err_snapc;
+               header->snap_sizes = kmalloc(snap_count * sizeof(u64),
+                                            GFP_KERNEL);
+               if (!header->snap_sizes)
+                       goto err_names;
+       } else {
+               header->snap_names = NULL;
+               header->snap_sizes = NULL;
+       }
+       memcpy(header->block_name, ondisk->block_name,
+              sizeof(ondisk->block_name));
+
+       header->image_size = le64_to_cpu(ondisk->image_size);
+       header->obj_order = ondisk->options.order;
+       header->crypt_type = ondisk->options.crypt_type;
+       header->comp_type = ondisk->options.comp_type;
+
+       atomic_set(&header->snapc->nref, 1);
+       header->snap_seq = le64_to_cpu(ondisk->snap_seq);
+       header->snapc->num_snaps = snap_count;
+       header->total_snaps = snap_count;
+
+       if (snap_count &&
+           allocated_snaps == snap_count) {
+               for (i = 0; i < snap_count; i++) {
+                       header->snapc->snaps[i] =
+                               le64_to_cpu(ondisk->snaps[i].id);
+                       header->snap_sizes[i] =
+                               le64_to_cpu(ondisk->snaps[i].image_size);
+               }
+
+               /* copy snapshot names */
+               memcpy(header->snap_names, &ondisk->snaps[i],
+                       header->snap_names_len);
+       }
+
+       return 0;
+
+err_names:
+       kfree(header->snap_names);
+err_snapc:
+       kfree(header->snapc);
+       return ret;
+}
+
+static int snap_index(struct rbd_image_header *header, int snap_num)
+{
+       return header->total_snaps - snap_num;
+}
+
+static u64 cur_snap_id(struct rbd_device *rbd_dev)
+{
+       struct rbd_image_header *header = &rbd_dev->header;
+
+       if (!rbd_dev->cur_snap)
+               return 0;
+
+       return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
+}
+
+static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
+                       u64 *seq, u64 *size)
+{
+       int i;
+       char *p = header->snap_names;
+
+       for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
+               if (strcmp(snap_name, p) == 0)
+                       break;
+       }
+       if (i == header->total_snaps)
+               return -ENOENT;
+       if (seq)
+               *seq = header->snapc->snaps[i];
+
+       if (size)
+               *size = header->snap_sizes[i];
+
+       return i;
+}
+
+static int rbd_header_set_snap(struct rbd_device *dev,
+                              const char *snap_name,
+                              u64 *size)
+{
+       struct rbd_image_header *header = &dev->header;
+       struct ceph_snap_context *snapc = header->snapc;
+       int ret = -ENOENT;
+
+       down_write(&header->snap_rwsem);
+
+       if (!snap_name ||
+           !*snap_name ||
+           strcmp(snap_name, "-") == 0 ||
+           strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) {
+               if (header->total_snaps)
+                       snapc->seq = header->snap_seq;
+               else
+                       snapc->seq = 0;
+               dev->cur_snap = 0;
+               dev->read_only = 0;
+               if (size)
+                       *size = header->image_size;
+       } else {
+               ret = snap_by_name(header, snap_name, &snapc->seq, size);
+               if (ret < 0)
+                       goto done;
+
+               dev->cur_snap = header->total_snaps - ret;
+               dev->read_only = 1;
+       }
+
+       ret = 0;
+done:
+       up_write(&header->snap_rwsem);
+       return ret;
+}
+
+static void rbd_header_free(struct rbd_image_header *header)
+{
+       kfree(header->snapc);
+       kfree(header->snap_names);
+       kfree(header->snap_sizes);
+}
+
+/*
+ * get the actual striped segment name, offset and length
+ */
+static u64 rbd_get_segment(struct rbd_image_header *header,
+                          const char *block_name,
+                          u64 ofs, u64 len,
+                          char *seg_name, u64 *segofs)
+{
+       u64 seg = ofs >> header->obj_order;
+
+       if (seg_name)
+               snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
+                        "%s.%012llx", block_name, seg);
+
+       ofs = ofs & ((1 << header->obj_order) - 1);
+       len = min_t(u64, len, (1 << header->obj_order) - ofs);
+
+       if (segofs)
+               *segofs = ofs;
+
+       return len;
+}
+
+/*
+ * bio helpers
+ */
+
+static void bio_chain_put(struct bio *chain)
+{
+       struct bio *tmp;
+
+       while (chain) {
+               tmp = chain;
+               chain = chain->bi_next;
+               bio_put(tmp);
+       }
+}
+
+/*
+ * zeros a bio chain, starting at specific offset
+ */
+static void zero_bio_chain(struct bio *chain, int start_ofs)
+{
+       struct bio_vec *bv;
+       unsigned long flags;
+       void *buf;
+       int i;
+       int pos = 0;
+
+       while (chain) {
+               bio_for_each_segment(bv, chain, i) {
+                       if (pos + bv->bv_len > start_ofs) {
+                               int remainder = max(start_ofs - pos, 0);
+                               buf = bvec_kmap_irq(bv, &flags);
+                               memset(buf + remainder, 0,
+                                      bv->bv_len - remainder);
+                               bvec_kunmap_irq(buf, &flags);
+                       }
+                       pos += bv->bv_len;
+               }
+
+               chain = chain->bi_next;
+       }
+}
+
+/*
+ * bio_chain_clone - clone a chain of bios up to a certain length.
+ * might return a bio_pair that will need to be released.
+ */
+static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
+                                  struct bio_pair **bp,
+                                  int len, gfp_t gfpmask)
+{
+       struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
+       int total = 0;
+
+       if (*bp) {
+               bio_pair_release(*bp);
+               *bp = NULL;
+       }
+
+       while (old_chain && (total < len)) {
+               tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
+               if (!tmp)
+                       goto err_out;
+
+               if (total + old_chain->bi_size > len) {
+                       struct bio_pair *bp;
+
+                       /*
+                        * this split can only happen with a single paged bio,
+                        * split_bio will BUG_ON if this is not the case
+                        */
+                       dout("bio_chain_clone split! total=%d remaining=%d"
+                            "bi_size=%d\n",
+                            (int)total, (int)len-total,
+                            (int)old_chain->bi_size);
+
+                       /* split the bio. We'll release it either in the next
+                          call, or it will have to be released outside */
+                       bp = bio_split(old_chain, (len - total) / 512ULL);
+                       if (!bp)
+                               goto err_out;
+
+                       __bio_clone(tmp, &bp->bio1);
+
+                       *next = &bp->bio2;
+               } else {
+                       __bio_clone(tmp, old_chain);
+                       *next = old_chain->bi_next;
+               }
+
+               tmp->bi_bdev = NULL;
+               gfpmask &= ~__GFP_WAIT;
+               tmp->bi_next = NULL;
+
+               if (!new_chain) {
+                       new_chain = tail = tmp;
+               } else {
+                       tail->bi_next = tmp;
+                       tail = tmp;
+               }
+               old_chain = old_chain->bi_next;
+
+               total += tmp->bi_size;
+       }
+
+       BUG_ON(total < len);
+
+       if (tail)
+               tail->bi_next = NULL;
+
+       *old = old_chain;
+
+       return new_chain;
+
+err_out:
+       dout("bio_chain_clone with err\n");
+       bio_chain_put(new_chain);
+       return NULL;
+}
+
+/*
+ * helpers for osd request op vectors.
+ */
+static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
+                           int num_ops,
+                           int opcode,
+                           u32 payload_len)
+{
+       *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
+                      GFP_NOIO);
+       if (!*ops)
+               return -ENOMEM;
+       (*ops)[0].op = opcode;
+       /*
+        * op extent offset and length will be set later on
+        * in calc_raw_layout()
+        */
+       (*ops)[0].payload_len = payload_len;
+       return 0;
+}
+
+static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
+{
+       kfree(ops);
+}
+
+/*
+ * Send ceph osd request
+ */
+static int rbd_do_request(struct request *rq,
+                         struct rbd_device *dev,
+                         struct ceph_snap_context *snapc,
+                         u64 snapid,
+                         const char *obj, u64 ofs, u64 len,
+                         struct bio *bio,
+                         struct page **pages,
+                         int num_pages,
+                         int flags,
+                         struct ceph_osd_req_op *ops,
+                         int num_reply,
+                         void (*rbd_cb)(struct ceph_osd_request *req,
+                                        struct ceph_msg *msg))
+{
+       struct ceph_osd_request *req;
+       struct ceph_file_layout *layout;
+       int ret;
+       u64 bno;
+       struct timespec mtime = CURRENT_TIME;
+       struct rbd_request *req_data;
+       struct ceph_osd_request_head *reqhead;
+       struct rbd_image_header *header = &dev->header;
+
+       ret = -ENOMEM;
+       req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
+       if (!req_data)
+               goto done;
+
+       dout("rbd_do_request len=%lld ofs=%lld\n", len, ofs);
+
+       down_read(&header->snap_rwsem);
+
+       req = ceph_osdc_alloc_request(&dev->client->osdc, flags,
+                                     snapc,
+                                     ops,
+                                     false,
+                                     GFP_NOIO, pages, bio);
+       if (IS_ERR(req)) {
+               up_read(&header->snap_rwsem);
+               ret = PTR_ERR(req);
+               goto done_pages;
+       }
+
+       req->r_callback = rbd_cb;
+
+       req_data->rq = rq;
+       req_data->bio = bio;
+       req_data->pages = pages;
+       req_data->len = len;
+
+       req->r_priv = req_data;
+
+       reqhead = req->r_request->front.iov_base;
+       reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
+
+       strncpy(req->r_oid, obj, sizeof(req->r_oid));
+       req->r_oid_len = strlen(req->r_oid);
+
+       layout = &req->r_file_layout;
+       memset(layout, 0, sizeof(*layout));
+       layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+       layout->fl_stripe_count = cpu_to_le32(1);
+       layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+       layout->fl_pg_preferred = cpu_to_le32(-1);
+       layout->fl_pg_pool = cpu_to_le32(dev->poolid);
+       ceph_calc_raw_layout(&dev->client->osdc, layout, snapid,
+                            ofs, &len, &bno, req, ops);
+
+       ceph_osdc_build_request(req, ofs, &len,
+                               ops,
+                               snapc,
+                               &mtime,
+                               req->r_oid, req->r_oid_len);
+       up_read(&header->snap_rwsem);
+
+       ret = ceph_osdc_start_request(&dev->client->osdc, req, false);
+       if (ret < 0)
+               goto done_err;
+
+       if (!rbd_cb) {
+               ret = ceph_osdc_wait_request(&dev->client->osdc, req);
+               ceph_osdc_put_request(req);
+       }
+       return ret;
+
+done_err:
+       bio_chain_put(req_data->bio);
+       ceph_osdc_put_request(req);
+done_pages:
+       kfree(req_data);
+done:
+       if (rq)
+               blk_end_request(rq, ret, len);
+       return ret;
+}
+
+/*
+ * Ceph osd op callback
+ */
+static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
+{
+       struct rbd_request *req_data = req->r_priv;
+       struct ceph_osd_reply_head *replyhead;
+       struct ceph_osd_op *op;
+       __s32 rc;
+       u64 bytes;
+       int read_op;
+
+       /* parse reply */
+       replyhead = msg->front.iov_base;
+       WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
+       op = (void *)(replyhead + 1);
+       rc = le32_to_cpu(replyhead->result);
+       bytes = le64_to_cpu(op->extent.length);
+       read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);
+
+       dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
+
+       if (rc == -ENOENT && read_op) {
+               zero_bio_chain(req_data->bio, 0);
+               rc = 0;
+       } else if (rc == 0 && read_op && bytes < req_data->len) {
+               zero_bio_chain(req_data->bio, bytes);
+               bytes = req_data->len;
+       }
+
+       blk_end_request(req_data->rq, rc, bytes);
+
+       if (req_data->bio)
+               bio_chain_put(req_data->bio);
+
+       ceph_osdc_put_request(req);
+       kfree(req_data);
+}
+
+/*
+ * Do a synchronous ceph osd operation
+ */
+static int rbd_req_sync_op(struct rbd_device *dev,
+                          struct ceph_snap_context *snapc,
+                          u64 snapid,
+                          int opcode,
+                          int flags,
+                          struct ceph_osd_req_op *orig_ops,
+                          int num_reply,
+                          const char *obj,
+                          u64 ofs, u64 len,
+                          char *buf)
+{
+       int ret;
+       struct page **pages;
+       int num_pages;
+       struct ceph_osd_req_op *ops = orig_ops;
+       u32 payload_len;
+
+       num_pages = calc_pages_for(ofs , len);
+       pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+       if (IS_ERR(pages))
+               return PTR_ERR(pages);
+
+       if (!orig_ops) {
+               payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
+               ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
+               if (ret < 0)
+                       goto done;
+
+               if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
+                       ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
+                       if (ret < 0)
+                               goto done_ops;
+               }
+       }
+
+       ret = rbd_do_request(NULL, dev, snapc, snapid,
+                         obj, ofs, len, NULL,
+                         pages, num_pages,
+                         flags,
+                         ops,
+                         2,
+                         NULL);
+       if (ret < 0)
+               goto done_ops;
+
+       if ((flags & CEPH_OSD_FLAG_READ) && buf)
+               ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
+
+done_ops:
+       if (!orig_ops)
+               rbd_destroy_ops(ops);
+done:
+       ceph_release_page_vector(pages, num_pages);
+       return ret;
+}
+
+/*
+ * Do an asynchronous ceph osd operation
+ */
+static int rbd_do_op(struct request *rq,
+                    struct rbd_device *rbd_dev ,
+                    struct ceph_snap_context *snapc,
+                    u64 snapid,
+                    int opcode, int flags, int num_reply,
+                    u64 ofs, u64 len,
+                    struct bio *bio)
+{
+       char *seg_name;
+       u64 seg_ofs;
+       u64 seg_len;
+       int ret;
+       struct ceph_osd_req_op *ops;
+       u32 payload_len;
+
+       seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
+       if (!seg_name)
+               return -ENOMEM;
+
+       seg_len = rbd_get_segment(&rbd_dev->header,
+                                 rbd_dev->header.block_name,
+                                 ofs, len,
+                                 seg_name, &seg_ofs);
+
+       payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
+
+       ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
+       if (ret < 0)
+               goto done;
+
+       /* we've taken care of segment sizes earlier when we
+          cloned the bios. We should never have a segment
+          truncated at this point */
+       BUG_ON(seg_len < len);
+
+       ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
+                            seg_name, seg_ofs, seg_len,
+                            bio,
+                            NULL, 0,
+                            flags,
+                            ops,
+                            num_reply,
+                            rbd_req_cb);
+done:
+       kfree(seg_name);
+       return ret;
+}
+
+/*
+ * Request async osd write
+ */
+static int rbd_req_write(struct request *rq,
+                        struct rbd_device *rbd_dev,
+                        struct ceph_snap_context *snapc,
+                        u64 ofs, u64 len,
+                        struct bio *bio)
+{
+       return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
+                        CEPH_OSD_OP_WRITE,
+                        CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+                        2,
+                        ofs, len, bio);
+}
+
+/*
+ * Request async osd read
+ */
+static int rbd_req_read(struct request *rq,
+                        struct rbd_device *rbd_dev,
+                        u64 snapid,
+                        u64 ofs, u64 len,
+                        struct bio *bio)
+{
+       return rbd_do_op(rq, rbd_dev, NULL,
+                        (snapid ? snapid : CEPH_NOSNAP),
+                        CEPH_OSD_OP_READ,
+                        CEPH_OSD_FLAG_READ,
+                        2,
+                        ofs, len, bio);
+}
+
+/*
+ * Request sync osd read
+ */
+static int rbd_req_sync_read(struct rbd_device *dev,
+                         struct ceph_snap_context *snapc,
+                         u64 snapid,
+                         const char *obj,
+                         u64 ofs, u64 len,
+                         char *buf)
+{
+       return rbd_req_sync_op(dev, NULL,
+                              (snapid ? snapid : CEPH_NOSNAP),
+                              CEPH_OSD_OP_READ,
+                              CEPH_OSD_FLAG_READ,
+                              NULL,
+                              1, obj, ofs, len, buf);
+}
+
+/*
+ * Request sync osd read
+ */
+static int rbd_req_sync_rollback_obj(struct rbd_device *dev,
+                                    u64 snapid,
+                                    const char *obj)
+{
+       struct ceph_osd_req_op *ops;
+       int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_ROLLBACK, 0);
+       if (ret < 0)
+               return ret;
+
+       ops[0].snap.snapid = snapid;
+
+       ret = rbd_req_sync_op(dev, NULL,
+                              CEPH_NOSNAP,
+                              0,
+                              CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+                              ops,
+                              1, obj, 0, 0, NULL);
+
+       rbd_destroy_ops(ops);
+
+       if (ret < 0)
+               return ret;
+
+       return ret;
+}
+
+/*
+ * Request sync osd read
+ */
+static int rbd_req_sync_exec(struct rbd_device *dev,
+                            const char *obj,
+                            const char *cls,
+                            const char *method,
+                            const char *data,
+                            int len)
+{
+       struct ceph_osd_req_op *ops;
+       int cls_len = strlen(cls);
+       int method_len = strlen(method);
+       int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
+                                   cls_len + method_len + len);
+       if (ret < 0)
+               return ret;
+
+       ops[0].cls.class_name = cls;
+       ops[0].cls.class_len = (__u8)cls_len;
+       ops[0].cls.method_name = method;
+       ops[0].cls.method_len = (__u8)method_len;
+       ops[0].cls.argc = 0;
+       ops[0].cls.indata = data;
+       ops[0].cls.indata_len = len;
+
+       ret = rbd_req_sync_op(dev, NULL,
+                              CEPH_NOSNAP,
+                              0,
+                              CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+                              ops,
+                              1, obj, 0, 0, NULL);
+
+       rbd_destroy_ops(ops);
+
+       dout("cls_exec returned %d\n", ret);
+       return ret;
+}
+
+/*
+ * block device queue callback
+ */
+static void rbd_rq_fn(struct request_queue *q)
+{
+       struct rbd_device *rbd_dev = q->queuedata;
+       struct request *rq;
+       struct bio_pair *bp = NULL;
+
+       rq = blk_fetch_request(q);
+
+       while (1) {
+               struct bio *bio;
+               struct bio *rq_bio, *next_bio = NULL;
+               bool do_write;
+               int size, op_size = 0;
+               u64 ofs;
+
+               /* peek at request from block layer */
+               if (!rq)
+                       break;
+
+               dout("fetched request\n");
+
+               /* filter out block requests we don't understand */
+               if ((rq->cmd_type != REQ_TYPE_FS)) {
+                       __blk_end_request_all(rq, 0);
+                       goto next;
+               }
+
+               /* deduce our operation (read, write) */
+               do_write = (rq_data_dir(rq) == WRITE);
+
+               size = blk_rq_bytes(rq);
+               ofs = blk_rq_pos(rq) * 512ULL;
+               rq_bio = rq->bio;
+               if (do_write && rbd_dev->read_only) {
+                       __blk_end_request_all(rq, -EROFS);
+                       goto next;
+               }
+
+               spin_unlock_irq(q->queue_lock);
+
+               dout("%s 0x%x bytes at 0x%llx\n",
+                    do_write ? "write" : "read",
+                    size, blk_rq_pos(rq) * 512ULL);
+
+               do {
+                       /* a bio clone to be passed down to OSD req */
+                       dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
+                       op_size = rbd_get_segment(&rbd_dev->header,
+                                                 rbd_dev->header.block_name,
+                                                 ofs, size,
+                                                 NULL, NULL);
+                       bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
+                                             op_size, GFP_ATOMIC);
+                       if (!bio) {
+                               spin_lock_irq(q->queue_lock);
+                               __blk_end_request_all(rq, -ENOMEM);
+                               goto next;
+                       }
+
+                       /* init OSD command: write or read */
+                       if (do_write)
+                               rbd_req_write(rq, rbd_dev,
+                                             rbd_dev->header.snapc,
+                                             ofs,
+                                             op_size, bio);
+                       else
+                               rbd_req_read(rq, rbd_dev,
+                                            cur_snap_id(rbd_dev),
+                                            ofs,
+                                            op_size, bio);
+
+                       size -= op_size;
+                       ofs += op_size;
+
+                       rq_bio = next_bio;
+               } while (size > 0);
+
+               if (bp)
+                       bio_pair_release(bp);
+
+               spin_lock_irq(q->queue_lock);
+next:
+               rq = blk_fetch_request(q);
+       }
+}
+
+/*
+ * a queue callback. Makes sure that we don't create a bio that spans across
+ * multiple osd objects. One exception would be with a single page bios,
+ * which we handle later at bio_chain_clone
+ */
+static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
+                         struct bio_vec *bvec)
+{
+       struct rbd_device *rbd_dev = q->queuedata;
+       unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
+       sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
+       unsigned int bio_sectors = bmd->bi_size >> 9;
+       int max;
+
+       max =  (chunk_sectors - ((sector & (chunk_sectors - 1))
+                                + bio_sectors)) << 9;
+       if (max < 0)
+               max = 0; /* bio_add cannot handle a negative return */
+       if (max <= bvec->bv_len && bio_sectors == 0)
+               return bvec->bv_len;
+       return max;
+}
+
+static void rbd_free_disk(struct rbd_device *rbd_dev)
+{
+       struct gendisk *disk = rbd_dev->disk;
+
+       if (!disk)
+               return;
+
+       rbd_header_free(&rbd_dev->header);
+
+       if (disk->flags & GENHD_FL_UP)
+               del_gendisk(disk);
+       if (disk->queue)
+               blk_cleanup_queue(disk->queue);
+       put_disk(disk);
+}
+
+/*
+ * reload the ondisk the header 
+ */
+static int rbd_read_header(struct rbd_device *rbd_dev,
+                          struct rbd_image_header *header)
+{
+       ssize_t rc;
+       struct rbd_image_header_ondisk *dh;
+       int snap_count = 0;
+       u64 snap_names_len = 0;
+
+       while (1) {
+               int len = sizeof(*dh) +
+                         snap_count * sizeof(struct rbd_image_snap_ondisk) +
+                         snap_names_len;
+
+               rc = -ENOMEM;
+               dh = kmalloc(len, GFP_KERNEL);
+               if (!dh)
+                       return -ENOMEM;
+
+               rc = rbd_req_sync_read(rbd_dev,
+                                      NULL, CEPH_NOSNAP,
+                                      rbd_dev->obj_md_name,
+                                      0, len,
+                                      (char *)dh);
+               if (rc < 0)
+                       goto out_dh;
+
+               rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
+               if (rc < 0)
+                       goto out_dh;
+
+               if (snap_count != header->total_snaps) {
+                       snap_count = header->total_snaps;
+                       snap_names_len = header->snap_names_len;
+                       rbd_header_free(header);
+                       kfree(dh);
+                       continue;
+               }
+               break;
+       }
+
+out_dh:
+       kfree(dh);
+       return rc;
+}
+
+/*
+ * create a snapshot
+ */
+static int rbd_header_add_snap(struct rbd_device *dev,
+                              const char *snap_name,
+                              gfp_t gfp_flags)
+{
+       int name_len = strlen(snap_name);
+       u64 new_snapid;
+       int ret;
+       void *data, *data_start, *data_end;
+
+       /* we should create a snapshot only if we're pointing at the head */
+       if (dev->cur_snap)
+               return -EINVAL;
+
+       ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid,
+                                     &new_snapid);
+       dout("created snapid=%lld\n", new_snapid);
+       if (ret < 0)
+               return ret;
+
+       data = kmalloc(name_len + 16, gfp_flags);
+       if (!data)
+               return -ENOMEM;
+
+       data_start = data;
+       data_end = data + name_len + 16;
+
+       ceph_encode_string_safe(&data, data_end, snap_name, name_len, bad);
+       ceph_encode_64_safe(&data, data_end, new_snapid, bad);
+
+       ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
+                               data_start, data - data_start);
+
+       kfree(data_start);
+
+       if (ret < 0)
+               return ret;
+
+       dev->header.snapc->seq =  new_snapid;
+
+       return 0;
+bad:
+       return -ERANGE;
+}
+
+/*
+ * only read the first part of the ondisk header, without the snaps info
+ */
+static int rbd_update_snaps(struct rbd_device *rbd_dev)
+{
+       int ret;
+       struct rbd_image_header h;
+       u64 snap_seq;
+
+       ret = rbd_read_header(rbd_dev, &h);
+       if (ret < 0)
+               return ret;
+
+       down_write(&rbd_dev->header.snap_rwsem);
+
+       snap_seq = rbd_dev->header.snapc->seq;
+
+       kfree(rbd_dev->header.snapc);
+       kfree(rbd_dev->header.snap_names);
+       kfree(rbd_dev->header.snap_sizes);
+
+       rbd_dev->header.total_snaps = h.total_snaps;
+       rbd_dev->header.snapc = h.snapc;
+       rbd_dev->header.snap_names = h.snap_names;
+       rbd_dev->header.snap_sizes = h.snap_sizes;
+       rbd_dev->header.snapc->seq = snap_seq;
+
+       up_write(&rbd_dev->header.snap_rwsem);
+
+       return 0;
+}
+
+static int rbd_init_disk(struct rbd_device *rbd_dev)
+{
+       struct gendisk *disk;
+       struct request_queue *q;
+       int rc;
+       u64 total_size = 0;
+
+       /* contact OSD, request size info about the object being mapped */
+       rc = rbd_read_header(rbd_dev, &rbd_dev->header);
+       if (rc)
+               return rc;
+
+       rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size);
+       if (rc)
+               return rc;
+
+       /* create gendisk info */
+       rc = -ENOMEM;
+       disk = alloc_disk(RBD_MINORS_PER_MAJOR);
+       if (!disk)
+               goto out;
+
+       sprintf(disk->disk_name, DRV_NAME "%d", rbd_dev->id);
+       disk->major = rbd_dev->major;
+       disk->first_minor = 0;
+       disk->fops = &rbd_bd_ops;
+       disk->private_data = rbd_dev;
+
+       /* init rq */
+       rc = -ENOMEM;
+       q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
+       if (!q)
+               goto out_disk;
+       blk_queue_merge_bvec(q, rbd_merge_bvec);
+       disk->queue = q;
+
+       q->queuedata = rbd_dev;
+
+       rbd_dev->disk = disk;
+       rbd_dev->q = q;
+
+       /* finally, announce the disk to the world */
+       set_capacity(disk, total_size / 512ULL);
+       add_disk(disk);
+
+       pr_info("%s: added with size 0x%llx\n",
+               disk->disk_name, (unsigned long long)total_size);
+       return 0;
+
+out_disk:
+       put_disk(disk);
+out:
+       return rc;
+}
+
+/********************************************************************
+ * /sys/class/rbd/
+ *                   add       map rados objects to blkdev
+ *                   remove    unmap rados objects
+ *                   list      show mappings
+ *******************************************************************/
+
+static void class_rbd_release(struct class *cls)
+{
+       kfree(cls);
+}
+
+static ssize_t class_rbd_list(struct class *c,
+                             struct class_attribute *attr,
+                             char *data)
+{
+       int n = 0;
+       struct list_head *tmp;
+       int max = PAGE_SIZE;
+
+       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+       n += snprintf(data, max,
+                     "#id\tmajor\tclient_name\tpool\tname\tsnap\tKB\n");
+
+       list_for_each(tmp, &rbd_dev_list) {
+               struct rbd_device *rbd_dev;
+
+               rbd_dev = list_entry(tmp, struct rbd_device, node);
+               n += snprintf(data+n, max-n,
+                             "%d\t%d\tclient%lld\t%s\t%s\t%s\t%lld\n",
+                             rbd_dev->id,
+                             rbd_dev->major,
+                             ceph_client_id(rbd_dev->client),
+                             rbd_dev->pool_name,
+                             rbd_dev->obj, rbd_dev->snap_name,
+                             rbd_dev->header.image_size >> 10);
+               if (n == max)
+                       break;
+       }
+
+       mutex_unlock(&ctl_mutex);
+       return n;
+}
+
+static ssize_t class_rbd_add(struct class *c,
+                            struct class_attribute *attr,
+                            const char *buf, size_t count)
+{
+       struct ceph_osd_client *osdc;
+       struct rbd_device *rbd_dev;
+       ssize_t rc = -ENOMEM;
+       int irc, new_id = 0;
+       struct list_head *tmp;
+       char *mon_dev_name;
+       char *options;
+
+       if (!try_module_get(THIS_MODULE))
+               return -ENODEV;
+
+       mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
+       if (!mon_dev_name)
+               goto err_out_mod;
+
+       options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
+       if (!options)
+               goto err_mon_dev;
+
+       /* new rbd_device object */
+       rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
+       if (!rbd_dev)
+               goto err_out_opt;
+
+       /* static rbd_device initialization */
+       spin_lock_init(&rbd_dev->lock);
+       INIT_LIST_HEAD(&rbd_dev->node);
+
+       /* generate unique id: find highest unique id, add one */
+       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+       list_for_each(tmp, &rbd_dev_list) {
+               struct rbd_device *rbd_dev;
+
+               rbd_dev = list_entry(tmp, struct rbd_device, node);
+               if (rbd_dev->id >= new_id)
+                       new_id = rbd_dev->id + 1;
+       }
+
+       rbd_dev->id = new_id;
+
+       /* add to global list */
+       list_add_tail(&rbd_dev->node, &rbd_dev_list);
+
+       /* parse add command */
+       if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s "
+                  "%" __stringify(RBD_MAX_OPT_LEN) "s "
+                  "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s "
+                  "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s"
+                  "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
+                  mon_dev_name, options, rbd_dev->pool_name,
+                  rbd_dev->obj, rbd_dev->snap_name) < 4) {
+               rc = -EINVAL;
+               goto err_out_slot;
+       }
+
+       if (rbd_dev->snap_name[0] == 0)
+               rbd_dev->snap_name[0] = '-';
+
+       rbd_dev->obj_len = strlen(rbd_dev->obj);
+       snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
+                rbd_dev->obj, RBD_SUFFIX);
+
+       /* initialize rest of new object */
+       snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id);
+       rc = rbd_get_client(rbd_dev, mon_dev_name, options);
+       if (rc < 0)
+               goto err_out_slot;
+
+       mutex_unlock(&ctl_mutex);
+
+       /* pick the pool */
+       osdc = &rbd_dev->client->osdc;
+       rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
+       if (rc < 0)
+               goto err_out_client;
+       rbd_dev->poolid = rc;
+
+       /* register our block device */
+       irc = register_blkdev(0, rbd_dev->name);
+       if (irc < 0) {
+               rc = irc;
+               goto err_out_client;
+       }
+       rbd_dev->major = irc;
+
+       /* set up and announce blkdev mapping */
+       rc = rbd_init_disk(rbd_dev);
+       if (rc)
+               goto err_out_blkdev;
+
+       return count;
+
+err_out_blkdev:
+       unregister_blkdev(rbd_dev->major, rbd_dev->name);
+err_out_client:
+       rbd_put_client(rbd_dev);
+       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+err_out_slot:
+       list_del_init(&rbd_dev->node);
+       mutex_unlock(&ctl_mutex);
+
+       kfree(rbd_dev);
+err_out_opt:
+       kfree(options);
+err_mon_dev:
+       kfree(mon_dev_name);
+err_out_mod:
+       dout("Error adding device %s\n", buf);
+       module_put(THIS_MODULE);
+       return rc;
+}
+
+static struct rbd_device *__rbd_get_dev(unsigned long id)
+{
+       struct list_head *tmp;
+       struct rbd_device *rbd_dev;
+
+       list_for_each(tmp, &rbd_dev_list) {
+               rbd_dev = list_entry(tmp, struct rbd_device, node);
+               if (rbd_dev->id == id)
+                       return rbd_dev;
+       }
+       return NULL;
+}
+
+static ssize_t class_rbd_remove(struct class *c,
+                               struct class_attribute *attr,
+                               const char *buf,
+                               size_t count)
+{
+       struct rbd_device *rbd_dev = NULL;
+       int target_id, rc;
+       unsigned long ul;
+
+       rc = strict_strtoul(buf, 10, &ul);
+       if (rc)
+               return rc;
+
+       /* convert to int; abort if we lost anything in the conversion */
+       target_id = (int) ul;
+       if (target_id != ul)
+               return -EINVAL;
+
+       /* remove object from list immediately */
+       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+       rbd_dev = __rbd_get_dev(target_id);
+       if (rbd_dev)
+               list_del_init(&rbd_dev->node);
+
+       mutex_unlock(&ctl_mutex);
+
+       if (!rbd_dev)
+               return -ENOENT;
+
+       rbd_put_client(rbd_dev);
+
+       /* clean up and free blkdev */
+       rbd_free_disk(rbd_dev);
+       unregister_blkdev(rbd_dev->major, rbd_dev->name);
+       kfree(rbd_dev);
+
+       /* release module ref */
+       module_put(THIS_MODULE);
+
+       return count;
+}
+
+static ssize_t class_rbd_snaps_list(struct class *c,
+                             struct class_attribute *attr,
+                             char *data)
+{
+       struct rbd_device *rbd_dev = NULL;
+       struct list_head *tmp;
+       struct rbd_image_header *header;
+       int i, n = 0, max = PAGE_SIZE;
+       int ret;
+
+       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+       n += snprintf(data, max, "#id\tsnap\tKB\n");
+
+       list_for_each(tmp, &rbd_dev_list) {
+               char *names, *p;
+               struct ceph_snap_context *snapc;
+
+               rbd_dev = list_entry(tmp, struct rbd_device, node);
+               header = &rbd_dev->header;
+
+               down_read(&header->snap_rwsem);
+
+               names = header->snap_names;
+               snapc = header->snapc;
+
+               n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n",
+                             rbd_dev->id, RBD_SNAP_HEAD_NAME,
+                             header->image_size >> 10,
+                             (!rbd_dev->cur_snap ? " (*)" : ""));
+               if (n == max)
+                       break;
+
+               p = names;
+               for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
+                       n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n",
+                             rbd_dev->id, p, header->snap_sizes[i] >> 10,
+                             (rbd_dev->cur_snap &&
+                              (snap_index(header, i) == rbd_dev->cur_snap) ?
+                              " (*)" : ""));
+                       if (n == max)
+                               break;
+               }
+
+               up_read(&header->snap_rwsem);
+       }
+
+
+       ret = n;
+       mutex_unlock(&ctl_mutex);
+       return ret;
+}
+
+static ssize_t class_rbd_snaps_refresh(struct class *c,
+                               struct class_attribute *attr,
+                               const char *buf,
+                               size_t count)
+{
+       struct rbd_device *rbd_dev = NULL;
+       int target_id, rc;
+       unsigned long ul;
+       int ret = count;
+
+       rc = strict_strtoul(buf, 10, &ul);
+       if (rc)
+               return rc;
+
+       /* convert to int; abort if we lost anything in the conversion */
+       target_id = (int) ul;
+       if (target_id != ul)
+               return -EINVAL;
+
+       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+       rbd_dev = __rbd_get_dev(target_id);
+       if (!rbd_dev) {
+               ret = -ENOENT;
+               goto done;
+       }
+
+       rc = rbd_update_snaps(rbd_dev);
+       if (rc < 0)
+               ret = rc;
+
+done:
+       mutex_unlock(&ctl_mutex);
+       return ret;
+}
+
+static ssize_t class_rbd_snap_create(struct class *c,
+                               struct class_attribute *attr,
+                               const char *buf,
+                               size_t count)
+{
+       struct rbd_device *rbd_dev = NULL;
+       int target_id, ret;
+       char *name;
+
+       name = kmalloc(RBD_MAX_SNAP_NAME_LEN + 1, GFP_KERNEL);
+       if (!name)
+               return -ENOMEM;
+
+       /* parse snaps add command */
+       if (sscanf(buf, "%d "
+                  "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
+                  &target_id,
+                  name) != 2) {
+               ret = -EINVAL;
+               goto done;
+       }
+
+       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+       rbd_dev = __rbd_get_dev(target_id);
+       if (!rbd_dev) {
+               ret = -ENOENT;
+               goto done_unlock;
+       }
+
+       ret = rbd_header_add_snap(rbd_dev,
+                                 name, GFP_KERNEL);
+       if (ret < 0)
+               goto done_unlock;
+
+       ret = rbd_update_snaps(rbd_dev);
+       if (ret < 0)
+               goto done_unlock;
+
+       ret = count;
+done_unlock:
+       mutex_unlock(&ctl_mutex);
+done:
+       kfree(name);
+       return ret;
+}
+
+static ssize_t class_rbd_rollback(struct class *c,
+                               struct class_attribute *attr,
+                               const char *buf,
+                               size_t count)
+{
+       struct rbd_device *rbd_dev = NULL;
+       int target_id, ret;
+       u64 snapid;
+       char snap_name[RBD_MAX_SNAP_NAME_LEN];
+       u64 cur_ofs;
+       char *seg_name;
+
+       /* parse snaps add command */
+       if (sscanf(buf, "%d "
+                  "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
+                  &target_id,
+                  snap_name) != 2) {
+               return -EINVAL;
+       }
+
+       ret = -ENOMEM;
+       seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
+       if (!seg_name)
+               return ret;
+
+       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+       rbd_dev = __rbd_get_dev(target_id);
+       if (!rbd_dev) {
+               ret = -ENOENT;
+               goto done_unlock;
+       }
+
+       ret = snap_by_name(&rbd_dev->header, snap_name, &snapid, NULL);
+       if (ret < 0)
+               goto done_unlock;
+
+       dout("snapid=%lld\n", snapid);
+
+       cur_ofs = 0;
+       while (cur_ofs < rbd_dev->header.image_size) {
+               cur_ofs += rbd_get_segment(&rbd_dev->header,
+                                          rbd_dev->obj,
+                                          cur_ofs, (u64)-1,
+                                          seg_name, NULL);
+               dout("seg_name=%s\n", seg_name);
+
+               ret = rbd_req_sync_rollback_obj(rbd_dev, snapid, seg_name);
+               if (ret < 0)
+                       pr_warning("could not roll back obj %s err=%d\n",
+                                  seg_name, ret);
+       }
+
+       ret = rbd_update_snaps(rbd_dev);
+       if (ret < 0)
+               goto done_unlock;
+
+       ret = count;
+
+done_unlock:
+       mutex_unlock(&ctl_mutex);
+       kfree(seg_name);
+
+       return ret;
+}
+
+static struct class_attribute class_rbd_attrs[] = {
+       __ATTR(add,             0200, NULL, class_rbd_add),
+       __ATTR(remove,          0200, NULL, class_rbd_remove),
+       __ATTR(list,            0444, class_rbd_list, NULL),
+       __ATTR(snaps_refresh,   0200, NULL, class_rbd_snaps_refresh),
+       __ATTR(snap_create,     0200, NULL, class_rbd_snap_create),
+       __ATTR(snaps_list,      0444, class_rbd_snaps_list, NULL),
+       __ATTR(snap_rollback,   0200, NULL, class_rbd_rollback),
+       __ATTR_NULL
+};
+
+/*
+ * create control files in sysfs
+ * /sys/class/rbd/...
+ */
+static int rbd_sysfs_init(void)
+{
+       int ret = -ENOMEM;
+
+       class_rbd = kzalloc(sizeof(*class_rbd), GFP_KERNEL);
+       if (!class_rbd)
+               goto out;
+
+       class_rbd->name = DRV_NAME;
+       class_rbd->owner = THIS_MODULE;
+       class_rbd->class_release = class_rbd_release;
+       class_rbd->class_attrs = class_rbd_attrs;
+
+       ret = class_register(class_rbd);
+       if (ret)
+               goto out_class;
+       return 0;
+
+out_class:
+       kfree(class_rbd);
+       class_rbd = NULL;
+       pr_err(DRV_NAME ": failed to create class rbd\n");
+out:
+       return ret;
+}
+
+static void rbd_sysfs_cleanup(void)
+{
+       if (class_rbd)
+               class_destroy(class_rbd);
+       class_rbd = NULL;
+}
+
+int __init rbd_init(void)
+{
+       int rc;
+
+       rc = rbd_sysfs_init();
+       if (rc)
+               return rc;
+       spin_lock_init(&node_lock);
+       pr_info("loaded " DRV_NAME_LONG "\n");
+       return 0;
+}
+
+void __exit rbd_exit(void)
+{
+       rbd_sysfs_cleanup();
+}
+
+module_init(rbd_init);
+module_exit(rbd_exit);
+
+MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
+MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
+MODULE_DESCRIPTION("rados block device");
+
+/* following authorship retained from original osdblk.c */
+MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h

new file mode 100644 (file)

index 0000000..fc6c678
--- /dev/null
+++ b/drivers/block/rbd_types.h
@@ -0,0 +1,73 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2010 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_RBD_TYPES_H
+#define CEPH_RBD_TYPES_H
+
+#include <linux/types.h>
+
+/*
+ * rbd image 'foo' consists of objects
+ *   foo.rbd      - image metadata
+ *   foo.00000000
+ *   foo.00000001
+ *   ...          - data
+ */
+
+#define RBD_SUFFIX             ".rbd"
+#define RBD_DIRECTORY           "rbd_directory"
+#define RBD_INFO                "rbd_info"
+
+#define RBD_DEFAULT_OBJ_ORDER  22   /* 4MB */
+#define RBD_MIN_OBJ_ORDER       16
+#define RBD_MAX_OBJ_ORDER       30
+
+#define RBD_MAX_OBJ_NAME_LEN   96
+#define RBD_MAX_SEG_NAME_LEN   128
+
+#define RBD_COMP_NONE          0
+#define RBD_CRYPT_NONE         0
+
+#define RBD_HEADER_TEXT                "<<< Rados Block Device Image >>>\n"
+#define RBD_HEADER_SIGNATURE   "RBD"
+#define RBD_HEADER_VERSION     "001.005"
+
+struct rbd_info {
+       __le64 max_id;
+} __attribute__ ((packed));
+
+struct rbd_image_snap_ondisk {
+       __le64 id;
+       __le64 image_size;
+} __attribute__((packed));
+
+struct rbd_image_header_ondisk {
+       char text[40];
+       char block_name[24];
+       char signature[4];
+       char version[8];
+       struct {
+               __u8 order;
+               __u8 crypt_type;
+               __u8 comp_type;
+               __u8 unused;
+       } __attribute__((packed)) options;
+       __le64 image_size;
+       __le64 snap_seq;
+       __le32 snap_count;
+       __le32 reserved;
+       __le64 snap_names_len;
+       struct rbd_image_snap_ondisk snaps[0];
+} __attribute__((packed));
+
+
+#endif
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c

index 1101e251a629343b0715b4b02a9b664828e1ef3e..8320490226b78145f95c7e7801a15080419adc19 100644 (file)
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -2,7 +2,6 @@
  #include <linux/spinlock.h>
  #include <linux/slab.h>
  #include <linux/blkdev.h>
-#include <linux/smp_lock.h>
  #include <linux/hdreg.h>
  #include <linux/virtio.h>
  #include <linux/virtio_blk.h>
@@ -222,8 +221,8 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
         return err;
  }
  
-static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
-                        unsigned cmd, unsigned long data)
+static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
+                            unsigned int cmd, unsigned long data)
  {
         struct gendisk *disk = bdev->bd_disk;
         struct virtio_blk *vblk = disk->private_data;
@@ -238,18 +237,6 @@ static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
                               (void __user *)data);
  }
  
-static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
-                            unsigned int cmd, unsigned long param)
-{
-       int ret;
-
-       lock_kernel();
-       ret = virtblk_locked_ioctl(bdev, mode, cmd, param);
-       unlock_kernel();
-
-       return ret;
-}
-
  /* We provide getgeo only to please some old bootloader/partitioning tools */
  static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
  {
diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c

index 70312da4c968f9e4af7c39e9a5a649b4952f8bd3..564808a5c3c06c36434791bafb95c22472c8920a 100644 (file)
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -199,7 +199,7 @@ static void amd64_cleanup(void)
                 struct pci_dev *dev = k8_northbridges[i];
                 /* disable gart translation */
                 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &tmp);
-               tmp &= ~AMD64_GARTEN;
+               tmp &= ~GARTEN;
                 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, tmp);
         }
  }
@@ -313,7 +313,7 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp,
         if (order < 0 || !agp_aperture_valid(aper, (32*1024*1024)<<order))
                 return -1;
  
-       pci_write_config_dword(nb, AMD64_GARTAPERTURECTL, order << 1);
+       gart_set_size_and_enable(nb, order);
         pci_write_config_dword(nb, AMD64_GARTAPERTUREBASE, aper >> 25);
  
         return 0;
diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c

index d2abf51439836383fd9b03612a44bdf1779448fa..64255cef8a7db93f780c700d712c5fa4b7fe5907 100644 (file)
--- a/drivers/char/agp/generic.c
+++ b/drivers/char/agp/generic.c
@@ -984,7 +984,9 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge)
  
         bridge->driver->cache_flush();
  #ifdef CONFIG_X86
-       set_memory_uc((unsigned long)table, 1 << page_order);
+       if (set_memory_uc((unsigned long)table, 1 << page_order))
+               printk(KERN_WARNING "Could not set GATT table memory to UC!");
+
         bridge->gatt_table = (void *)table;
  #else
         bridge->gatt_table = ioremap_nocache(virt_to_phys(table),
diff --git a/drivers/char/tpm/tpm.c b/drivers/char/tpm/tpm.c

index 05ad4a17a28f238ecc197e67051f60a50faad769..7c4133582dbae484f449d49f97b2ee0f179a9891 100644 (file)
--- a/drivers/char/tpm/tpm.c
+++ b/drivers/char/tpm/tpm.c
@@ -47,6 +47,16 @@ enum tpm_duration {
  #define TPM_MAX_PROTECTED_ORDINAL 12
  #define TPM_PROTECTED_ORDINAL_MASK 0xFF
  
+/*
+ * Bug workaround - some TPM's don't flush the most
+ * recently changed pcr on suspend, so force the flush
+ * with an extend to the selected _unused_ non-volatile pcr.
+ */
+static int tpm_suspend_pcr;
+module_param_named(suspend_pcr, tpm_suspend_pcr, uint, 0644);
+MODULE_PARM_DESC(suspend_pcr,
+                "PCR to use for dummy writes to faciltate flush on suspend.");
+
  static LIST_HEAD(tpm_chip_list);
  static DEFINE_SPINLOCK(driver_lock);
  static DECLARE_BITMAP(dev_mask, TPM_NUM_DEVICES);
@@ -1077,18 +1087,6 @@ static struct tpm_input_header savestate_header = {
         .ordinal = TPM_ORD_SAVESTATE
  };
  
-/* Bug workaround - some TPM's don't flush the most
- * recently changed pcr on suspend, so force the flush
- * with an extend to the selected _unused_ non-volatile pcr.
- */
-static int tpm_suspend_pcr;
-static int __init tpm_suspend_setup(char *str)
-{
-       get_option(&str, &tpm_suspend_pcr);
-       return 1;
-}
-__setup("tpm_suspend_pcr=", tpm_suspend_setup);
-
  /*
   * We are about to suspend. Save the TPM state
   * so that it can be restored.
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c

index c810481a5bc23ae3ca57127729c83e454d681534..6c1b676643a9ef43fe3984bb8495996960cf4337 100644 (file)
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -48,6 +48,9 @@ struct ports_driver_data {
         /* Used for exporting per-port information to debugfs */
         struct dentry *debugfs_dir;
  
+       /* List of all the devices we're handling */
+       struct list_head portdevs;
+
         /* Number of devices this driver is handling */
         unsigned int index;
  
@@ -108,6 +111,9 @@ struct port_buffer {
   * ports for that device (vdev->priv).
   */
  struct ports_device {
+       /* Next portdev in the list, head is in the pdrvdata struct */
+       struct list_head list;
+
         /*
          * Workqueue handlers where we process deferred work after
          * notification
@@ -178,15 +184,21 @@ struct port {
         struct console cons;
  
         /* Each port associates with a separate char device */
-       struct cdev cdev;
+       struct cdev *cdev;
         struct device *dev;
  
+       /* Reference-counting to handle port hot-unplugs and file operations */
+       struct kref kref;
+
         /* A waitqueue for poll() or blocking read operations */
         wait_queue_head_t waitqueue;
  
         /* The 'name' of the port that we expose via sysfs properties */
         char *name;
  
+       /* We can notify apps of host connect / disconnect events via SIGIO */
+       struct fasync_struct *async_queue;
+
         /* The 'id' to identify the port with the Host */
         u32 id;
  
@@ -221,6 +233,41 @@ out:
         return port;
  }
  
+static struct port *find_port_by_devt_in_portdev(struct ports_device *portdev,
+                                                dev_t dev)
+{
+       struct port *port;
+       unsigned long flags;
+
+       spin_lock_irqsave(&portdev->ports_lock, flags);
+       list_for_each_entry(port, &portdev->ports, list)
+               if (port->cdev->dev == dev)
+                       goto out;
+       port = NULL;
+out:
+       spin_unlock_irqrestore(&portdev->ports_lock, flags);
+
+       return port;
+}
+
+static struct port *find_port_by_devt(dev_t dev)
+{
+       struct ports_device *portdev;
+       struct port *port;
+       unsigned long flags;
+
+       spin_lock_irqsave(&pdrvdata_lock, flags);
+       list_for_each_entry(portdev, &pdrvdata.portdevs, list) {
+               port = find_port_by_devt_in_portdev(portdev, dev);
+               if (port)
+                       goto out;
+       }
+       port = NULL;
+out:
+       spin_unlock_irqrestore(&pdrvdata_lock, flags);
+       return port;
+}
+
  static struct port *find_port_by_id(struct ports_device *portdev, u32 id)
  {
         struct port *port;
@@ -410,7 +457,10 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id,
  static ssize_t send_control_msg(struct port *port, unsigned int event,
                                 unsigned int value)
  {
-       return __send_control_msg(port->portdev, port->id, event, value);
+       /* Did the port get unplugged before userspace closed it? */
+       if (port->portdev)
+               return __send_control_msg(port->portdev, port->id, event, value);
+       return 0;
  }
  
  /* Callers must take the port->outvq_lock */
@@ -459,9 +509,12 @@ static ssize_t send_buf(struct port *port, void *in_buf, size_t in_count,
  
         /*
          * Wait till the host acknowledges it pushed out the data we
-        * sent.  This is done for ports in blocking mode or for data
-        * from the hvc_console; the tty operations are performed with
-        * spinlocks held so we can't sleep here.
+        * sent.  This is done for data from the hvc_console; the tty
+        * operations are performed with spinlocks held so we can't
+        * sleep here.  An alternative would be to copy the data to a
+        * buffer and relax the spinning requirement.  The downside is
+        * we need to kmalloc a GFP_ATOMIC buffer each time the
+        * console driver writes something out.
          */
         while (!virtqueue_get_buf(out_vq, &len))
                 cpu_relax();
@@ -522,6 +575,10 @@ static ssize_t fill_readbuf(struct port *port, char *out_buf, size_t out_count,
  /* The condition that must be true for polling to end */
  static bool will_read_block(struct port *port)
  {
+       if (!port->guest_connected) {
+               /* Port got hot-unplugged. Let's exit. */
+               return false;
+       }
         return !port_has_data(port) && port->host_connected;
  }
  
@@ -572,6 +629,9 @@ static ssize_t port_fops_read(struct file *filp, char __user *ubuf,
                 if (ret < 0)
                         return ret;
         }
+       /* Port got hot-unplugged. */
+       if (!port->guest_connected)
+               return -ENODEV;
         /*
          * We could've received a disconnection message while we were
          * waiting for more data.
@@ -613,6 +673,9 @@ static ssize_t port_fops_write(struct file *filp, const char __user *ubuf,
                 if (ret < 0)
                         return ret;
         }
+       /* Port got hot-unplugged. */
+       if (!port->guest_connected)
+               return -ENODEV;
  
         count = min((size_t)(32 * 1024), count);
  
@@ -626,6 +689,14 @@ static ssize_t port_fops_write(struct file *filp, const char __user *ubuf,
                 goto free_buf;
         }
  
+       /*
+        * We now ask send_buf() to not spin for generic ports -- we
+        * can re-use the same code path that non-blocking file
+        * descriptors take for blocking file descriptors since the
+        * wait is already done and we're certain the write will go
+        * through to the host.
+        */
+       nonblock = true;
         ret = send_buf(port, buf, count, nonblock);
  
         if (nonblock && ret > 0)
@@ -645,6 +716,10 @@ static unsigned int port_fops_poll(struct file *filp, poll_table *wait)
         port = filp->private_data;
         poll_wait(filp, &port->waitqueue, wait);
  
+       if (!port->guest_connected) {
+               /* Port got unplugged */
+               return POLLHUP;
+       }
         ret = 0;
         if (!will_read_block(port))
                 ret |= POLLIN | POLLRDNORM;
@@ -656,6 +731,8 @@ static unsigned int port_fops_poll(struct file *filp, poll_table *wait)
         return ret;
  }
  
+static void remove_port(struct kref *kref);
+
  static int port_fops_release(struct inode *inode, struct file *filp)
  {
         struct port *port;
@@ -676,6 +753,16 @@ static int port_fops_release(struct inode *inode, struct file *filp)
         reclaim_consumed_buffers(port);
         spin_unlock_irq(&port->outvq_lock);
  
+       /*
+        * Locks aren't necessary here as a port can't be opened after
+        * unplug, and if a port isn't unplugged, a kref would already
+        * exist for the port.  Plus, taking ports_lock here would
+        * create a dependency on other locks taken by functions
+        * inside remove_port if we're the last holder of the port,
+        * creating many problems.
+        */
+       kref_put(&port->kref, remove_port);
+
         return 0;
  }
  
@@ -683,22 +770,31 @@ static int port_fops_open(struct inode *inode, struct file *filp)
  {
         struct cdev *cdev = inode->i_cdev;
         struct port *port;
+       int ret;
  
-       port = container_of(cdev, struct port, cdev);
+       port = find_port_by_devt(cdev->dev);
         filp->private_data = port;
  
+       /* Prevent against a port getting hot-unplugged at the same time */
+       spin_lock_irq(&port->portdev->ports_lock);
+       kref_get(&port->kref);
+       spin_unlock_irq(&port->portdev->ports_lock);
+
         /*
          * Don't allow opening of console port devices -- that's done
          * via /dev/hvc
          */
-       if (is_console_port(port))
-               return -ENXIO;
+       if (is_console_port(port)) {
+               ret = -ENXIO;
+               goto out;
+       }
  
         /* Allow only one process to open a particular port at a time */
         spin_lock_irq(&port->inbuf_lock);
         if (port->guest_connected) {
                 spin_unlock_irq(&port->inbuf_lock);
-               return -EMFILE;
+               ret = -EMFILE;
+               goto out;
         }
  
         port->guest_connected = true;
@@ -713,10 +809,23 @@ static int port_fops_open(struct inode *inode, struct file *filp)
         reclaim_consumed_buffers(port);
         spin_unlock_irq(&port->outvq_lock);
  
+       nonseekable_open(inode, filp);
+
         /* Notify host of port being opened */
         send_control_msg(filp->private_data, VIRTIO_CONSOLE_PORT_OPEN, 1);
  
         return 0;
+out:
+       kref_put(&port->kref, remove_port);
+       return ret;
+}
+
+static int port_fops_fasync(int fd, struct file *filp, int mode)
+{
+       struct port *port;
+
+       port = filp->private_data;
+       return fasync_helper(fd, filp, mode, &port->async_queue);
  }
  
  /*
@@ -732,6 +841,8 @@ static const struct file_operations port_fops = {
         .write = port_fops_write,
         .poll  = port_fops_poll,
         .release = port_fops_release,
+       .fasync = port_fops_fasync,
+       .llseek = no_llseek,
  };
  
  /*
@@ -990,6 +1101,12 @@ static unsigned int fill_queue(struct virtqueue *vq, spinlock_t *lock)
         return nr_added_bufs;
  }
  
+static void send_sigio_to_port(struct port *port)
+{
+       if (port->async_queue && port->guest_connected)
+               kill_fasync(&port->async_queue, SIGIO, POLL_OUT);
+}
+
  static int add_port(struct ports_device *portdev, u32 id)
  {
         char debugfs_name[16];
@@ -1004,6 +1121,7 @@ static int add_port(struct ports_device *portdev, u32 id)
                 err = -ENOMEM;
                 goto fail;
         }
+       kref_init(&port->kref);
  
         port->portdev = portdev;
         port->id = id;
@@ -1011,6 +1129,7 @@ static int add_port(struct ports_device *portdev, u32 id)
         port->name = NULL;
         port->inbuf = NULL;
         port->cons.hvc = NULL;
+       port->async_queue = NULL;
  
         port->cons.ws.ws_row = port->cons.ws.ws_col = 0;
  
@@ -1021,14 +1140,20 @@ static int add_port(struct ports_device *portdev, u32 id)
         port->in_vq = portdev->in_vqs[port->id];
         port->out_vq = portdev->out_vqs[port->id];
  
-       cdev_init(&port->cdev, &port_fops);
+       port->cdev = cdev_alloc();
+       if (!port->cdev) {
+               dev_err(&port->portdev->vdev->dev, "Error allocating cdev\n");
+               err = -ENOMEM;
+               goto free_port;
+       }
+       port->cdev->ops = &port_fops;
  
         devt = MKDEV(portdev->chr_major, id);
-       err = cdev_add(&port->cdev, devt, 1);
+       err = cdev_add(port->cdev, devt, 1);
         if (err < 0) {
                 dev_err(&port->portdev->vdev->dev,
                         "Error %d adding cdev for port %u\n", err, id);
-               goto free_port;
+               goto free_cdev;
         }
         port->dev = device_create(pdrvdata.class, &port->portdev->vdev->dev,
                                   devt, port, "vport%up%u",
@@ -1093,7 +1218,7 @@ free_inbufs:
  free_device:
         device_destroy(pdrvdata.class, port->dev->devt);
  free_cdev:
-       cdev_del(&port->cdev);
+       cdev_del(port->cdev);
  free_port:
         kfree(port);
  fail:
@@ -1102,21 +1227,45 @@ fail:
         return err;
  }
  
-/* Remove all port-specific data. */
-static int remove_port(struct port *port)
+/* No users remain, remove all port-specific data. */
+static void remove_port(struct kref *kref)
+{
+       struct port *port;
+
+       port = container_of(kref, struct port, kref);
+
+       sysfs_remove_group(&port->dev->kobj, &port_attribute_group);
+       device_destroy(pdrvdata.class, port->dev->devt);
+       cdev_del(port->cdev);
+
+       kfree(port->name);
+
+       debugfs_remove(port->debugfs_file);
+
+       kfree(port);
+}
+
+/*
+ * Port got unplugged.  Remove port from portdev's list and drop the
+ * kref reference.  If no userspace has this port opened, it will
+ * result in immediate removal the port.
+ */
+static void unplug_port(struct port *port)
  {
         struct port_buffer *buf;
  
+       spin_lock_irq(&port->portdev->ports_lock);
+       list_del(&port->list);
+       spin_unlock_irq(&port->portdev->ports_lock);
+
         if (port->guest_connected) {
                 port->guest_connected = false;
                 port->host_connected = false;
                 wake_up_interruptible(&port->waitqueue);
-               send_control_msg(port, VIRTIO_CONSOLE_PORT_OPEN, 0);
-       }
  
-       spin_lock_irq(&port->portdev->ports_lock);
-       list_del(&port->list);
-       spin_unlock_irq(&port->portdev->ports_lock);
+               /* Let the app know the port is going down. */
+               send_sigio_to_port(port);
+       }
  
         if (is_console_port(port)) {
                 spin_lock_irq(&pdrvdata_lock);
@@ -1135,9 +1284,6 @@ static int remove_port(struct port *port)
                 hvc_remove(port->cons.hvc);
  #endif
         }
-       sysfs_remove_group(&port->dev->kobj, &port_attribute_group);
-       device_destroy(pdrvdata.class, port->dev->devt);
-       cdev_del(&port->cdev);
  
         /* Remove unused data this port might have received. */
         discard_port_data(port);
@@ -1148,12 +1294,19 @@ static int remove_port(struct port *port)
         while ((buf = virtqueue_detach_unused_buf(port->in_vq)))
                 free_buf(buf);
  
-       kfree(port->name);
-
-       debugfs_remove(port->debugfs_file);
+       /*
+        * We should just assume the device itself has gone off --
+        * else a close on an open port later will try to send out a
+        * control message.
+        */
+       port->portdev = NULL;
  
-       kfree(port);
-       return 0;
+       /*
+        * Locks around here are not necessary - a port can't be
+        * opened after we removed the port struct from ports_list
+        * above.
+        */
+       kref_put(&port->kref, remove_port);
  }
  
  /* Any private messages that the Host and Guest want to share */
@@ -1192,7 +1345,7 @@ static void handle_control_message(struct ports_device *portdev,
                 add_port(portdev, cpkt->id);
                 break;
         case VIRTIO_CONSOLE_PORT_REMOVE:
-               remove_port(port);
+               unplug_port(port);
                 break;
         case VIRTIO_CONSOLE_CONSOLE_PORT:
                 if (!cpkt->value)
@@ -1234,6 +1387,12 @@ static void handle_control_message(struct ports_device *portdev,
                 spin_lock_irq(&port->outvq_lock);
                 reclaim_consumed_buffers(port);
                 spin_unlock_irq(&port->outvq_lock);
+
+               /*
+                * If the guest is connected, it'll be interested in
+                * knowing the host connection state changed.
+                */
+               send_sigio_to_port(port);
                 break;
         case VIRTIO_CONSOLE_PORT_NAME:
                 /*
@@ -1330,6 +1489,9 @@ static void in_intr(struct virtqueue *vq)
  
         wake_up_interruptible(&port->waitqueue);
  
+       /* Send a SIGIO indicating new data in case the process asked for it */
+       send_sigio_to_port(port);
+
         if (is_console_port(port) && hvc_poll(port->cons.hvc))
                 hvc_kick();
  }
@@ -1566,6 +1728,10 @@ static int __devinit virtcons_probe(struct virtio_device *vdev)
                 add_port(portdev, 0);
         }
  
+       spin_lock_irq(&pdrvdata_lock);
+       list_add_tail(&portdev->list, &pdrvdata.portdevs);
+       spin_unlock_irq(&pdrvdata_lock);
+
         __send_control_msg(portdev, VIRTIO_CONSOLE_BAD_ID,
                            VIRTIO_CONSOLE_DEVICE_READY, 1);
         return 0;
@@ -1589,23 +1755,41 @@ static void virtcons_remove(struct virtio_device *vdev)
  {
         struct ports_device *portdev;
         struct port *port, *port2;
-       struct port_buffer *buf;
-       unsigned int len;
  
         portdev = vdev->priv;
  
+       spin_lock_irq(&pdrvdata_lock);
+       list_del(&portdev->list);
+       spin_unlock_irq(&pdrvdata_lock);
+
+       /* Disable interrupts for vqs */
+       vdev->config->reset(vdev);
+       /* Finish up work that's lined up */
         cancel_work_sync(&portdev->control_work);
  
         list_for_each_entry_safe(port, port2, &portdev->ports, list)
-               remove_port(port);
+               unplug_port(port);
  
         unregister_chrdev(portdev->chr_major, "virtio-portsdev");
  
-       while ((buf = virtqueue_get_buf(portdev->c_ivq, &len)))
-               free_buf(buf);
+       /*
+        * When yanking out a device, we immediately lose the
+        * (device-side) queues.  So there's no point in keeping the
+        * guest side around till we drop our final reference.  This
+        * also means that any ports which are in an open state will
+        * have to just stop using the port, as the vqs are going
+        * away.
+        */
+       if (use_multiport(portdev)) {
+               struct port_buffer *buf;
+               unsigned int len;
  
-       while ((buf = virtqueue_detach_unused_buf(portdev->c_ivq)))
-               free_buf(buf);
+               while ((buf = virtqueue_get_buf(portdev->c_ivq, &len)))
+                       free_buf(buf);
+
+               while ((buf = virtqueue_detach_unused_buf(portdev->c_ivq)))
+                       free_buf(buf);
+       }
  
         vdev->config->del_vqs(vdev);
         kfree(portdev->in_vqs);
@@ -1652,6 +1836,7 @@ static int __init init(void)
                            PTR_ERR(pdrvdata.debugfs_dir));
         }
         INIT_LIST_HEAD(&pdrvdata.consoles);
+       INIT_LIST_HEAD(&pdrvdata.portdevs);
  
         return register_virtio_driver(&virtio_console);
  }
diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c

index 1b05896648bce47cff20ed04acbf5d1ee6175a8b..9dcb17d51aee737bcbb4ac478807c04311588d6c 100644 (file)
--- a/drivers/firewire/ohci.c
+++ b/drivers/firewire/ohci.c
@@ -2840,7 +2840,7 @@ static int __devinit pci_probe(struct pci_dev *dev,
                                const struct pci_device_id *ent)
  {
         struct fw_ohci *ohci;
-       u32 bus_options, max_receive, link_speed, version, link_enh;
+       u32 bus_options, max_receive, link_speed, version;
         u64 guid;
         int i, err, n_ir, n_it;
         size_t size;
@@ -2894,23 +2894,6 @@ static int __devinit pci_probe(struct pci_dev *dev,
         if (param_quirks)
                 ohci->quirks = param_quirks;
  
-       /* TI OHCI-Lynx and compatible: set recommended configuration bits. */
-       if (dev->vendor == PCI_VENDOR_ID_TI) {
-               pci_read_config_dword(dev, PCI_CFG_TI_LinkEnh, &link_enh);
-
-               /* adjust latency of ATx FIFO: use 1.7 KB threshold */
-               link_enh &= ~TI_LinkEnh_atx_thresh_mask;
-               link_enh |= TI_LinkEnh_atx_thresh_1_7K;
-
-               /* use priority arbitration for asynchronous responses */
-               link_enh |= TI_LinkEnh_enab_unfair;
-
-               /* required for aPhyEnhanceEnable to work */
-               link_enh |= TI_LinkEnh_enab_accel;
-
-               pci_write_config_dword(dev, PCI_CFG_TI_LinkEnh, link_enh);
-       }
-
         ar_context_init(&ohci->ar_request_ctx, ohci,
                         OHCI1394_AsReqRcvContextControlSet);
  
diff --git a/drivers/firewire/ohci.h b/drivers/firewire/ohci.h

index 0e6c5a466908d58156f4fe7dd978c469efb94aad..ef5e7336da68ddf6af413ecd0cc616a812ae1490 100644 (file)
--- a/drivers/firewire/ohci.h
+++ b/drivers/firewire/ohci.h
@@ -155,12 +155,4 @@
  
  #define OHCI1394_phy_tcode             0xe
  
-/* TI extensions */
-
-#define PCI_CFG_TI_LinkEnh             0xf4
-#define  TI_LinkEnh_enab_accel         0x00000002
-#define  TI_LinkEnh_enab_unfair                0x00000080
-#define  TI_LinkEnh_atx_thresh_mask    0x00003000
-#define  TI_LinkEnh_atx_thresh_1_7K    0x00001000
-
  #endif /* _FIREWIRE_OHCI_H */
diff --git a/drivers/gpu/drm/radeon/radeon_cursor.c b/drivers/gpu/drm/radeon/radeon_cursor.c

index 5731fc9b1ae3ae9274188a5bf8cdae7aa78f3b33..3eef567b0421ae71826abd77ac3bc035a5ec1c33 100644 (file)
--- a/drivers/gpu/drm/radeon/radeon_cursor.c
+++ b/drivers/gpu/drm/radeon/radeon_cursor.c
@@ -203,6 +203,7 @@ int radeon_crtc_cursor_move(struct drm_crtc *crtc,
         struct radeon_crtc *radeon_crtc = to_radeon_crtc(crtc);
         struct radeon_device *rdev = crtc->dev->dev_private;
         int xorigin = 0, yorigin = 0;
+       int w = radeon_crtc->cursor_width;
  
         if (x < 0)
                 xorigin = -x + 1;
@@ -213,22 +214,7 @@ int radeon_crtc_cursor_move(struct drm_crtc *crtc,
         if (yorigin >= CURSOR_HEIGHT)
                 yorigin = CURSOR_HEIGHT - 1;
  
-       radeon_lock_cursor(crtc, true);
-       if (ASIC_IS_DCE4(rdev)) {
-               /* cursors are offset into the total surface */
-               x += crtc->x;
-               y += crtc->y;
-               DRM_DEBUG("x %d y %d c->x %d c->y %d\n", x, y, crtc->x, crtc->y);
-
-               /* XXX: check if evergreen has the same issues as avivo chips */
-               WREG32(EVERGREEN_CUR_POSITION + radeon_crtc->crtc_offset,
-                      ((xorigin ? 0 : x) << 16) |
-                      (yorigin ? 0 : y));
-               WREG32(EVERGREEN_CUR_HOT_SPOT + radeon_crtc->crtc_offset, (xorigin << 16) | yorigin);
-               WREG32(EVERGREEN_CUR_SIZE + radeon_crtc->crtc_offset,
-                      ((radeon_crtc->cursor_width - 1) << 16) | (radeon_crtc->cursor_height - 1));
-       } else if (ASIC_IS_AVIVO(rdev)) {
-               int w = radeon_crtc->cursor_width;
+       if (ASIC_IS_AVIVO(rdev)) {
                 int i = 0;
                 struct drm_crtc *crtc_p;
  
@@ -260,7 +246,17 @@ int radeon_crtc_cursor_move(struct drm_crtc *crtc,
                         if (w <= 0)
                                 w = 1;
                 }
+       }
  
+       radeon_lock_cursor(crtc, true);
+       if (ASIC_IS_DCE4(rdev)) {
+               WREG32(EVERGREEN_CUR_POSITION + radeon_crtc->crtc_offset,
+                      ((xorigin ? 0 : x) << 16) |
+                      (yorigin ? 0 : y));
+               WREG32(EVERGREEN_CUR_HOT_SPOT + radeon_crtc->crtc_offset, (xorigin << 16) | yorigin);
+               WREG32(EVERGREEN_CUR_SIZE + radeon_crtc->crtc_offset,
+                      ((w - 1) << 16) | (radeon_crtc->cursor_height - 1));
+       } else if (ASIC_IS_AVIVO(rdev)) {
                 WREG32(AVIVO_D1CUR_POSITION + radeon_crtc->crtc_offset,
                              ((xorigin ? 0 : x) << 16) |
                              (yorigin ? 0 : y));
diff --git a/drivers/hid/hid-cando.c b/drivers/hid/hid-cando.c

index 4267a6fdc277a183cc840342fb5508cadda90815..5925bdcd417dbbf74d878b52fedcbc24a4d4de31 100644 (file)
--- a/drivers/hid/hid-cando.c
+++ b/drivers/hid/hid-cando.c
@@ -237,6 +237,8 @@ static const struct hid_device_id cando_devices[] = {
                         USB_DEVICE_ID_CANDO_MULTI_TOUCH) },
         { HID_USB_DEVICE(USB_VENDOR_ID_CANDO,
                         USB_DEVICE_ID_CANDO_MULTI_TOUCH_11_6) },
+       { HID_USB_DEVICE(USB_VENDOR_ID_CANDO,
+               USB_DEVICE_ID_CANDO_MULTI_TOUCH_15_6) },
         { }
  };
  MODULE_DEVICE_TABLE(hid, cando_devices);
diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c

index 3f7292486024b8feace0b72775c03a2ae122e8fc..a0dea3d1296e65ebc9b84e24ec1aeacd69aa59d7 100644 (file)
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -1292,6 +1292,7 @@ static const struct hid_device_id hid_blacklist[] = {
         { HID_USB_DEVICE(USB_VENDOR_ID_BTC, USB_DEVICE_ID_BTC_EMPREX_REMOTE_2) },
         { HID_USB_DEVICE(USB_VENDOR_ID_CANDO, USB_DEVICE_ID_CANDO_MULTI_TOUCH) },
         { HID_USB_DEVICE(USB_VENDOR_ID_CANDO, USB_DEVICE_ID_CANDO_MULTI_TOUCH_11_6) },
+       { HID_USB_DEVICE(USB_VENDOR_ID_CANDO, USB_DEVICE_ID_CANDO_MULTI_TOUCH_15_6) },
         { HID_USB_DEVICE(USB_VENDOR_ID_CHERRY, USB_DEVICE_ID_CHERRY_CYMOTION) },
         { HID_USB_DEVICE(USB_VENDOR_ID_CHERRY, USB_DEVICE_ID_CHERRY_CYMOTION_SOLAR) },
         { HID_USB_DEVICE(USB_VENDOR_ID_CHICONY, USB_DEVICE_ID_CHICONY_TACTICAL_PAD) },
diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h

index 765a4f53eb5cb663fd319d0a71386ae1a9ee0fa5..c5ae5f1545bd0a18d516edab6f04b3c18d0fca71 100644 (file)
--- a/drivers/hid/hid-ids.h
+++ b/drivers/hid/hid-ids.h
@@ -134,6 +134,7 @@
  #define USB_VENDOR_ID_CANDO            0x2087
  #define USB_DEVICE_ID_CANDO_MULTI_TOUCH        0x0a01
  #define USB_DEVICE_ID_CANDO_MULTI_TOUCH_11_6 0x0b03
+#define USB_DEVICE_ID_CANDO_MULTI_TOUCH_15_6 0x0f01
  
  #define USB_VENDOR_ID_CH               0x068e
  #define USB_DEVICE_ID_CH_PRO_PEDALS    0x00f2
@@ -503,6 +504,7 @@
  
  #define USB_VENDOR_ID_TURBOX           0x062a
  #define USB_DEVICE_ID_TURBOX_KEYBOARD  0x0201
+#define USB_DEVICE_ID_TURBOX_TOUCHSCREEN_MOSART        0x7100
  
  #define USB_VENDOR_ID_TWINHAN          0x6253
  #define USB_DEVICE_ID_TWINHAN_IR_REMOTE        0x0100
diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c

index 47d70c523d93474a658bbaa5aa5b1cfb327f194b..a3866b5c0c43da58bceae7bb463a2780846454be 100644 (file)
--- a/drivers/hid/hidraw.c
+++ b/drivers/hid/hidraw.c
@@ -109,6 +109,12 @@ static ssize_t hidraw_write(struct file *file, const char __user *buffer, size_t
         int ret = 0;
  
         mutex_lock(&minors_lock);
+
+       if (!hidraw_table[minor]) {
+               ret = -ENODEV;
+               goto out;
+       }
+
         dev = hidraw_table[minor]->hid;
  
         if (!dev->hid_output_raw_report) {
@@ -244,6 +250,10 @@ static long hidraw_ioctl(struct file *file, unsigned int cmd,
  
         mutex_lock(&minors_lock);
         dev = hidraw_table[minor];
+       if (!dev) {
+               ret = -ENODEV;
+               goto out;
+       }
  
         switch (cmd) {
                 case HIDIOCGRDESCSIZE:
@@ -317,6 +327,7 @@ static long hidraw_ioctl(struct file *file, unsigned int cmd,
  
                 ret = -ENOTTY;
         }
+out:
         mutex_unlock(&minors_lock);
         return ret;
  }
diff --git a/drivers/hid/usbhid/hid-quirks.c b/drivers/hid/usbhid/hid-quirks.c

index 70da3181c8a0467663c15abd324dd6fbe801ea51..f0260c699adb45ac9d5b11f119c1491a95e04504 100644 (file)
--- a/drivers/hid/usbhid/hid-quirks.c
+++ b/drivers/hid/usbhid/hid-quirks.c
@@ -36,6 +36,7 @@ static const struct hid_blacklist {
         { USB_VENDOR_ID_DWAV, USB_DEVICE_ID_EGALAX_TOUCHCONTROLLER, HID_QUIRK_MULTI_INPUT | HID_QUIRK_NOGET },
         { USB_VENDOR_ID_DWAV, USB_DEVICE_ID_DWAV_EGALAX_MULTITOUCH, HID_QUIRK_MULTI_INPUT },
         { USB_VENDOR_ID_MOJO, USB_DEVICE_ID_RETRO_ADAPTER, HID_QUIRK_MULTI_INPUT },
+       { USB_VENDOR_ID_TURBOX, USB_DEVICE_ID_TURBOX_TOUCHSCREEN_MOSART, HID_QUIRK_MULTI_INPUT },
         { USB_VENDOR_ID_HAPP, USB_DEVICE_ID_UGCI_DRIVING, HID_QUIRK_BADPAD | HID_QUIRK_MULTI_INPUT },
         { USB_VENDOR_ID_HAPP, USB_DEVICE_ID_UGCI_FLYING, HID_QUIRK_BADPAD | HID_QUIRK_MULTI_INPUT },
         { USB_VENDOR_ID_HAPP, USB_DEVICE_ID_UGCI_FIGHTING, HID_QUIRK_BADPAD | HID_QUIRK_MULTI_INPUT },
diff --git a/drivers/i2c/busses/i2c-davinci.c b/drivers/i2c/busses/i2c-davinci.c

index b8feac5f2ef4f6f4be4d410ec7a5a2ce2a6d4946..5795c8398c7c3a25af82319a0ce1d46741def8b2 100644 (file)
--- a/drivers/i2c/busses/i2c-davinci.c
+++ b/drivers/i2c/busses/i2c-davinci.c
@@ -331,21 +331,16 @@ i2c_davinci_xfer_msg(struct i2c_adapter *adap, struct i2c_msg *msg, int stop)
         INIT_COMPLETION(dev->cmd_complete);
         dev->cmd_err = 0;
  
-       /* Take I2C out of reset, configure it as master and set the
-        * start bit */
-       flag = DAVINCI_I2C_MDR_IRS | DAVINCI_I2C_MDR_MST | DAVINCI_I2C_MDR_STT;
+       /* Take I2C out of reset and configure it as master */
+       flag = DAVINCI_I2C_MDR_IRS | DAVINCI_I2C_MDR_MST;
  
         /* if the slave address is ten bit address, enable XA bit */
         if (msg->flags & I2C_M_TEN)
                 flag |= DAVINCI_I2C_MDR_XA;
         if (!(msg->flags & I2C_M_RD))
                 flag |= DAVINCI_I2C_MDR_TRX;
-       if (stop)
-               flag |= DAVINCI_I2C_MDR_STP;
-       if (msg->len == 0) {
+       if (msg->len == 0)
                 flag |= DAVINCI_I2C_MDR_RM;
-               flag &= ~DAVINCI_I2C_MDR_STP;
-       }
  
         /* Enable receive or transmit interrupts */
         w = davinci_i2c_read_reg(dev, DAVINCI_I2C_IMR_REG);
@@ -357,18 +352,29 @@ i2c_davinci_xfer_msg(struct i2c_adapter *adap, struct i2c_msg *msg, int stop)
  
         dev->terminate = 0;
  
+       /*
+        * Write mode register first as needed for correct behaviour
+        * on OMAP-L138, but don't set STT yet to avoid a race with XRDY
+        * occuring before we have loaded DXR
+        */
+       davinci_i2c_write_reg(dev, DAVINCI_I2C_MDR_REG, flag);
+
         /*
          * First byte should be set here, not after interrupt,
          * because transmit-data-ready interrupt can come before
          * NACK-interrupt during sending of previous message and
          * ICDXR may have wrong data
+        * It also saves us one interrupt, slightly faster
          */
         if ((!(msg->flags & I2C_M_RD)) && dev->buf_len) {
                 davinci_i2c_write_reg(dev, DAVINCI_I2C_DXR_REG, *dev->buf++);
                 dev->buf_len--;
         }
  
-       /* write the data into mode register; start transmitting */
+       /* Set STT to begin transmit now DXR is loaded */
+       flag |= DAVINCI_I2C_MDR_STT;
+       if (stop && msg->len != 0)
+               flag |= DAVINCI_I2C_MDR_STP;
         davinci_i2c_write_reg(dev, DAVINCI_I2C_MDR_REG, flag);
  
         r = wait_for_completion_interruptible_timeout(&dev->cmd_complete,
diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c

index d1ff9408dc1f2d68cdbe305c6777ba259eed64e4..4c2a62b75b5cf188dd896dff100c80bfe253aded 100644 (file)
--- a/drivers/i2c/busses/i2c-imx.c
+++ b/drivers/i2c/busses/i2c-imx.c
@@ -159,15 +159,9 @@ static int i2c_imx_bus_busy(struct imx_i2c_struct *i2c_imx, int for_busy)
  
  static int i2c_imx_trx_complete(struct imx_i2c_struct *i2c_imx)
  {
-       int result;
-
-       result = wait_event_interruptible_timeout(i2c_imx->queue,
-               i2c_imx->i2csr & I2SR_IIF, HZ / 10);
+       wait_event_timeout(i2c_imx->queue, i2c_imx->i2csr & I2SR_IIF, HZ / 10);
  
-       if (unlikely(result < 0)) {
-               dev_dbg(&i2c_imx->adapter.dev, "<%s> result < 0\n", __func__);
-               return result;
-       } else if (unlikely(!(i2c_imx->i2csr & I2SR_IIF))) {
+       if (unlikely(!(i2c_imx->i2csr & I2SR_IIF))) {
                 dev_dbg(&i2c_imx->adapter.dev, "<%s> Timeout\n", __func__);
                 return -ETIMEDOUT;
         }
@@ -295,7 +289,7 @@ static irqreturn_t i2c_imx_isr(int irq, void *dev_id)
                 i2c_imx->i2csr = temp;
                 temp &= ~I2SR_IIF;
                 writeb(temp, i2c_imx->base + IMX_I2C_I2SR);
-               wake_up_interruptible(&i2c_imx->queue);
+               wake_up(&i2c_imx->queue);
                 return IRQ_HANDLED;
         }
  
diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c

index c908c5f83645c901f87823e2a587ba65d9b5ee97..af9ee313c10b67b9f0a034e6806757aca085c4ed 100644 (file)
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -28,7 +28,7 @@ struct evdev {
         int minor;
         struct input_handle handle;
         wait_queue_head_t wait;
-       struct evdev_client *grab;
+       struct evdev_client __rcu *grab;
         struct list_head client_list;
         spinlock_t client_lock; /* protects client_list */
         struct mutex mutex;
@@ -669,6 +669,9 @@ static long evdev_do_ioctl(struct file *file, unsigned int cmd,
  
                 if ((_IOC_NR(cmd) & ~ABS_MAX) == _IOC_NR(EVIOCGABS(0))) {
  
+                       if (!dev->absinfo)
+                               return -EINVAL;
+
                         t = _IOC_NR(cmd) & ABS_MAX;
                         abs = dev->absinfo[t];
  
@@ -680,10 +683,13 @@ static long evdev_do_ioctl(struct file *file, unsigned int cmd,
                 }
         }
  
-       if (_IOC_DIR(cmd) == _IOC_READ) {
+       if (_IOC_DIR(cmd) == _IOC_WRITE) {
  
                 if ((_IOC_NR(cmd) & ~ABS_MAX) == _IOC_NR(EVIOCSABS(0))) {
  
+                       if (!dev->absinfo)
+                               return -EINVAL;
+
                         t = _IOC_NR(cmd) & ABS_MAX;
  
                         if (copy_from_user(&abs, p, min_t(size_t,
diff --git a/drivers/input/misc/hp_sdc_rtc.c b/drivers/input/misc/hp_sdc_rtc.c

index c19066479057ff3567d7168c38922e0fd0b84da3..7e2c12a5b83933e23ce665783d2886b4ee6f211f 100644 (file)
--- a/drivers/input/misc/hp_sdc_rtc.c
+++ b/drivers/input/misc/hp_sdc_rtc.c
@@ -104,7 +104,7 @@ static int hp_sdc_rtc_do_read_bbrtc (struct rtc_time *rtctm)
         t.endidx =              91;
         t.seq =                 tseq;
         t.act.semaphore =       &tsem;
-       init_MUTEX_LOCKED(&tsem);
+       sema_init(&tsem, 0);
         
         if (hp_sdc_enqueue_transaction(&t)) return -1;
         
@@ -698,7 +698,7 @@ static int __init hp_sdc_rtc_init(void)
                 return -ENODEV;
  #endif
  
-       init_MUTEX(&i8042tregs);
+       sema_init(&i8042tregs, 1);
  
         if ((ret = hp_sdc_request_timer_irq(&hp_sdc_rtc_isr)))
                 return ret;
diff --git a/drivers/input/serio/hil_mlc.c b/drivers/input/serio/hil_mlc.c

index c92f4edfee7beef3a326e0af4e43275093fba8dc..e5624d8f1709e6c576547c6e78ba66e766d39743 100644 (file)
--- a/drivers/input/serio/hil_mlc.c
+++ b/drivers/input/serio/hil_mlc.c
@@ -915,15 +915,15 @@ int hil_mlc_register(hil_mlc *mlc)
         mlc->ostarted = 0;
  
         rwlock_init(&mlc->lock);
-       init_MUTEX(&mlc->osem);
+       sema_init(&mlc->osem, 1);
  
-       init_MUTEX(&mlc->isem);
+       sema_init(&mlc->isem, 1);
         mlc->icount = -1;
         mlc->imatch = 0;
  
         mlc->opercnt = 0;
  
-       init_MUTEX_LOCKED(&(mlc->csem));
+       sema_init(&(mlc->csem), 0);
  
         hil_mlc_clear_di_scratch(mlc);
         hil_mlc_clear_di_map(mlc, 0);
diff --git a/drivers/input/serio/hp_sdc.c b/drivers/input/serio/hp_sdc.c

index bcc2d30ec245ef8349e81745958e0cf8112dfcbf..8c0b51c31424ce2e0c9b15a2a9eb467424e9b3ec 100644 (file)
--- a/drivers/input/serio/hp_sdc.c
+++ b/drivers/input/serio/hp_sdc.c
@@ -905,7 +905,7 @@ static int __init hp_sdc_init(void)
         ts_sync[1]      = 0x0f;
         ts_sync[2] = ts_sync[3] = ts_sync[4] = ts_sync[5] = 0;
         t_sync.act.semaphore = &s_sync;
-       init_MUTEX_LOCKED(&s_sync);
+       sema_init(&s_sync, 0);
         hp_sdc_enqueue_transaction(&t_sync);
         down(&s_sync); /* Wait for t_sync to complete */
  
@@ -1039,7 +1039,7 @@ static int __init hp_sdc_register(void)
                 return hp_sdc.dev_err;
         }
  
-       init_MUTEX_LOCKED(&tq_init_sem);
+       sema_init(&tq_init_sem, 0);
  
         tq_init.actidx          = 0;
         tq_init.idx             = 1;
diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c

index 1c4ee6e77937f596378f88587787052106cd3c58..bf64e49d996acd121a215c7d0bdc3c1bc1fd86b6 100644 (file)
--- a/drivers/macintosh/adb.c
+++ b/drivers/macintosh/adb.c
@@ -83,7 +83,7 @@ static struct adb_driver *adb_controller;
  BLOCKING_NOTIFIER_HEAD(adb_client_list);
  static int adb_got_sleep;
  static int adb_inited;
-static DECLARE_MUTEX(adb_probe_mutex);
+static DEFINE_SEMAPHORE(adb_probe_mutex);
  static int sleepy_trackpad;
  static int autopoll_devs;
  int __adb_probe_sync;
diff --git a/drivers/media/video/v4l2-compat-ioctl32.c b/drivers/media/video/v4l2-compat-ioctl32.c

index 073f01390cdd0a00de7e34dc7cd30360b72912c4..86294ed35c9b643cc7bab7411904476d0f7467a7 100644 (file)
--- a/drivers/media/video/v4l2-compat-ioctl32.c
+++ b/drivers/media/video/v4l2-compat-ioctl32.c
@@ -193,17 +193,24 @@ static int put_video_window32(struct video_window *kp, struct video_window32 __u
  struct video_code32 {
         char            loadwhat[16];   /* name or tag of file being passed */
         compat_int_t    datasize;
-       unsigned char   *data;
+       compat_uptr_t   data;
  };
  
-static int get_microcode32(struct video_code *kp, struct video_code32 __user *up)
+static struct video_code __user *get_microcode32(struct video_code32 *kp)
  {
-       if (!access_ok(VERIFY_READ, up, sizeof(struct video_code32)) ||
-               copy_from_user(kp->loadwhat, up->loadwhat, sizeof(up->loadwhat)) ||
-               get_user(kp->datasize, &up->datasize) ||
-               copy_from_user(kp->data, up->data, up->datasize))
-                       return -EFAULT;
-       return 0;
+       struct video_code __user *up;
+
+       up = compat_alloc_user_space(sizeof(*up));
+
+       /*
+        * NOTE! We don't actually care if these fail. If the
+        * user address is invalid, the native ioctl will do
+        * the error handling for us
+        */
+       (void) copy_to_user(up->loadwhat, kp->loadwhat, sizeof(up->loadwhat));
+       (void) put_user(kp->datasize, &up->datasize);
+       (void) put_user(compat_ptr(kp->data), &up->data);
+       return up;
  }
  
  #define VIDIOCGTUNER32         _IOWR('v', 4, struct video_tuner32)
@@ -739,7 +746,7 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar
                 struct video_tuner vt;
                 struct video_buffer vb;
                 struct video_window vw;
-               struct video_code vc;
+               struct video_code32 vc;
                 struct video_audio va;
  #endif
                 struct v4l2_format v2f;
@@ -818,8 +825,11 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar
                 break;
  
         case VIDIOCSMICROCODE:
-               err = get_microcode32(&karg.vc, up);
-               compatible_arg = 0;
+               /* Copy the 32-bit "video_code32" to kernel space */
+               if (copy_from_user(&karg.vc, up, sizeof(karg.vc)))
+                       return -EFAULT;
+               /* Convert the 32-bit version to a 64-bit version in user space */
+               up = get_microcode32(&karg.vc);
                 break;
  
         case VIDIOCSFREQ:
diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c

index 5db49b124ffa158793e0cfb3d1db72321679d440..09eee6df0653c84fc5c08023f8913a6afbb884a7 100644 (file)
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -1631,6 +1631,19 @@ int mmc_suspend_host(struct mmc_host *host)
         if (host->bus_ops && !host->bus_dead) {
                 if (host->bus_ops->suspend)
                         err = host->bus_ops->suspend(host);
+               if (err == -ENOSYS || !host->bus_ops->resume) {
+                       /*
+                        * We simply "remove" the card in this case.
+                        * It will be redetected on resume.
+                        */
+                       if (host->bus_ops->remove)
+                               host->bus_ops->remove(host);
+                       mmc_claim_host(host);
+                       mmc_detach_bus(host);
+                       mmc_release_host(host);
+                       host->pm_flags = 0;
+                       err = 0;
+               }
         }
         mmc_bus_put(host);
  
diff --git a/drivers/mtd/nand/mxc_nand.c b/drivers/mtd/nand/mxc_nand.c

index b2828e84d24377fdbf2be2d9af3d2492ee7feea2..214b03afdd482920adda308e51088092d884a2d1 100644 (file)
--- a/drivers/mtd/nand/mxc_nand.c
+++ b/drivers/mtd/nand/mxc_nand.c
@@ -30,6 +30,8 @@
  #include <linux/clk.h>
  #include <linux/err.h>
  #include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/completion.h>
  
  #include <asm/mach/flash.h>
  #include <mach/mxc_nand.h>
@@ -151,7 +153,7 @@ struct mxc_nand_host {
         int                     irq;
         int                     eccsize;
  
-       wait_queue_head_t       irq_waitq;
+       struct completion       op_completion;
  
         uint8_t                 *data_buf;
         unsigned int            buf_start;
@@ -164,6 +166,7 @@ struct mxc_nand_host {
         void                    (*send_read_id)(struct mxc_nand_host *);
         uint16_t                (*get_dev_status)(struct mxc_nand_host *);
         int                     (*check_int)(struct mxc_nand_host *);
+       void                    (*irq_control)(struct mxc_nand_host *, int);
  };
  
  /* OOB placement block for use with hardware ecc generation */
@@ -216,9 +219,12 @@ static irqreturn_t mxc_nfc_irq(int irq, void *dev_id)
  {
         struct mxc_nand_host *host = dev_id;
  
-       disable_irq_nosync(irq);
+       if (!host->check_int(host))
+               return IRQ_NONE;
  
-       wake_up(&host->irq_waitq);
+       host->irq_control(host, 0);
+
+       complete(&host->op_completion);
  
         return IRQ_HANDLED;
  }
@@ -245,11 +251,54 @@ static int check_int_v1_v2(struct mxc_nand_host *host)
         if (!(tmp & NFC_V1_V2_CONFIG2_INT))
                 return 0;
  
-       writew(tmp & ~NFC_V1_V2_CONFIG2_INT, NFC_V1_V2_CONFIG2);
+       if (!cpu_is_mx21())
+               writew(tmp & ~NFC_V1_V2_CONFIG2_INT, NFC_V1_V2_CONFIG2);
  
         return 1;
  }
  
+/*
+ * It has been observed that the i.MX21 cannot read the CONFIG2:INT bit
+ * if interrupts are masked (CONFIG1:INT_MSK is set). To handle this, the
+ * driver can enable/disable the irq line rather than simply masking the
+ * interrupts.
+ */
+static void irq_control_mx21(struct mxc_nand_host *host, int activate)
+{
+       if (activate)
+               enable_irq(host->irq);
+       else
+               disable_irq_nosync(host->irq);
+}
+
+static void irq_control_v1_v2(struct mxc_nand_host *host, int activate)
+{
+       uint16_t tmp;
+
+       tmp = readw(NFC_V1_V2_CONFIG1);
+
+       if (activate)
+               tmp &= ~NFC_V1_V2_CONFIG1_INT_MSK;
+       else
+               tmp |= NFC_V1_V2_CONFIG1_INT_MSK;
+
+       writew(tmp, NFC_V1_V2_CONFIG1);
+}
+
+static void irq_control_v3(struct mxc_nand_host *host, int activate)
+{
+       uint32_t tmp;
+
+       tmp = readl(NFC_V3_CONFIG2);
+
+       if (activate)
+               tmp &= ~NFC_V3_CONFIG2_INT_MSK;
+       else
+               tmp |= NFC_V3_CONFIG2_INT_MSK;
+
+       writel(tmp, NFC_V3_CONFIG2);
+}
+
  /* This function polls the NANDFC to wait for the basic operation to
   * complete by checking the INT bit of config2 register.
   */
@@ -259,10 +308,9 @@ static void wait_op_done(struct mxc_nand_host *host, int useirq)
  
         if (useirq) {
                 if (!host->check_int(host)) {
-
-                       enable_irq(host->irq);
-
-                       wait_event(host->irq_waitq, host->check_int(host));
+                       INIT_COMPLETION(host->op_completion);
+                       host->irq_control(host, 1);
+                       wait_for_completion(&host->op_completion);
                 }
         } else {
                 while (max_retries-- > 0) {
@@ -799,6 +847,7 @@ static void preset_v3(struct mtd_info *mtd)
                 NFC_V3_CONFIG2_2CMD_PHASES |
                 NFC_V3_CONFIG2_SPAS(mtd->oobsize >> 1) |
                 NFC_V3_CONFIG2_ST_CMD(0x70) |
+               NFC_V3_CONFIG2_INT_MSK |
                 NFC_V3_CONFIG2_NUM_ADDR_PHASE0;
  
         if (chip->ecc.mode == NAND_ECC_HW)
@@ -1024,6 +1073,10 @@ static int __init mxcnd_probe(struct platform_device *pdev)
                 host->send_read_id = send_read_id_v1_v2;
                 host->get_dev_status = get_dev_status_v1_v2;
                 host->check_int = check_int_v1_v2;
+               if (cpu_is_mx21())
+                       host->irq_control = irq_control_mx21;
+               else
+                       host->irq_control = irq_control_v1_v2;
         }
  
         if (nfc_is_v21()) {
@@ -1062,6 +1115,7 @@ static int __init mxcnd_probe(struct platform_device *pdev)
                 host->send_read_id = send_read_id_v3;
                 host->check_int = check_int_v3;
                 host->get_dev_status = get_dev_status_v3;
+               host->irq_control = irq_control_v3;
                 oob_smallpage = &nandv2_hw_eccoob_smallpage;
                 oob_largepage = &nandv2_hw_eccoob_largepage;
         } else
@@ -1093,14 +1147,34 @@ static int __init mxcnd_probe(struct platform_device *pdev)
                 this->options |= NAND_USE_FLASH_BBT;
         }
  
-       init_waitqueue_head(&host->irq_waitq);
+       init_completion(&host->op_completion);
  
         host->irq = platform_get_irq(pdev, 0);
  
+       /*
+        * mask the interrupt. For i.MX21 explicitely call
+        * irq_control_v1_v2 to use the mask bit. We can't call
+        * disable_irq_nosync() for an interrupt we do not own yet.
+        */
+       if (cpu_is_mx21())
+               irq_control_v1_v2(host, 0);
+       else
+               host->irq_control(host, 0);
+
         err = request_irq(host->irq, mxc_nfc_irq, IRQF_DISABLED, DRIVER_NAME, host);
         if (err)
                 goto eirq;
  
+       host->irq_control(host, 0);
+
+       /*
+        * Now that the interrupt is disabled make sure the interrupt
+        * mask bit is cleared on i.MX21. Otherwise we can't read
+        * the interrupt status bit on this machine.
+        */
+       if (cpu_is_mx21())
+               irq_control_v1_v2(host, 1);
+
         /* first scan to find the device and get the page size */
         if (nand_scan_ident(mtd, 1, NULL)) {
                 err = -ENXIO;
diff --git a/drivers/net/3c527.c b/drivers/net/3c527.c

index 70705d1306b93e161260852d39f6adf6fe308226..eca55c52bdfdf2ae8c8445bfac2901cfd9d99bd3 100644 (file)
--- a/drivers/net/3c527.c
+++ b/drivers/net/3c527.c
@@ -522,7 +522,7 @@ static int __init mc32_probe1(struct net_device *dev, int slot)
         lp->tx_len              = lp->exec_box->data[9];   /* Transmit list count */
         lp->rx_len              = lp->exec_box->data[11];  /* Receive list count */
  
-       init_MUTEX_LOCKED(&lp->cmd_mutex);
+       sema_init(&lp->cmd_mutex, 0);
         init_completion(&lp->execution_cmd);
         init_completion(&lp->xceiver_cmd);
  
diff --git a/drivers/net/b44.c b/drivers/net/b44.c

index 1e620e287ae0cc9fbe9894a105fedbfdc9fbb6be..efeffdf9e5fab30d2bd97a07e5e866fec60ed595 100644 (file)
--- a/drivers/net/b44.c
+++ b/drivers/net/b44.c
@@ -2170,8 +2170,6 @@ static int __devinit b44_init_one(struct ssb_device *sdev,
         dev->irq = sdev->irq;
         SET_ETHTOOL_OPS(dev, &b44_ethtool_ops);
  
-       netif_carrier_off(dev);
-
         err = ssb_bus_powerup(sdev->bus, 0);
         if (err) {
                 dev_err(sdev->dev,
@@ -2213,6 +2211,8 @@ static int __devinit b44_init_one(struct ssb_device *sdev,
                 goto err_out_powerdown;
         }
  
+       netif_carrier_off(dev);
+
         ssb_set_drvdata(sdev, dev);
  
         /* Chip reset provides power to the b44 MAC & PCI cores, which
diff --git a/drivers/net/ehea/ehea_main.c b/drivers/net/ehea/ehea_main.c

index a333b42111b8c2ba20b92eca94bf5648704c4f9a..6372610ed24093b8ed99fdf002e44442be49f696 100644 (file)
--- a/drivers/net/ehea/ehea_main.c
+++ b/drivers/net/ehea/ehea_main.c
@@ -533,8 +533,15 @@ static inline void ehea_fill_skb(struct net_device *dev,
         int length = cqe->num_bytes_transfered - 4;     /*remove CRC */
  
         skb_put(skb, length);
-       skb->ip_summed = CHECKSUM_UNNECESSARY;
         skb->protocol = eth_type_trans(skb, dev);
+
+       /* The packet was not an IPV4 packet so a complemented checksum was
+          calculated. The value is found in the Internet Checksum field. */
+       if (cqe->status & EHEA_CQE_BLIND_CKSUM) {
+               skb->ip_summed = CHECKSUM_COMPLETE;
+               skb->csum = csum_unfold(~cqe->inet_checksum_value);
+       } else
+               skb->ip_summed = CHECKSUM_UNNECESSARY;
  }
  
  static inline struct sk_buff *get_skb_by_index(struct sk_buff **skb_array,
diff --git a/drivers/net/ehea/ehea_qmr.h b/drivers/net/ehea/ehea_qmr.h

index f608a6c54af5845727494c9749e2b786390dfa78..38104734a3be82b0fb4480526295510a4769e8bf 100644 (file)
--- a/drivers/net/ehea/ehea_qmr.h
+++ b/drivers/net/ehea/ehea_qmr.h
@@ -150,6 +150,7 @@ struct ehea_rwqe {
  #define EHEA_CQE_TYPE_RQ           0x60
  #define EHEA_CQE_STAT_ERR_MASK     0x700F
  #define EHEA_CQE_STAT_FAT_ERR_MASK 0xF
+#define EHEA_CQE_BLIND_CKSUM       0x8000
  #define EHEA_CQE_STAT_ERR_TCP      0x4000
  #define EHEA_CQE_STAT_ERR_IP       0x2000
  #define EHEA_CQE_STAT_ERR_CRC      0x1000
diff --git a/drivers/net/fec.c b/drivers/net/fec.c

index 768b840aeb6b7b0bf2ceec87469bb0b4925b5d20..cce32d43175f5c3ed5f6e7ef2ebd56eede8e9139 100644 (file)
--- a/drivers/net/fec.c
+++ b/drivers/net/fec.c
@@ -678,24 +678,37 @@ static int fec_enet_mii_probe(struct net_device *dev)
  {
         struct fec_enet_private *fep = netdev_priv(dev);
         struct phy_device *phy_dev = NULL;
-       int ret;
+       char mdio_bus_id[MII_BUS_ID_SIZE];
+       char phy_name[MII_BUS_ID_SIZE + 3];
+       int phy_id;
  
         fep->phy_dev = NULL;
  
-       /* find the first phy */
-       phy_dev = phy_find_first(fep->mii_bus);
-       if (!phy_dev) {
-               printk(KERN_ERR "%s: no PHY found\n", dev->name);
-               return -ENODEV;
+       /* check for attached phy */
+       for (phy_id = 0; (phy_id < PHY_MAX_ADDR); phy_id++) {
+               if ((fep->mii_bus->phy_mask & (1 << phy_id)))
+                       continue;
+               if (fep->mii_bus->phy_map[phy_id] == NULL)
+                       continue;
+               if (fep->mii_bus->phy_map[phy_id]->phy_id == 0)
+                       continue;
+               strncpy(mdio_bus_id, fep->mii_bus->id, MII_BUS_ID_SIZE);
+               break;
         }
  
-       /* attach the mac to the phy */
-       ret = phy_connect_direct(dev, phy_dev,
-                            &fec_enet_adjust_link, 0,
-                            PHY_INTERFACE_MODE_MII);
-       if (ret) {
-               printk(KERN_ERR "%s: Could not attach to PHY\n", dev->name);
-               return ret;
+       if (phy_id >= PHY_MAX_ADDR) {
+               printk(KERN_INFO "%s: no PHY, assuming direct connection "
+                       "to switch\n", dev->name);
+               strncpy(mdio_bus_id, "0", MII_BUS_ID_SIZE);
+               phy_id = 0;
+       }
+
+       snprintf(phy_name, MII_BUS_ID_SIZE, PHY_ID_FMT, mdio_bus_id, phy_id);
+       phy_dev = phy_connect(dev, phy_name, &fec_enet_adjust_link, 0,
+               PHY_INTERFACE_MODE_MII);
+       if (IS_ERR(phy_dev)) {
+               printk(KERN_ERR "%s: could not attach to PHY\n", dev->name);
+               return PTR_ERR(phy_dev);
         }
  
         /* mask with MAC supported features */
@@ -738,7 +751,7 @@ static int fec_enet_mii_init(struct platform_device *pdev)
         fep->mii_bus->read = fec_enet_mdio_read;
         fep->mii_bus->write = fec_enet_mdio_write;
         fep->mii_bus->reset = fec_enet_mdio_reset;
-       snprintf(fep->mii_bus->id, MII_BUS_ID_SIZE, "%x", pdev->id);
+       snprintf(fep->mii_bus->id, MII_BUS_ID_SIZE, "%x", pdev->id + 1);
         fep->mii_bus->priv = fep;
         fep->mii_bus->parent = &pdev->dev;
  
@@ -1311,6 +1324,9 @@ fec_probe(struct platform_device *pdev)
         if (ret)
                 goto failed_mii_init;
  
+       /* Carrier starts down, phylib will bring it up */
+       netif_carrier_off(ndev);
+
         ret = register_netdev(ndev);
         if (ret)
                 goto failed_register;
diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c

index 4b52c767ad056ced9aafcafea8299384b807c169..3e5d0b6b6516133039192fa5dfd1fdf88660d81b 100644 (file)
--- a/drivers/net/hamradio/6pack.c
+++ b/drivers/net/hamradio/6pack.c
@@ -608,7 +608,7 @@ static int sixpack_open(struct tty_struct *tty)
  
         spin_lock_init(&sp->lock);
         atomic_set(&sp->refcnt, 1);
-       init_MUTEX_LOCKED(&sp->dead_sem);
+       sema_init(&sp->dead_sem, 0);
  
         /* !!! length of the buffers. MTU is IP MTU, not PACLEN!  */
  
diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c

index 66e88bd59caada26f9cdaa4b6dbb7c1a78026e1f..4c628393c8b157cbc09de52d902b5fa8c3d370a3 100644 (file)
--- a/drivers/net/hamradio/mkiss.c
+++ b/drivers/net/hamradio/mkiss.c
@@ -747,7 +747,7 @@ static int mkiss_open(struct tty_struct *tty)
  
         spin_lock_init(&ax->buflock);
         atomic_set(&ax->refcnt, 1);
-       init_MUTEX_LOCKED(&ax->dead_sem);
+       sema_init(&ax->dead_sem, 0);
  
         ax->tty = tty;
         tty->disc_data = ax;
diff --git a/drivers/net/irda/sir_dev.c b/drivers/net/irda/sir_dev.c

index 1b051dab7b298a761a5d9e4f2ff38af16f062238..51d74447f8f8cb7d428c7bbdf22b14f71cdfc4aa 100644 (file)
--- a/drivers/net/irda/sir_dev.c
+++ b/drivers/net/irda/sir_dev.c
@@ -909,7 +909,7 @@ struct sir_dev * sirdev_get_instance(const struct sir_driver *drv, const char *n
         dev->tx_skb = NULL;
  
         spin_lock_init(&dev->tx_lock);
-       init_MUTEX(&dev->fsm.sem);
+       sema_init(&dev->fsm.sem, 1);
  
         dev->drv = drv;
         dev->netdev = ndev;
diff --git a/drivers/net/ppp_async.c b/drivers/net/ppp_async.c

index af50a530daee25bf3debf8493690861c2ea2fcc2..78d70a6481bfa7f986e165e427b8c76bd75ae835 100644 (file)
--- a/drivers/net/ppp_async.c
+++ b/drivers/net/ppp_async.c
@@ -184,7 +184,7 @@ ppp_asynctty_open(struct tty_struct *tty)
         tasklet_init(&ap->tsk, ppp_async_process, (unsigned long) ap);
  
         atomic_set(&ap->refcnt, 1);
-       init_MUTEX_LOCKED(&ap->dead_sem);
+       sema_init(&ap->dead_sem, 0);
  
         ap->chan.private = ap;
         ap->chan.ops = &async_ops;
diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c

index a0da4a17b025ce08469659a420d37944861876d3..992db2fa136e9c5e6f5130c053393aa662d4e5c9 100644 (file)
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -1212,7 +1212,8 @@ static void rtl8169_update_counters(struct net_device *dev)
         if ((RTL_R8(ChipCmd) & CmdRxEnb) == 0)
                 return;
  
-       counters = pci_alloc_consistent(tp->pci_dev, sizeof(*counters), &paddr);
+       counters = dma_alloc_coherent(&tp->pci_dev->dev, sizeof(*counters),
+                                     &paddr, GFP_KERNEL);
         if (!counters)
                 return;
  
@@ -1233,7 +1234,8 @@ static void rtl8169_update_counters(struct net_device *dev)
         RTL_W32(CounterAddrLow, 0);
         RTL_W32(CounterAddrHigh, 0);
  
-       pci_free_consistent(tp->pci_dev, sizeof(*counters), counters, paddr);
+       dma_free_coherent(&tp->pci_dev->dev, sizeof(*counters), counters,
+                         paddr);
  }
  
  static void rtl8169_get_ethtool_stats(struct net_device *dev,
@@ -3292,15 +3294,15 @@ static int rtl8169_open(struct net_device *dev)
  
         /*
          * Rx and Tx desscriptors needs 256 bytes alignment.
-        * pci_alloc_consistent provides more.
+        * dma_alloc_coherent provides more.
          */
-       tp->TxDescArray = pci_alloc_consistent(pdev, R8169_TX_RING_BYTES,
-                                              &tp->TxPhyAddr);
+       tp->TxDescArray = dma_alloc_coherent(&pdev->dev, R8169_TX_RING_BYTES,
+                                            &tp->TxPhyAddr, GFP_KERNEL);
         if (!tp->TxDescArray)
                 goto err_pm_runtime_put;
  
-       tp->RxDescArray = pci_alloc_consistent(pdev, R8169_RX_RING_BYTES,
-                                              &tp->RxPhyAddr);
+       tp->RxDescArray = dma_alloc_coherent(&pdev->dev, R8169_RX_RING_BYTES,
+                                            &tp->RxPhyAddr, GFP_KERNEL);
         if (!tp->RxDescArray)
                 goto err_free_tx_0;
  
@@ -3334,12 +3336,12 @@ out:
  err_release_ring_2:
         rtl8169_rx_clear(tp);
  err_free_rx_1:
-       pci_free_consistent(pdev, R8169_RX_RING_BYTES, tp->RxDescArray,
-                           tp->RxPhyAddr);
+       dma_free_coherent(&pdev->dev, R8169_RX_RING_BYTES, tp->RxDescArray,
+                         tp->RxPhyAddr);
         tp->RxDescArray = NULL;
  err_free_tx_0:
-       pci_free_consistent(pdev, R8169_TX_RING_BYTES, tp->TxDescArray,
-                           tp->TxPhyAddr);
+       dma_free_coherent(&pdev->dev, R8169_TX_RING_BYTES, tp->TxDescArray,
+                         tp->TxPhyAddr);
         tp->TxDescArray = NULL;
  err_pm_runtime_put:
         pm_runtime_put_noidle(&pdev->dev);
@@ -3975,7 +3977,7 @@ static void rtl8169_free_rx_skb(struct rtl8169_private *tp,
  {
         struct pci_dev *pdev = tp->pci_dev;
  
-       pci_unmap_single(pdev, le64_to_cpu(desc->addr), tp->rx_buf_sz,
+       dma_unmap_single(&pdev->dev, le64_to_cpu(desc->addr), tp->rx_buf_sz,
                          PCI_DMA_FROMDEVICE);
         dev_kfree_skb(*sk_buff);
         *sk_buff = NULL;
@@ -4000,7 +4002,7 @@ static inline void rtl8169_map_to_asic(struct RxDesc *desc, dma_addr_t mapping,
  static struct sk_buff *rtl8169_alloc_rx_skb(struct pci_dev *pdev,
                                             struct net_device *dev,
                                             struct RxDesc *desc, int rx_buf_sz,
-                                           unsigned int align)
+                                           unsigned int align, gfp_t gfp)
  {
         struct sk_buff *skb;
         dma_addr_t mapping;
@@ -4008,13 +4010,13 @@ static struct sk_buff *rtl8169_alloc_rx_skb(struct pci_dev *pdev,
  
         pad = align ? align : NET_IP_ALIGN;
  
-       skb = netdev_alloc_skb(dev, rx_buf_sz + pad);
+       skb = __netdev_alloc_skb(dev, rx_buf_sz + pad, gfp);
         if (!skb)
                 goto err_out;
  
         skb_reserve(skb, align ? ((pad - 1) & (unsigned long)skb->data) : pad);
  
-       mapping = pci_map_single(pdev, skb->data, rx_buf_sz,
+       mapping = dma_map_single(&pdev->dev, skb->data, rx_buf_sz,
                                  PCI_DMA_FROMDEVICE);
  
         rtl8169_map_to_asic(desc, mapping, rx_buf_sz);
@@ -4039,7 +4041,7 @@ static void rtl8169_rx_clear(struct rtl8169_private *tp)
  }
  
  static u32 rtl8169_rx_fill(struct rtl8169_private *tp, struct net_device *dev,
-                          u32 start, u32 end)
+                          u32 start, u32 end, gfp_t gfp)
  {
         u32 cur;
  
@@ -4054,7 +4056,7 @@ static u32 rtl8169_rx_fill(struct rtl8169_private *tp, struct net_device *dev,
  
                 skb = rtl8169_alloc_rx_skb(tp->pci_dev, dev,
                                            tp->RxDescArray + i,
-                                          tp->rx_buf_sz, tp->align);
+                                          tp->rx_buf_sz, tp->align, gfp);
                 if (!skb)
                         break;
  
@@ -4082,7 +4084,7 @@ static int rtl8169_init_ring(struct net_device *dev)
         memset(tp->tx_skb, 0x0, NUM_TX_DESC * sizeof(struct ring_info));
         memset(tp->Rx_skbuff, 0x0, NUM_RX_DESC * sizeof(struct sk_buff *));
  
-       if (rtl8169_rx_fill(tp, dev, 0, NUM_RX_DESC) != NUM_RX_DESC)
+       if (rtl8169_rx_fill(tp, dev, 0, NUM_RX_DESC, GFP_KERNEL) != NUM_RX_DESC)
                 goto err_out;
  
         rtl8169_mark_as_last_descriptor(tp->RxDescArray + NUM_RX_DESC - 1);
@@ -4099,7 +4101,8 @@ static void rtl8169_unmap_tx_skb(struct pci_dev *pdev, struct ring_info *tx_skb,
  {
         unsigned int len = tx_skb->len;
  
-       pci_unmap_single(pdev, le64_to_cpu(desc->addr), len, PCI_DMA_TODEVICE);
+       dma_unmap_single(&pdev->dev, le64_to_cpu(desc->addr), len,
+                        PCI_DMA_TODEVICE);
         desc->opts1 = 0x00;
         desc->opts2 = 0x00;
         desc->addr = 0x00;
@@ -4243,7 +4246,8 @@ static int rtl8169_xmit_frags(struct rtl8169_private *tp, struct sk_buff *skb,
                 txd = tp->TxDescArray + entry;
                 len = frag->size;
                 addr = ((void *) page_address(frag->page)) + frag->page_offset;
-               mapping = pci_map_single(tp->pci_dev, addr, len, PCI_DMA_TODEVICE);
+               mapping = dma_map_single(&tp->pci_dev->dev, addr, len,
+                                        PCI_DMA_TODEVICE);
  
                 /* anti gcc 2.95.3 bugware (sic) */
                 status = opts1 | len | (RingEnd * !((entry + 1) % NUM_TX_DESC));
@@ -4313,7 +4317,8 @@ static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb,
                 tp->tx_skb[entry].skb = skb;
         }
  
-       mapping = pci_map_single(tp->pci_dev, skb->data, len, PCI_DMA_TODEVICE);
+       mapping = dma_map_single(&tp->pci_dev->dev, skb->data, len,
+                                PCI_DMA_TODEVICE);
  
         tp->tx_skb[entry].len = len;
         txd->addr = cpu_to_le64(mapping);
@@ -4477,8 +4482,8 @@ static inline bool rtl8169_try_rx_copy(struct sk_buff **sk_buff,
         if (!skb)
                 goto out;
  
-       pci_dma_sync_single_for_cpu(tp->pci_dev, addr, pkt_size,
-                                   PCI_DMA_FROMDEVICE);
+       dma_sync_single_for_cpu(&tp->pci_dev->dev, addr, pkt_size,
+                               PCI_DMA_FROMDEVICE);
         skb_copy_from_linear_data(*sk_buff, skb->data, pkt_size);
         *sk_buff = skb;
         done = true;
@@ -4549,11 +4554,11 @@ static int rtl8169_rx_interrupt(struct net_device *dev,
                         rtl8169_rx_csum(skb, desc);
  
                         if (rtl8169_try_rx_copy(&skb, tp, pkt_size, addr)) {
-                               pci_dma_sync_single_for_device(pdev, addr,
+                               dma_sync_single_for_device(&pdev->dev, addr,
                                         pkt_size, PCI_DMA_FROMDEVICE);
                                 rtl8169_mark_to_asic(desc, tp->rx_buf_sz);
                         } else {
-                               pci_unmap_single(pdev, addr, tp->rx_buf_sz,
+                               dma_unmap_single(&pdev->dev, addr, tp->rx_buf_sz,
                                                  PCI_DMA_FROMDEVICE);
                                 tp->Rx_skbuff[entry] = NULL;
                         }
@@ -4583,7 +4588,7 @@ static int rtl8169_rx_interrupt(struct net_device *dev,
         count = cur_rx - tp->cur_rx;
         tp->cur_rx = cur_rx;
  
-       delta = rtl8169_rx_fill(tp, dev, tp->dirty_rx, tp->cur_rx);
+       delta = rtl8169_rx_fill(tp, dev, tp->dirty_rx, tp->cur_rx, GFP_ATOMIC);
         if (!delta && count)
                 netif_info(tp, intr, dev, "no Rx buffer allocated\n");
         tp->dirty_rx += delta;
@@ -4769,10 +4774,10 @@ static int rtl8169_close(struct net_device *dev)
  
         free_irq(dev->irq, dev);
  
-       pci_free_consistent(pdev, R8169_RX_RING_BYTES, tp->RxDescArray,
-                           tp->RxPhyAddr);
-       pci_free_consistent(pdev, R8169_TX_RING_BYTES, tp->TxDescArray,
-                           tp->TxPhyAddr);
+       dma_free_coherent(&pdev->dev, R8169_RX_RING_BYTES, tp->RxDescArray,
+                         tp->RxPhyAddr);
+       dma_free_coherent(&pdev->dev, R8169_TX_RING_BYTES, tp->TxDescArray,
+                         tp->TxPhyAddr);
         tp->TxDescArray = NULL;
         tp->RxDescArray = NULL;
  
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c

index bc3af78a869ff52881077b89cf6e5e544b6e8a91..1ec4b9e0239a8ff8c0d3cb18882ff517ef528881 100644 (file)
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -4666,7 +4666,7 @@ static int tg3_rx(struct tg3_napi *tnapi, int budget)
                                        desc_idx, *post_ptr);
                 drop_it_no_recycle:
                         /* Other statistics kept track of by card. */
-                       tp->net_stats.rx_dropped++;
+                       tp->rx_dropped++;
                         goto next_pkt;
                 }
  
@@ -4726,7 +4726,7 @@ static int tg3_rx(struct tg3_napi *tnapi, int budget)
                 if (len > (tp->dev->mtu + ETH_HLEN) &&
                     skb->protocol != htons(ETH_P_8021Q)) {
                         dev_kfree_skb(skb);
-                       goto next_pkt;
+                       goto drop_it_no_recycle;
                 }
  
                 if (desc->type_flags & RXD_FLAG_VLAN &&
@@ -9240,6 +9240,8 @@ static struct rtnl_link_stats64 *tg3_get_stats64(struct net_device *dev,
         stats->rx_missed_errors = old_stats->rx_missed_errors +
                 get_stat64(&hw_stats->rx_discards);
  
+       stats->rx_dropped = tp->rx_dropped;
+
         return stats;
  }
  
diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h

index 4937bd19096413bae1115b82cc63ce5123207536..be7ff138a7f98d58a4f4d4c8ff7fe70bb83911d9 100644 (file)
--- a/drivers/net/tg3.h
+++ b/drivers/net/tg3.h
@@ -2759,7 +2759,7 @@ struct tg3 {
  
  
         /* begin "everything else" cacheline(s) section */
-       struct rtnl_link_stats64        net_stats;
+       unsigned long                   rx_dropped;
         struct rtnl_link_stats64        net_stats_prev;
         struct tg3_ethtool_stats        estats;
         struct tg3_ethtool_stats        estats_prev;
diff --git a/drivers/net/wan/cosa.c b/drivers/net/wan/cosa.c

index 04c6cd4333f1ec83d69be506b040b75183cdfb81..10bafd59f9c336e36fbec4c6703530ce54a92a36 100644 (file)
--- a/drivers/net/wan/cosa.c
+++ b/drivers/net/wan/cosa.c
@@ -575,7 +575,7 @@ static int cosa_probe(int base, int irq, int dma)
  
                 /* Initialize the chardev data structures */
                 mutex_init(&chan->rlock);
-               init_MUTEX(&chan->wsem);
+               sema_init(&chan->wsem, 1);
  
                 /* Register the network interface */
                 if (!(chan->netdev = alloc_hdlcdev(chan))) {
diff --git a/drivers/net/wimax/i2400m/rx.c b/drivers/net/wimax/i2400m/rx.c

index 8cc9e319f4356da8904cbf4e7a495d59eba94645..1737d1488b35704f3196975a76bcc919e943808b 100644 (file)
--- a/drivers/net/wimax/i2400m/rx.c
+++ b/drivers/net/wimax/i2400m/rx.c
@@ -1244,16 +1244,16 @@ int i2400m_rx(struct i2400m *i2400m, struct sk_buff *skb)
         int i, result;
         struct device *dev = i2400m_dev(i2400m);
         const struct i2400m_msg_hdr *msg_hdr;
-       size_t pl_itr, pl_size, skb_len;
+       size_t pl_itr, pl_size;
         unsigned long flags;
-       unsigned num_pls, single_last;
+       unsigned num_pls, single_last, skb_len;
  
         skb_len = skb->len;
-       d_fnstart(4, dev, "(i2400m %p skb %p [size %zu])\n",
+       d_fnstart(4, dev, "(i2400m %p skb %p [size %u])\n",
                   i2400m, skb, skb_len);
         result = -EIO;
         msg_hdr = (void *) skb->data;
-       result = i2400m_rx_msg_hdr_check(i2400m, msg_hdr, skb->len);
+       result = i2400m_rx_msg_hdr_check(i2400m, msg_hdr, skb_len);
         if (result < 0)
                 goto error_msg_hdr_check;
         result = -EIO;
@@ -1261,10 +1261,10 @@ int i2400m_rx(struct i2400m *i2400m, struct sk_buff *skb)
         pl_itr = sizeof(*msg_hdr) +     /* Check payload descriptor(s) */
                 num_pls * sizeof(msg_hdr->pld[0]);
         pl_itr = ALIGN(pl_itr, I2400M_PL_ALIGN);
-       if (pl_itr > skb->len) {        /* got all the payload descriptors? */
+       if (pl_itr > skb_len) { /* got all the payload descriptors? */
                 dev_err(dev, "RX: HW BUG? message too short (%u bytes) for "
                         "%u payload descriptors (%zu each, total %zu)\n",
-                       skb->len, num_pls, sizeof(msg_hdr->pld[0]), pl_itr);
+                       skb_len, num_pls, sizeof(msg_hdr->pld[0]), pl_itr);
                 goto error_pl_descr_short;
         }
         /* Walk each payload payload--check we really got it */
@@ -1272,7 +1272,7 @@ int i2400m_rx(struct i2400m *i2400m, struct sk_buff *skb)
                 /* work around old gcc warnings */
                 pl_size = i2400m_pld_size(&msg_hdr->pld[i]);
                 result = i2400m_rx_pl_descr_check(i2400m, &msg_hdr->pld[i],
-                                                 pl_itr, skb->len);
+                                                 pl_itr, skb_len);
                 if (result < 0)
                         goto error_pl_descr_check;
                 single_last = num_pls == 1 || i == num_pls - 1;
@@ -1290,16 +1290,16 @@ int i2400m_rx(struct i2400m *i2400m, struct sk_buff *skb)
         if (i < i2400m->rx_pl_min)
                 i2400m->rx_pl_min = i;
         i2400m->rx_num++;
-       i2400m->rx_size_acc += skb->len;
-       if (skb->len < i2400m->rx_size_min)
-               i2400m->rx_size_min = skb->len;
-       if (skb->len > i2400m->rx_size_max)
-               i2400m->rx_size_max = skb->len;
+       i2400m->rx_size_acc += skb_len;
+       if (skb_len < i2400m->rx_size_min)
+               i2400m->rx_size_min = skb_len;
+       if (skb_len > i2400m->rx_size_max)
+               i2400m->rx_size_max = skb_len;
         spin_unlock_irqrestore(&i2400m->rx_lock, flags);
  error_pl_descr_check:
  error_pl_descr_short:
  error_msg_hdr_check:
-       d_fnend(4, dev, "(i2400m %p skb %p [size %zu]) = %d\n",
+       d_fnend(4, dev, "(i2400m %p skb %p [size %u]) = %d\n",
                 i2400m, skb, skb_len, result);
         return result;
  }
diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c

index b336cd9ee7a114c54d6b7b85d48cacc6f3ed1a6d..f9bda64fcd1b62771880d5d823b53f797cb9c7f0 100644 (file)
--- a/drivers/oprofile/oprof.c
+++ b/drivers/oprofile/oprof.c
@@ -225,26 +225,17 @@ post_sync:
         mutex_unlock(&start_mutex);
  }
  
-int oprofile_set_backtrace(unsigned long val)
+int oprofile_set_ulong(unsigned long *addr, unsigned long val)
  {
-       int err = 0;
+       int err = -EBUSY;
  
         mutex_lock(&start_mutex);
-
-       if (oprofile_started) {
-               err = -EBUSY;
-               goto out;
-       }
-
-       if (!oprofile_ops.backtrace) {
-               err = -EINVAL;
-               goto out;
+       if (!oprofile_started) {
+               *addr = val;
+               err = 0;
         }
-
-       oprofile_backtrace_depth = val;
-
-out:
         mutex_unlock(&start_mutex);
+
         return err;
  }
  
@@ -257,16 +248,9 @@ static int __init oprofile_init(void)
                 printk(KERN_INFO "oprofile: using timer interrupt.\n");
                 err = oprofile_timer_init(&oprofile_ops);
                 if (err)
-                       goto out_arch;
+                       return err;
         }
-       err = oprofilefs_register();
-       if (err)
-               goto out_arch;
-       return 0;
-
-out_arch:
-       oprofile_arch_exit();
-       return err;
+       return oprofilefs_register();
  }
  
  
diff --git a/drivers/oprofile/oprof.h b/drivers/oprofile/oprof.h

index 47e12cb4ee8ba7464e4b0c7ef955be71b1caa201..177b73de5e5f158cbb31fe27f8e31b9363c3a255 100644 (file)
--- a/drivers/oprofile/oprof.h
+++ b/drivers/oprofile/oprof.h
@@ -37,7 +37,7 @@ void oprofile_create_files(struct super_block *sb, struct dentry *root);
  int oprofile_timer_init(struct oprofile_operations *ops);
  void oprofile_timer_exit(void);
  
-int oprofile_set_backtrace(unsigned long depth);
+int oprofile_set_ulong(unsigned long *addr, unsigned long val);
  int oprofile_set_timeout(unsigned long time);
  
  #endif /* OPROF_H */
diff --git a/drivers/oprofile/oprofile_files.c b/drivers/oprofile/oprofile_files.c

index bbd7516e0869461c141659004580d509ec5e97a5..ccf099e684a46e90e53bcb17ad72536cdc42a4de 100644 (file)
--- a/drivers/oprofile/oprofile_files.c
+++ b/drivers/oprofile/oprofile_files.c
@@ -79,14 +79,17 @@ static ssize_t depth_write(struct file *file, char const __user *buf, size_t cou
         if (*offset)
                 return -EINVAL;
  
+       if (!oprofile_ops.backtrace)
+               return -EINVAL;
+
         retval = oprofilefs_ulong_from_user(&val, buf, count);
         if (retval)
                 return retval;
  
-       retval = oprofile_set_backtrace(val);
-
+       retval = oprofile_set_ulong(&oprofile_backtrace_depth, val);
         if (retval)
                 return retval;
+
         return count;
  }
  
diff --git a/drivers/oprofile/oprofile_perf.c b/drivers/oprofile/oprofile_perf.c

new file mode 100644 (file)

index 0000000..9046f7b
--- /dev/null
+++ b/drivers/oprofile/oprofile_perf.c
@@ -0,0 +1,328 @@
+/*
+ * Copyright 2010 ARM Ltd.
+ *
+ * Perf-events backend for OProfile.
+ */
+#include <linux/perf_event.h>
+#include <linux/platform_device.h>
+#include <linux/oprofile.h>
+#include <linux/slab.h>
+
+/*
+ * Per performance monitor configuration as set via oprofilefs.
+ */
+struct op_counter_config {
+       unsigned long count;
+       unsigned long enabled;
+       unsigned long event;
+       unsigned long unit_mask;
+       unsigned long kernel;
+       unsigned long user;
+       struct perf_event_attr attr;
+};
+
+static int oprofile_perf_enabled;
+static DEFINE_MUTEX(oprofile_perf_mutex);
+
+static struct op_counter_config *counter_config;
+static struct perf_event **perf_events[nr_cpumask_bits];
+static int num_counters;
+
+/*
+ * Overflow callback for oprofile.
+ */
+static void op_overflow_handler(struct perf_event *event, int unused,
+                       struct perf_sample_data *data, struct pt_regs *regs)
+{
+       int id;
+       u32 cpu = smp_processor_id();
+
+       for (id = 0; id < num_counters; ++id)
+               if (perf_events[cpu][id] == event)
+                       break;
+
+       if (id != num_counters)
+               oprofile_add_sample(regs, id);
+       else
+               pr_warning("oprofile: ignoring spurious overflow "
+                               "on cpu %u\n", cpu);
+}
+
+/*
+ * Called by oprofile_perf_setup to create perf attributes to mirror the oprofile
+ * settings in counter_config. Attributes are created as `pinned' events and
+ * so are permanently scheduled on the PMU.
+ */
+static void op_perf_setup(void)
+{
+       int i;
+       u32 size = sizeof(struct perf_event_attr);
+       struct perf_event_attr *attr;
+
+       for (i = 0; i < num_counters; ++i) {
+               attr = &counter_config[i].attr;
+               memset(attr, 0, size);
+               attr->type              = PERF_TYPE_RAW;
+               attr->size              = size;
+               attr->config            = counter_config[i].event;
+               attr->sample_period     = counter_config[i].count;
+               attr->pinned            = 1;
+       }
+}
+
+static int op_create_counter(int cpu, int event)
+{
+       struct perf_event *pevent;
+
+       if (!counter_config[event].enabled || perf_events[cpu][event])
+               return 0;
+
+       pevent = perf_event_create_kernel_counter(&counter_config[event].attr,
+                                                 cpu, NULL,
+                                                 op_overflow_handler);
+
+       if (IS_ERR(pevent))
+               return PTR_ERR(pevent);
+
+       if (pevent->state != PERF_EVENT_STATE_ACTIVE) {
+               perf_event_release_kernel(pevent);
+               pr_warning("oprofile: failed to enable event %d "
+                               "on CPU %d\n", event, cpu);
+               return -EBUSY;
+       }
+
+       perf_events[cpu][event] = pevent;
+
+       return 0;
+}
+
+static void op_destroy_counter(int cpu, int event)
+{
+       struct perf_event *pevent = perf_events[cpu][event];
+
+       if (pevent) {
+               perf_event_release_kernel(pevent);
+               perf_events[cpu][event] = NULL;
+       }
+}
+
+/*
+ * Called by oprofile_perf_start to create active perf events based on the
+ * perviously configured attributes.
+ */
+static int op_perf_start(void)
+{
+       int cpu, event, ret = 0;
+
+       for_each_online_cpu(cpu) {
+               for (event = 0; event < num_counters; ++event) {
+                       ret = op_create_counter(cpu, event);
+                       if (ret)
+                               return ret;
+               }
+       }
+
+       return ret;
+}
+
+/*
+ * Called by oprofile_perf_stop at the end of a profiling run.
+ */
+static void op_perf_stop(void)
+{
+       int cpu, event;
+
+       for_each_online_cpu(cpu)
+               for (event = 0; event < num_counters; ++event)
+                       op_destroy_counter(cpu, event);
+}
+
+static int oprofile_perf_create_files(struct super_block *sb, struct dentry *root)
+{
+       unsigned int i;
+
+       for (i = 0; i < num_counters; i++) {
+               struct dentry *dir;
+               char buf[4];
+
+               snprintf(buf, sizeof buf, "%d", i);
+               dir = oprofilefs_mkdir(sb, root, buf);
+               oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled);
+               oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event);
+               oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count);
+               oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask);
+               oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel);
+               oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user);
+       }
+
+       return 0;
+}
+
+static int oprofile_perf_setup(void)
+{
+       spin_lock(&oprofilefs_lock);
+       op_perf_setup();
+       spin_unlock(&oprofilefs_lock);
+       return 0;
+}
+
+static int oprofile_perf_start(void)
+{
+       int ret = -EBUSY;
+
+       mutex_lock(&oprofile_perf_mutex);
+       if (!oprofile_perf_enabled) {
+               ret = 0;
+               op_perf_start();
+               oprofile_perf_enabled = 1;
+       }
+       mutex_unlock(&oprofile_perf_mutex);
+       return ret;
+}
+
+static void oprofile_perf_stop(void)
+{
+       mutex_lock(&oprofile_perf_mutex);
+       if (oprofile_perf_enabled)
+               op_perf_stop();
+       oprofile_perf_enabled = 0;
+       mutex_unlock(&oprofile_perf_mutex);
+}
+
+#ifdef CONFIG_PM
+
+static int oprofile_perf_suspend(struct platform_device *dev, pm_message_t state)
+{
+       mutex_lock(&oprofile_perf_mutex);
+       if (oprofile_perf_enabled)
+               op_perf_stop();
+       mutex_unlock(&oprofile_perf_mutex);
+       return 0;
+}
+
+static int oprofile_perf_resume(struct platform_device *dev)
+{
+       mutex_lock(&oprofile_perf_mutex);
+       if (oprofile_perf_enabled && op_perf_start())
+               oprofile_perf_enabled = 0;
+       mutex_unlock(&oprofile_perf_mutex);
+       return 0;
+}
+
+static struct platform_driver oprofile_driver = {
+       .driver         = {
+               .name           = "oprofile-perf",
+       },
+       .resume         = oprofile_perf_resume,
+       .suspend        = oprofile_perf_suspend,
+};
+
+static struct platform_device *oprofile_pdev;
+
+static int __init init_driverfs(void)
+{
+       int ret;
+
+       ret = platform_driver_register(&oprofile_driver);
+       if (ret)
+               return ret;
+
+       oprofile_pdev = platform_device_register_simple(
+                               oprofile_driver.driver.name, 0, NULL, 0);
+       if (IS_ERR(oprofile_pdev)) {
+               ret = PTR_ERR(oprofile_pdev);
+               platform_driver_unregister(&oprofile_driver);
+       }
+
+       return ret;
+}
+
+static void exit_driverfs(void)
+{
+       platform_device_unregister(oprofile_pdev);
+       platform_driver_unregister(&oprofile_driver);
+}
+
+#else
+
+static inline int  init_driverfs(void) { return 0; }
+static inline void exit_driverfs(void) { }
+
+#endif /* CONFIG_PM */
+
+void oprofile_perf_exit(void)
+{
+       int cpu, id;
+       struct perf_event *event;
+
+       for_each_possible_cpu(cpu) {
+               for (id = 0; id < num_counters; ++id) {
+                       event = perf_events[cpu][id];
+                       if (event)
+                               perf_event_release_kernel(event);
+               }
+
+               kfree(perf_events[cpu]);
+       }
+
+       kfree(counter_config);
+       exit_driverfs();
+}
+
+int __init oprofile_perf_init(struct oprofile_operations *ops)
+{
+       int cpu, ret = 0;
+
+       ret = init_driverfs();
+       if (ret)
+               return ret;
+
+       memset(&perf_events, 0, sizeof(perf_events));
+
+       num_counters = perf_num_counters();
+       if (num_counters <= 0) {
+               pr_info("oprofile: no performance counters\n");
+               ret = -ENODEV;
+               goto out;
+       }
+
+       counter_config = kcalloc(num_counters,
+                       sizeof(struct op_counter_config), GFP_KERNEL);
+
+       if (!counter_config) {
+               pr_info("oprofile: failed to allocate %d "
+                               "counters\n", num_counters);
+               ret = -ENOMEM;
+               num_counters = 0;
+               goto out;
+       }
+
+       for_each_possible_cpu(cpu) {
+               perf_events[cpu] = kcalloc(num_counters,
+                               sizeof(struct perf_event *), GFP_KERNEL);
+               if (!perf_events[cpu]) {
+                       pr_info("oprofile: failed to allocate %d perf events "
+                                       "for cpu %d\n", num_counters, cpu);
+                       ret = -ENOMEM;
+                       goto out;
+               }
+       }
+
+       ops->create_files       = oprofile_perf_create_files;
+       ops->setup              = oprofile_perf_setup;
+       ops->start              = oprofile_perf_start;
+       ops->stop               = oprofile_perf_stop;
+       ops->shutdown           = oprofile_perf_stop;
+       ops->cpu_type           = op_name_from_perf_id();
+
+       if (!ops->cpu_type)
+               ret = -ENODEV;
+       else
+               pr_info("oprofile: using %s\n", ops->cpu_type);
+
+out:
+       if (ret)
+               oprofile_perf_exit();
+
+       return ret;
+}
diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c

index 2766a6d3c2e9c8fe2f7ab2a2d1f3457275693f80..1944621930d96fc51a8f0497037b4dac74e0329b 100644 (file)
--- a/drivers/oprofile/oprofilefs.c
+++ b/drivers/oprofile/oprofilefs.c
@@ -91,16 +91,20 @@ static ssize_t ulong_read_file(struct file *file, char __user *buf, size_t count
  
  static ssize_t ulong_write_file(struct file *file, char const __user *buf, size_t count, loff_t *offset)
  {
-       unsigned long *value = file->private_data;
+       unsigned long value;
         int retval;
  
         if (*offset)
                 return -EINVAL;
  
-       retval = oprofilefs_ulong_from_user(value, buf, count);
+       retval = oprofilefs_ulong_from_user(&value, buf, count);
+       if (retval)
+               return retval;
  
+       retval = oprofile_set_ulong(file->private_data, value);
         if (retval)
                 return retval;
+
         return count;
  }
  
@@ -126,50 +130,41 @@ static const struct file_operations ulong_ro_fops = {
  };
  
  
-static struct dentry *__oprofilefs_create_file(struct super_block *sb,
+static int __oprofilefs_create_file(struct super_block *sb,
         struct dentry *root, char const *name, const struct file_operations *fops,
-       int perm)
+       int perm, void *priv)
  {
         struct dentry *dentry;
         struct inode *inode;
  
         dentry = d_alloc_name(root, name);
         if (!dentry)
-               return NULL;
+               return -ENOMEM;
         inode = oprofilefs_get_inode(sb, S_IFREG | perm);
         if (!inode) {
                 dput(dentry);
-               return NULL;
+               return -ENOMEM;
         }
         inode->i_fop = fops;
         d_add(dentry, inode);
-       return dentry;
+       dentry->d_inode->i_private = priv;
+       return 0;
  }
  
  
  int oprofilefs_create_ulong(struct super_block *sb, struct dentry *root,
         char const *name, unsigned long *val)
  {
-       struct dentry *d = __oprofilefs_create_file(sb, root, name,
-                                                    &ulong_fops, 0644);
-       if (!d)
-               return -EFAULT;
-
-       d->d_inode->i_private = val;
-       return 0;
+       return __oprofilefs_create_file(sb, root, name,
+                                       &ulong_fops, 0644, val);
  }
  
  
  int oprofilefs_create_ro_ulong(struct super_block *sb, struct dentry *root,
         char const *name, unsigned long *val)
  {
-       struct dentry *d = __oprofilefs_create_file(sb, root, name,
-                                                    &ulong_ro_fops, 0444);
-       if (!d)
-               return -EFAULT;
-
-       d->d_inode->i_private = val;
-       return 0;
+       return __oprofilefs_create_file(sb, root, name,
+                                       &ulong_ro_fops, 0444, val);
  }
  
  
@@ -189,31 +184,22 @@ static const struct file_operations atomic_ro_fops = {
  int oprofilefs_create_ro_atomic(struct super_block *sb, struct dentry *root,
         char const *name, atomic_t *val)
  {
-       struct dentry *d = __oprofilefs_create_file(sb, root, name,
-                                                    &atomic_ro_fops, 0444);
-       if (!d)
-               return -EFAULT;
-
-       d->d_inode->i_private = val;
-       return 0;
+       return __oprofilefs_create_file(sb, root, name,
+                                       &atomic_ro_fops, 0444, val);
  }
  
  
  int oprofilefs_create_file(struct super_block *sb, struct dentry *root,
         char const *name, const struct file_operations *fops)
  {
-       if (!__oprofilefs_create_file(sb, root, name, fops, 0644))
-               return -EFAULT;
-       return 0;
+       return __oprofilefs_create_file(sb, root, name, fops, 0644, NULL);
  }
  
  
  int oprofilefs_create_file_perm(struct super_block *sb, struct dentry *root,
         char const *name, const struct file_operations *fops, int perm)
  {
-       if (!__oprofilefs_create_file(sb, root, name, fops, perm))
-               return -EFAULT;
-       return 0;
+       return __oprofilefs_create_file(sb, root, name, fops, perm, NULL);
  }
  
  
diff --git a/drivers/parport/share.c b/drivers/parport/share.c

index dffa5d4fb2986f89b6ab26882bd9de63df81e2e1..a2d9d1e59260eab17097e0a12f16908eed26c552 100644 (file)
--- a/drivers/parport/share.c
+++ b/drivers/parport/share.c
@@ -306,7 +306,7 @@ struct parport *parport_register_port(unsigned long base, int irq, int dma,
         spin_lock_init(&tmp->pardevice_lock);
         tmp->ieee1284.mode = IEEE1284_MODE_COMPAT;
         tmp->ieee1284.phase = IEEE1284_PH_FWD_IDLE;
-       init_MUTEX_LOCKED (&tmp->ieee1284.irq); /* actually a semaphore at 0 */
+       sema_init(&tmp->ieee1284.irq, 0);
         tmp->spintime = parport_default_spintime;
         atomic_set (&tmp->ref_count, 1);
         INIT_LIST_HEAD(&tmp->full_list);
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c

index ad0ed212db4ad094441f7a5656639e989c980738..348fba0a8976467fa9724411699b918a9cd8b0ea 100644 (file)
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -1046,13 +1046,13 @@ int scsi_get_vpd_page(struct scsi_device *sdev, u8 page, unsigned char *buf,
  
         /* If the user actually wanted this page, we can skip the rest */
         if (page == 0)
-               return -EINVAL;
+               return 0;
  
         for (i = 0; i < min((int)buf[3], buf_len - 4); i++)
                 if (buf[i + 4] == page)
                         goto found;
  
-       if (i < buf[3] && i > buf_len)
+       if (i < buf[3] && i >= buf_len - 4)
                 /* ran off the end of the buffer, give us benefit of doubt */
                 goto found;
         /* The device claims it doesn't support the requested page */
diff --git a/drivers/serial/ioc3_serial.c b/drivers/serial/ioc3_serial.c

index 93de907b12088a54ab5809cf79c46ba5ec2757f2..800c54602339780cc79d615e16fb89f49bf3d5f5 100644 (file)
--- a/drivers/serial/ioc3_serial.c
+++ b/drivers/serial/ioc3_serial.c
@@ -2044,6 +2044,7 @@ ioc3uart_probe(struct ioc3_submodule *is, struct ioc3_driver_data *idd)
                 if (!port) {
                         printk(KERN_WARNING
                                "IOC3 serial memory not available for port\n");
+                       ret = -ENOMEM;
                         goto out4;
                 }
                 spin_lock_init(&port->ip_lock);
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c

index 7c8008225ee3a47ebef2cecaf005c0631fa93da1..17927b1f9334ead1e854ed539dea352f8718c589 100644 (file)
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -127,7 +127,10 @@ static void handle_tx(struct vhost_net *net)
         size_t len, total_len = 0;
         int err, wmem;
         size_t hdr_size;
-       struct socket *sock = rcu_dereference(vq->private_data);
+       struct socket *sock;
+
+       sock = rcu_dereference_check(vq->private_data,
+                                    lockdep_is_held(&vq->mutex));
         if (!sock)
                 return;
  
@@ -582,7 +585,10 @@ static void vhost_net_disable_vq(struct vhost_net *n,
  static void vhost_net_enable_vq(struct vhost_net *n,
                                 struct vhost_virtqueue *vq)
  {
-       struct socket *sock = vq->private_data;
+       struct socket *sock;
+
+       sock = rcu_dereference_protected(vq->private_data,
+                                        lockdep_is_held(&vq->mutex));
         if (!sock)
                 return;
         if (vq == n->vqs + VHOST_NET_VQ_TX) {
@@ -598,7 +604,8 @@ static struct socket *vhost_net_stop_vq(struct vhost_net *n,
         struct socket *sock;
  
         mutex_lock(&vq->mutex);
-       sock = vq->private_data;
+       sock = rcu_dereference_protected(vq->private_data,
+                                        lockdep_is_held(&vq->mutex));
         vhost_net_disable_vq(n, vq);
         rcu_assign_pointer(vq->private_data, NULL);
         mutex_unlock(&vq->mutex);
@@ -736,7 +743,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
         }
  
         /* start polling new socket */
-       oldsock = vq->private_data;
+       oldsock = rcu_dereference_protected(vq->private_data,
+                                           lockdep_is_held(&vq->mutex));
         if (sock != oldsock) {
                  vhost_net_disable_vq(n, vq);
                  rcu_assign_pointer(vq->private_data, sock);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c

index dd3d6f7406f80092cdd8843acdd06c313e9cf164..8b5a1b33d0fed906ef6d0873027a3275858e8c2c 100644 (file)
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -320,7 +320,7 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
         vhost_dev_cleanup(dev);
  
         memory->nregions = 0;
-       dev->memory = memory;
+       RCU_INIT_POINTER(dev->memory, memory);
         return 0;
  }
  
@@ -352,8 +352,9 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
                 fput(dev->log_file);
         dev->log_file = NULL;
         /* No one will access memory at this point */
-       kfree(dev->memory);
-       dev->memory = NULL;
+       kfree(rcu_dereference_protected(dev->memory,
+                                       lockdep_is_held(&dev->mutex)));
+       RCU_INIT_POINTER(dev->memory, NULL);
         if (dev->mm)
                 mmput(dev->mm);
         dev->mm = NULL;
@@ -440,14 +441,22 @@ static int vq_access_ok(unsigned int num,
  /* Caller should have device mutex but not vq mutex */
  int vhost_log_access_ok(struct vhost_dev *dev)
  {
-       return memory_access_ok(dev, dev->memory, 1);
+       struct vhost_memory *mp;
+
+       mp = rcu_dereference_protected(dev->memory,
+                                      lockdep_is_held(&dev->mutex));
+       return memory_access_ok(dev, mp, 1);
  }
  
  /* Verify access for write logging. */
  /* Caller should have vq mutex and device mutex */
  static int vq_log_access_ok(struct vhost_virtqueue *vq, void __user *log_base)
  {
-       return vq_memory_access_ok(log_base, vq->dev->memory,
+       struct vhost_memory *mp;
+
+       mp = rcu_dereference_protected(vq->dev->memory,
+                                      lockdep_is_held(&vq->mutex));
+       return vq_memory_access_ok(log_base, mp,
                             vhost_has_feature(vq->dev, VHOST_F_LOG_ALL)) &&
                 (!vq->log_used || log_access_ok(log_base, vq->log_addr,
                                         sizeof *vq->used +
@@ -487,7 +496,8 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
                 kfree(newmem);
                 return -EFAULT;
         }
-       oldmem = d->memory;
+       oldmem = rcu_dereference_protected(d->memory,
+                                          lockdep_is_held(&d->mutex));
         rcu_assign_pointer(d->memory, newmem);
         synchronize_rcu();
         kfree(oldmem);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h

index afd77295971ce3044117d0d6e5ea8f4e20f655fe..af3c11ded5fd4910ed0dccea161a801298036731 100644 (file)
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -106,7 +106,7 @@ struct vhost_virtqueue {
          * vhost_work execution acts instead of rcu_read_lock() and the end of
          * vhost_work execution acts instead of rcu_read_lock().
          * Writers use virtqueue mutex. */
-       void *private_data;
+       void __rcu *private_data;
         /* Log write descriptors */
         void __user *log_base;
         struct vhost_log log[VHOST_NET_MAX_SG];
@@ -116,7 +116,7 @@ struct vhost_dev {
         /* Readers use RCU to access memory table pointer
          * log base pointer and features.
          * Writers use mutex below.*/
-       struct vhost_memory *memory;
+       struct vhost_memory __rcu *memory;
         struct mm_struct *mm;
         struct mutex mutex;
         unsigned acked_features;
@@ -173,7 +173,11 @@ enum {
  
  static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
  {
-       unsigned acked_features = rcu_dereference(dev->acked_features);
+       unsigned acked_features;
+
+       acked_features =
+               rcu_dereference_index_check(dev->acked_features,
+                                           lockdep_is_held(&dev->mutex));
         return acked_features & (1 << bit);
  }
  
diff --git a/fs/affs/super.c b/fs/affs/super.c

index 33c4e7eef470e995246562b774fd88a82ca7f12b..9581ea94d5a146e95ba9533d1a7ac3aba2b83804 100644 (file)
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -109,8 +109,8 @@ static void init_once(void *foo)
  {
         struct affs_inode_info *ei = (struct affs_inode_info *) foo;
  
-       init_MUTEX(&ei->i_link_lock);
-       init_MUTEX(&ei->i_ext_lock);
+       sema_init(&ei->i_link_lock, 1);
+       sema_init(&ei->i_ext_lock, 1);
         inode_init_once(&ei->vfs_inode);
  }
  
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c

index f96eff04e11ab4a8b23f7489ee4b0de50e67e152..a6395bdb26aeb13b7b98c74df4f77780c1c95412 100644 (file)
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -134,10 +134,6 @@ static int aout_core_dump(struct coredump_params *cprm)
                 if (!dump_write(file, dump_start, dump_size))
                         goto end_coredump;
         }
-/* Finally dump the task struct.  Not be used by gdb, but could be useful */
-       set_fs(KERNEL_DS);
-       if (!dump_write(file, current, sizeof(*current)))
-               goto end_coredump;
  end_coredump:
         set_fs(fs);
         return has_dumped;
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig

index 0fcd2640c23fdda2c7fd99415b730837c591eba8..9eb134ea6eb223a45be745c77f9530b80fcdc082 100644 (file)
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -1,9 +1,11 @@
  config CEPH_FS
          tristate "Ceph distributed file system (EXPERIMENTAL)"
         depends on INET && EXPERIMENTAL
+       select CEPH_LIB
         select LIBCRC32C
         select CRYPTO_AES
         select CRYPTO
+       default n
         help
           Choose Y or M here to include support for mounting the
           experimental Ceph distributed file system.  Ceph is an extremely
@@ -14,15 +16,3 @@ config CEPH_FS
  
           If unsure, say N.
  
-config CEPH_FS_PRETTYDEBUG
-       bool "Include file:line in ceph debug output"
-       depends on CEPH_FS
-       default n
-       help
-         If you say Y here, debug output will include a filename and
-         line to aid debugging.  This icnreases kernel size and slows
-         execution slightly when debug call sites are enabled (e.g.,
-         via CONFIG_DYNAMIC_DEBUG).
-
-         If unsure, say N.
-
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile

index 278e1172600dc3a3d5acba3654c53719d6f38697..9e6c4f2e8ff1f3e2712979d791da9e55fa780982 100644 (file)
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -8,15 +8,8 @@ obj-$(CONFIG_CEPH_FS) += ceph.o
  
  ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
         export.o caps.o snap.o xattr.o \
-       messenger.o msgpool.o buffer.o pagelist.o \
-       mds_client.o mdsmap.o \
-       mon_client.o \
-       osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
-       debugfs.o \
-       auth.o auth_none.o \
-       crypto.o armor.o \
-       auth_x.o \
-       ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
+       mds_client.o mdsmap.o strings.o ceph_frag.o \
+       debugfs.o
  
  else
  #Otherwise we were called directly from the command
diff --git a/fs/ceph/README b/fs/ceph/README

deleted file mode 100644 (file)

index 18352fa..0000000
--- a/fs/ceph/README
+++ /dev/null
@@ -1,20 +0,0 @@
-#
-# The following files are shared by (and manually synchronized
-# between) the Ceph userland and kernel client.
-#
-# userland                  kernel
-src/include/ceph_fs.h      fs/ceph/ceph_fs.h
-src/include/ceph_fs.cc     fs/ceph/ceph_fs.c
-src/include/msgr.h         fs/ceph/msgr.h
-src/include/rados.h        fs/ceph/rados.h
-src/include/ceph_strings.cc fs/ceph/ceph_strings.c
-src/include/ceph_frag.h            fs/ceph/ceph_frag.h
-src/include/ceph_frag.cc    fs/ceph/ceph_frag.c
-src/include/ceph_hash.h            fs/ceph/ceph_hash.h
-src/include/ceph_hash.cc    fs/ceph/ceph_hash.c
-src/crush/crush.c          fs/ceph/crush/crush.c
-src/crush/crush.h          fs/ceph/crush/crush.h
-src/crush/mapper.c         fs/ceph/crush/mapper.c
-src/crush/mapper.h         fs/ceph/crush/mapper.h
-src/crush/hash.h           fs/ceph/crush/hash.h
-src/crush/hash.c           fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c

index efbc604001c8bbfa3b96eb107529f0f03256b1a1..51bcc5ce323024a995d300b4a6035ecf8e72e94b 100644 (file)
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/backing-dev.h>
  #include <linux/fs.h>
@@ -10,7 +10,8 @@
  #include <linux/task_io_accounting_ops.h>
  
  #include "super.h"
-#include "osd_client.h"
+#include "mds_client.h"
+#include <linux/ceph/osd_client.h>
  
  /*
   * Ceph address space ops.
@@ -193,7 +194,8 @@ static int readpage_nounlock(struct file *filp, struct page *page)
  {
         struct inode *inode = filp->f_dentry->d_inode;
         struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+       struct ceph_osd_client *osdc = 
+               &ceph_inode_to_client(inode)->client->osdc;
         int err = 0;
         u64 len = PAGE_CACHE_SIZE;
  
@@ -265,7 +267,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
  {
         struct inode *inode = file->f_dentry->d_inode;
         struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+       struct ceph_osd_client *osdc =
+               &ceph_inode_to_client(inode)->client->osdc;
         int rc = 0;
         struct page **pages;
         loff_t offset;
@@ -365,7 +368,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
  {
         struct inode *inode;
         struct ceph_inode_info *ci;
-       struct ceph_client *client;
+       struct ceph_fs_client *fsc;
         struct ceph_osd_client *osdc;
         loff_t page_off = page->index << PAGE_CACHE_SHIFT;
         int len = PAGE_CACHE_SIZE;
@@ -383,8 +386,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
         }
         inode = page->mapping->host;
         ci = ceph_inode(inode);
-       client = ceph_inode_to_client(inode);
-       osdc = &client->osdc;
+       fsc = ceph_inode_to_client(inode);
+       osdc = &fsc->client->osdc;
  
         /* verify this is a writeable snap context */
         snapc = (void *)page->private;
@@ -414,10 +417,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
         dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
              inode, page, page->index, page_off, len, snapc);
  
-       writeback_stat = atomic_long_inc_return(&client->writeback_count);
+       writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
         if (writeback_stat >
-           CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
-               set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+           CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
+               set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
  
         set_page_writeback(page);
         err = ceph_osdc_writepages(osdc, ceph_vino(inode),
@@ -496,7 +499,7 @@ static void writepages_finish(struct ceph_osd_request *req,
         struct address_space *mapping = inode->i_mapping;
         __s32 rc = -EIO;
         u64 bytes = 0;
-       struct ceph_client *client = ceph_inode_to_client(inode);
+       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
         long writeback_stat;
         unsigned issued = ceph_caps_issued(ci);
  
@@ -529,10 +532,10 @@ static void writepages_finish(struct ceph_osd_request *req,
                 WARN_ON(!PageUptodate(page));
  
                 writeback_stat =
-                       atomic_long_dec_return(&client->writeback_count);
+                       atomic_long_dec_return(&fsc->writeback_count);
                 if (writeback_stat <
-                   CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
-                       clear_bdi_congested(&client->backing_dev_info,
+                   CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
+                       clear_bdi_congested(&fsc->backing_dev_info,
                                             BLK_RW_ASYNC);
  
                 ceph_put_snap_context((void *)page->private);
@@ -569,13 +572,13 @@ static void writepages_finish(struct ceph_osd_request *req,
   * mempool.  we avoid the mempool if we can because req->r_num_pages
   * may be less than the maximum write size.
   */
-static void alloc_page_vec(struct ceph_client *client,
+static void alloc_page_vec(struct ceph_fs_client *fsc,
                            struct ceph_osd_request *req)
  {
         req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
                                GFP_NOFS);
         if (!req->r_pages) {
-               req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
+               req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS);
                 req->r_pages_from_pool = 1;
                 WARN_ON(!req->r_pages);
         }
@@ -590,7 +593,7 @@ static int ceph_writepages_start(struct address_space *mapping,
         struct inode *inode = mapping->host;
         struct backing_dev_info *bdi = mapping->backing_dev_info;
         struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_client *client;
+       struct ceph_fs_client *fsc;
         pgoff_t index, start, end;
         int range_whole = 0;
         int should_loop = 1;
@@ -617,13 +620,13 @@ static int ceph_writepages_start(struct address_space *mapping,
              wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
              (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
  
-       client = ceph_inode_to_client(inode);
-       if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
+       fsc = ceph_inode_to_client(inode);
+       if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
                 pr_warning("writepage_start %p on forced umount\n", inode);
                 return -EIO; /* we're in a forced umount, don't write! */
         }
-       if (client->mount_args->wsize && client->mount_args->wsize < wsize)
-               wsize = client->mount_args->wsize;
+       if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
+               wsize = fsc->mount_options->wsize;
         if (wsize < PAGE_CACHE_SIZE)
                 wsize = PAGE_CACHE_SIZE;
         max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
@@ -769,7 +772,7 @@ get_more_pages:
                                 offset = (unsigned long long)page->index
                                         << PAGE_CACHE_SHIFT;
                                 len = wsize;
-                               req = ceph_osdc_new_request(&client->osdc,
+                               req = ceph_osdc_new_request(&fsc->client->osdc,
                                             &ci->i_layout,
                                             ceph_vino(inode),
                                             offset, &len,
@@ -782,7 +785,7 @@ get_more_pages:
                                             &inode->i_mtime, true, 1);
                                 max_pages = req->r_num_pages;
  
-                               alloc_page_vec(client, req);
+                               alloc_page_vec(fsc, req);
                                 req->r_callback = writepages_finish;
                                 req->r_inode = inode;
                         }
@@ -794,10 +797,10 @@ get_more_pages:
                              inode, page, page->index);
  
                         writeback_stat =
-                              atomic_long_inc_return(&client->writeback_count);
+                              atomic_long_inc_return(&fsc->writeback_count);
                         if (writeback_stat > CONGESTION_ON_THRESH(
-                                   client->mount_args->congestion_kb)) {
-                               set_bdi_congested(&client->backing_dev_info,
+                                   fsc->mount_options->congestion_kb)) {
+                               set_bdi_congested(&fsc->backing_dev_info,
                                                   BLK_RW_ASYNC);
                         }
  
@@ -846,7 +849,7 @@ get_more_pages:
                 op->payload_len = cpu_to_le32(len);
                 req->r_request->hdr.data_len = cpu_to_le32(len);
  
-               ceph_osdc_start_request(&client->osdc, req, true);
+               ceph_osdc_start_request(&fsc->client->osdc, req, true);
                 req = NULL;
  
                 /* continue? */
@@ -915,7 +918,7 @@ static int ceph_update_writeable_page(struct file *file,
  {
         struct inode *inode = file->f_dentry->d_inode;
         struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+       struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
         loff_t page_off = pos & PAGE_CACHE_MASK;
         int pos_in_page = pos & ~PAGE_CACHE_MASK;
         int end_in_page = pos_in_page + len;
@@ -1053,8 +1056,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
                           struct page *page, void *fsdata)
  {
         struct inode *inode = file->f_dentry->d_inode;
-       struct ceph_client *client = ceph_inode_to_client(inode);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         unsigned from = pos & (PAGE_CACHE_SIZE - 1);
         int check_cap = 0;
  
@@ -1123,7 +1126,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
  {
         struct inode *inode = vma->vm_file->f_dentry->d_inode;
         struct page *page = vmf->page;
-       struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+       struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
         loff_t off = page->index << PAGE_CACHE_SHIFT;
         loff_t size, len;
         int ret;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c

index 5e9da996a1519c42880872267d7609cc82eb2dc2..98ab13e2b71d9d8ebcbf1715da112d78cb0bac17 100644 (file)
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/fs.h>
  #include <linux/kernel.h>
@@ -9,8 +9,9 @@
  #include <linux/writeback.h>
  
  #include "super.h"
-#include "decode.h"
-#include "messenger.h"
+#include "mds_client.h"
+#include <linux/ceph/decode.h>
+#include <linux/ceph/messenger.h>
  
  /*
   * Capability management
@@ -287,11 +288,11 @@ void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
         spin_unlock(&mdsc->caps_list_lock);
  }
  
-void ceph_reservation_status(struct ceph_client *client,
+void ceph_reservation_status(struct ceph_fs_client *fsc,
                              int *total, int *avail, int *used, int *reserved,
                              int *min)
  {
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_mds_client *mdsc = fsc->mdsc;
  
         if (total)
                 *total = mdsc->caps_total_count;
@@ -399,7 +400,7 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
  static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
                                struct ceph_inode_info *ci)
  {
-       struct ceph_mount_args *ma = mdsc->client->mount_args;
+       struct ceph_mount_options *ma = mdsc->fsc->mount_options;
  
         ci->i_hold_caps_min = round_jiffies(jiffies +
                                             ma->caps_wanted_delay_min * HZ);
@@ -515,7 +516,7 @@ int ceph_add_cap(struct inode *inode,
                  unsigned seq, unsigned mseq, u64 realmino, int flags,
                  struct ceph_cap_reservation *caps_reservation)
  {
-       struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+       struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
         struct ceph_inode_info *ci = ceph_inode(inode);
         struct ceph_cap *new_cap = NULL;
         struct ceph_cap *cap;
@@ -873,7 +874,7 @@ void __ceph_remove_cap(struct ceph_cap *cap)
         struct ceph_mds_session *session = cap->session;
         struct ceph_inode_info *ci = cap->ci;
         struct ceph_mds_client *mdsc =
-               &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+               ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
         int removed = 0;
  
         dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -1210,7 +1211,7 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci,
         int mds;
         struct ceph_cap_snap *capsnap;
         u32 mseq;
-       struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+       struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
         struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
                                                     session->s_mutex */
         u64 next_follows = 0;  /* keep track of how far we've gotten through the
@@ -1336,7 +1337,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
  void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
  {
         struct ceph_mds_client *mdsc =
-               &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+               ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
         struct inode *inode = &ci->vfs_inode;
         int was = ci->i_dirty_caps;
         int dirty = 0;
@@ -1378,7 +1379,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
  static int __mark_caps_flushing(struct inode *inode,
                                  struct ceph_mds_session *session)
  {
-       struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
         struct ceph_inode_info *ci = ceph_inode(inode);
         int flushing;
  
@@ -1416,17 +1417,6 @@ static int __mark_caps_flushing(struct inode *inode,
  /*
   * try to invalidate mapping pages without blocking.
   */
-static int mapping_is_empty(struct address_space *mapping)
-{
-       struct page *page = find_get_page(mapping, 0);
-
-       if (!page)
-               return 1;
-
-       put_page(page);
-       return 0;
-}
-
  static int try_nonblocking_invalidate(struct inode *inode)
  {
         struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1436,7 +1426,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
         invalidate_mapping_pages(&inode->i_data, 0, -1);
         spin_lock(&inode->i_lock);
  
-       if (mapping_is_empty(&inode->i_data) &&
+       if (inode->i_data.nrpages == 0 &&
             invalidating_gen == ci->i_rdcache_gen) {
                 /* success. */
                 dout("try_nonblocking_invalidate %p success\n", inode);
@@ -1462,8 +1452,8 @@ static int try_nonblocking_invalidate(struct inode *inode)
  void ceph_check_caps(struct ceph_inode_info *ci, int flags,
                      struct ceph_mds_session *session)
  {
-       struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct inode *inode = &ci->vfs_inode;
         struct ceph_cap *cap;
         int file_wanted, used;
@@ -1533,7 +1523,7 @@ retry_locked:
          */
         if ((!is_delayed || mdsc->stopping) &&
             ci->i_wrbuffer_ref == 0 &&               /* no dirty pages... */
-           ci->i_rdcache_gen &&                     /* may have cached pages */
+           inode->i_data.nrpages &&                 /* have cached pages */
             (file_wanted == 0 ||                     /* no open files */
              (revoking & (CEPH_CAP_FILE_CACHE|
                           CEPH_CAP_FILE_LAZYIO))) && /*  or revoking cache */
@@ -1706,7 +1696,7 @@ ack:
  static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
                           unsigned *flush_tid)
  {
-       struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
         struct ceph_inode_info *ci = ceph_inode(inode);
         int unlock_session = session ? 0 : 1;
         int flushing = 0;
@@ -1872,7 +1862,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
                                        caps_are_flushed(inode, flush_tid));
         } else {
                 struct ceph_mds_client *mdsc =
-                       &ceph_sb_to_client(inode->i_sb)->mdsc;
+                       ceph_sb_to_client(inode->i_sb)->mdsc;
  
                 spin_lock(&inode->i_lock);
                 if (__ceph_caps_dirty(ci))
@@ -2465,7 +2455,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
         __releases(inode->i_lock)
  {
         struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
         unsigned seq = le32_to_cpu(m->seq);
         int dirty = le32_to_cpu(m->dirty);
         int cleaned = 0;
@@ -2713,7 +2703,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
                       struct ceph_msg *msg)
  {
         struct ceph_mds_client *mdsc = session->s_mdsc;
-       struct super_block *sb = mdsc->client->sb;
+       struct super_block *sb = mdsc->fsc->sb;
         struct inode *inode;
         struct ceph_cap *cap;
         struct ceph_mds_caps *h;
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c

index ab6cf35c40919843e40f29137500e45a8170e5ab..bdce8b1fbd06794d9de7be918c9ab8aab97bcbd2 100644 (file)
--- a/fs/ceph/ceph_frag.c
+++ b/fs/ceph/ceph_frag.c
@@ -1,7 +1,8 @@
  /*
   * Ceph 'frag' type
   */
-#include "types.h"
+#include <linux/module.h>
+#include <linux/ceph/types.h>
  
  int ceph_frag_compare(__u32 a, __u32 b)
  {
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c

index 6fd8b20a86112c367c788a20c2f134108acc40e8..7ae1b3d55b58a7b70bf55e79a0788f211b90ff3c 100644 (file)
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/device.h>
  #include <linux/slab.h>
@@ -7,143 +7,49 @@
  #include <linux/debugfs.h>
  #include <linux/seq_file.h>
  
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
+
  #include "super.h"
-#include "mds_client.h"
-#include "mon_client.h"
-#include "auth.h"
  
  #ifdef CONFIG_DEBUG_FS
  
-/*
- * Implement /sys/kernel/debug/ceph fun
- *
- * /sys/kernel/debug/ceph/client*  - an instance of the ceph client
- *      .../osdmap      - current osdmap
- *      .../mdsmap      - current mdsmap
- *      .../monmap      - current monmap
- *      .../osdc        - active osd requests
- *      .../mdsc        - active mds requests
- *      .../monc        - mon client state
- *      .../dentry_lru  - dump contents of dentry lru
- *      .../caps        - expose cap (reservation) stats
- *      .../bdi         - symlink to ../../bdi/something
- */
-
-static struct dentry *ceph_debugfs_dir;
-
-static int monmap_show(struct seq_file *s, void *p)
-{
-       int i;
-       struct ceph_client *client = s->private;
-
-       if (client->monc.monmap == NULL)
-               return 0;
-
-       seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
-       for (i = 0; i < client->monc.monmap->num_mon; i++) {
-               struct ceph_entity_inst *inst =
-                       &client->monc.monmap->mon_inst[i];
-
-               seq_printf(s, "\t%s%lld\t%s\n",
-                          ENTITY_NAME(inst->name),
-                          pr_addr(&inst->addr.in_addr));
-       }
-       return 0;
-}
+#include "mds_client.h"
  
  static int mdsmap_show(struct seq_file *s, void *p)
  {
         int i;
-       struct ceph_client *client = s->private;
+       struct ceph_fs_client *fsc = s->private;
  
-       if (client->mdsc.mdsmap == NULL)
+       if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
                 return 0;
-       seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
-       seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
+       seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch);
+       seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root);
         seq_printf(s, "session_timeout %d\n",
-                      client->mdsc.mdsmap->m_session_timeout);
+                      fsc->mdsc->mdsmap->m_session_timeout);
         seq_printf(s, "session_autoclose %d\n",
-                      client->mdsc.mdsmap->m_session_autoclose);
-       for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
+                      fsc->mdsc->mdsmap->m_session_autoclose);
+       for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) {
                 struct ceph_entity_addr *addr =
-                       &client->mdsc.mdsmap->m_info[i].addr;
-               int state = client->mdsc.mdsmap->m_info[i].state;
+                       &fsc->mdsc->mdsmap->m_info[i].addr;
+               int state = fsc->mdsc->mdsmap->m_info[i].state;
  
-               seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
+               seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
+                              ceph_pr_addr(&addr->in_addr),
                                ceph_mds_state_name(state));
         }
         return 0;
  }
  
-static int osdmap_show(struct seq_file *s, void *p)
-{
-       int i;
-       struct ceph_client *client = s->private;
-       struct rb_node *n;
-
-       if (client->osdc.osdmap == NULL)
-               return 0;
-       seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
-       seq_printf(s, "flags%s%s\n",
-                  (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
-                  " NEARFULL" : "",
-                  (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
-                  " FULL" : "");
-       for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
-               struct ceph_pg_pool_info *pool =
-                       rb_entry(n, struct ceph_pg_pool_info, node);
-               seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
-                          pool->id, pool->v.pg_num, pool->pg_num_mask,
-                          pool->v.lpg_num, pool->lpg_num_mask);
-       }
-       for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
-               struct ceph_entity_addr *addr =
-                       &client->osdc.osdmap->osd_addr[i];
-               int state = client->osdc.osdmap->osd_state[i];
-               char sb[64];
-
-               seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
-                          i, pr_addr(&addr->in_addr),
-                          ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
-                          ceph_osdmap_state_str(sb, sizeof(sb), state));
-       }
-       return 0;
-}
-
-static int monc_show(struct seq_file *s, void *p)
-{
-       struct ceph_client *client = s->private;
-       struct ceph_mon_generic_request *req;
-       struct ceph_mon_client *monc = &client->monc;
-       struct rb_node *rp;
-
-       mutex_lock(&monc->mutex);
-
-       if (monc->have_mdsmap)
-               seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
-       if (monc->have_osdmap)
-               seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
-       if (monc->want_next_osdmap)
-               seq_printf(s, "want next osdmap\n");
-
-       for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
-               __u16 op;
-               req = rb_entry(rp, struct ceph_mon_generic_request, node);
-               op = le16_to_cpu(req->request->hdr.type);
-               if (op == CEPH_MSG_STATFS)
-                       seq_printf(s, "%lld statfs\n", req->tid);
-               else
-                       seq_printf(s, "%lld unknown\n", req->tid);
-       }
-
-       mutex_unlock(&monc->mutex);
-       return 0;
-}
-
+/*
+ * mdsc debugfs
+ */
  static int mdsc_show(struct seq_file *s, void *p)
  {
-       struct ceph_client *client = s->private;
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = s->private;
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_mds_request *req;
         struct rb_node *rp;
         int pathlen;
@@ -214,61 +120,12 @@ static int mdsc_show(struct seq_file *s, void *p)
         return 0;
  }
  
-static int osdc_show(struct seq_file *s, void *pp)
-{
-       struct ceph_client *client = s->private;
-       struct ceph_osd_client *osdc = &client->osdc;
-       struct rb_node *p;
-
-       mutex_lock(&osdc->request_mutex);
-       for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
-               struct ceph_osd_request *req;
-               struct ceph_osd_request_head *head;
-               struct ceph_osd_op *op;
-               int num_ops;
-               int opcode, olen;
-               int i;
-
-               req = rb_entry(p, struct ceph_osd_request, r_node);
-
-               seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
-                          req->r_osd ? req->r_osd->o_osd : -1,
-                          le32_to_cpu(req->r_pgid.pool),
-                          le16_to_cpu(req->r_pgid.ps));
-
-               head = req->r_request->front.iov_base;
-               op = (void *)(head + 1);
-
-               num_ops = le16_to_cpu(head->num_ops);
-               olen = le32_to_cpu(head->object_len);
-               seq_printf(s, "%.*s", olen,
-                          (const char *)(head->ops + num_ops));
-
-               if (req->r_reassert_version.epoch)
-                       seq_printf(s, "\t%u'%llu",
-                          (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
-                          le64_to_cpu(req->r_reassert_version.version));
-               else
-                       seq_printf(s, "\t");
-
-               for (i = 0; i < num_ops; i++) {
-                       opcode = le16_to_cpu(op->op);
-                       seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
-                       op++;
-               }
-
-               seq_printf(s, "\n");
-       }
-       mutex_unlock(&osdc->request_mutex);
-       return 0;
-}
-
  static int caps_show(struct seq_file *s, void *p)
  {
-       struct ceph_client *client = s->private;
+       struct ceph_fs_client *fsc = s->private;
         int total, avail, used, reserved, min;
  
-       ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
+       ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
         seq_printf(s, "total\t\t%d\n"
                    "avail\t\t%d\n"
                    "used\t\t%d\n"
@@ -280,8 +137,8 @@ static int caps_show(struct seq_file *s, void *p)
  
  static int dentry_lru_show(struct seq_file *s, void *ptr)
  {
-       struct ceph_client *client = s->private;
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = s->private;
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_dentry_info *di;
  
         spin_lock(&mdsc->dentry_lru_lock);
@@ -295,199 +152,124 @@ static int dentry_lru_show(struct seq_file *s, void *ptr)
         return 0;
  }
  
-#define DEFINE_SHOW_FUNC(name)                                         \
-static int name##_open(struct inode *inode, struct file *file)         \
-{                                                                      \
-       struct seq_file *sf;                                            \
-       int ret;                                                        \
-                                                                       \
-       ret = single_open(file, name, NULL);                            \
-       sf = file->private_data;                                        \
-       sf->private = inode->i_private;                                 \
-       return ret;                                                     \
-}                                                                      \
-                                                                       \
-static const struct file_operations name##_fops = {                    \
-       .open           = name##_open,                                  \
-       .read           = seq_read,                                     \
-       .llseek         = seq_lseek,                                    \
-       .release        = single_release,                               \
-};
-
-DEFINE_SHOW_FUNC(monmap_show)
-DEFINE_SHOW_FUNC(mdsmap_show)
-DEFINE_SHOW_FUNC(osdmap_show)
-DEFINE_SHOW_FUNC(monc_show)
-DEFINE_SHOW_FUNC(mdsc_show)
-DEFINE_SHOW_FUNC(osdc_show)
-DEFINE_SHOW_FUNC(dentry_lru_show)
-DEFINE_SHOW_FUNC(caps_show)
+CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
+CEPH_DEFINE_SHOW_FUNC(mdsc_show)
+CEPH_DEFINE_SHOW_FUNC(caps_show)
+CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
+
  
+/*
+ * debugfs
+ */
  static int congestion_kb_set(void *data, u64 val)
  {
-       struct ceph_client *client = (struct ceph_client *)data;
-
-       if (client)
-               client->mount_args->congestion_kb = (int)val;
+       struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
  
+       fsc->mount_options->congestion_kb = (int)val;
         return 0;
  }
  
  static int congestion_kb_get(void *data, u64 *val)
  {
-       struct ceph_client *client = (struct ceph_client *)data;
-
-       if (client)
-               *val = (u64)client->mount_args->congestion_kb;
+       struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
  
+       *val = (u64)fsc->mount_options->congestion_kb;
         return 0;
  }
  
-
  DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
                         congestion_kb_set, "%llu\n");
  
-int __init ceph_debugfs_init(void)
-{
-       ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
-       if (!ceph_debugfs_dir)
-               return -ENOMEM;
-       return 0;
-}
  
-void ceph_debugfs_cleanup(void)
+void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
  {
-       debugfs_remove(ceph_debugfs_dir);
+       dout("ceph_fs_debugfs_cleanup\n");
+       debugfs_remove(fsc->debugfs_bdi);
+       debugfs_remove(fsc->debugfs_congestion_kb);
+       debugfs_remove(fsc->debugfs_mdsmap);
+       debugfs_remove(fsc->debugfs_caps);
+       debugfs_remove(fsc->debugfs_mdsc);
+       debugfs_remove(fsc->debugfs_dentry_lru);
  }
  
-int ceph_debugfs_client_init(struct ceph_client *client)
+int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
  {
-       int ret = 0;
-       char name[80];
-
-       snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
-                client->monc.auth->global_id);
+       char name[100];
+       int err = -ENOMEM;
  
-       client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
-       if (!client->debugfs_dir)
-               goto out;
-
-       client->monc.debugfs_file = debugfs_create_file("monc",
-                                                     0600,
-                                                     client->debugfs_dir,
-                                                     client,
-                                                     &monc_show_fops);
-       if (!client->monc.debugfs_file)
+       dout("ceph_fs_debugfs_init\n");
+       fsc->debugfs_congestion_kb =
+               debugfs_create_file("writeback_congestion_kb",
+                                   0600,
+                                   fsc->client->debugfs_dir,
+                                   fsc,
+                                   &congestion_kb_fops);
+       if (!fsc->debugfs_congestion_kb)
                 goto out;
  
-       client->mdsc.debugfs_file = debugfs_create_file("mdsc",
-                                                     0600,
-                                                     client->debugfs_dir,
-                                                     client,
-                                                     &mdsc_show_fops);
-       if (!client->mdsc.debugfs_file)
-               goto out;
+       dout("a\n");
  
-       client->osdc.debugfs_file = debugfs_create_file("osdc",
-                                                     0600,
-                                                     client->debugfs_dir,
-                                                     client,
-                                                     &osdc_show_fops);
-       if (!client->osdc.debugfs_file)
+       snprintf(name, sizeof(name), "../../bdi/%s",
+                dev_name(fsc->backing_dev_info.dev));
+       fsc->debugfs_bdi =
+               debugfs_create_symlink("bdi",
+                                      fsc->client->debugfs_dir,
+                                      name);
+       if (!fsc->debugfs_bdi)
                 goto out;
  
-       client->debugfs_monmap = debugfs_create_file("monmap",
+       dout("b\n");
+       fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
                                         0600,
-                                       client->debugfs_dir,
-                                       client,
-                                       &monmap_show_fops);
-       if (!client->debugfs_monmap)
-               goto out;
-
-       client->debugfs_mdsmap = debugfs_create_file("mdsmap",
-                                       0600,
-                                       client->debugfs_dir,
-                                       client,
+                                       fsc->client->debugfs_dir,
+                                       fsc,
                                         &mdsmap_show_fops);
-       if (!client->debugfs_mdsmap)
-               goto out;
-
-       client->debugfs_osdmap = debugfs_create_file("osdmap",
-                                       0600,
-                                       client->debugfs_dir,
-                                       client,
-                                       &osdmap_show_fops);
-       if (!client->debugfs_osdmap)
+       if (!fsc->debugfs_mdsmap)
                 goto out;
  
-       client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
-                                       0600,
-                                       client->debugfs_dir,
-                                       client,
-                                       &dentry_lru_show_fops);
-       if (!client->debugfs_dentry_lru)
+       dout("ca\n");
+       fsc->debugfs_mdsc = debugfs_create_file("mdsc",
+                                               0600,
+                                               fsc->client->debugfs_dir,
+                                               fsc,
+                                               &mdsc_show_fops);
+       if (!fsc->debugfs_mdsc)
                 goto out;
  
-       client->debugfs_caps = debugfs_create_file("caps",
+       dout("da\n");
+       fsc->debugfs_caps = debugfs_create_file("caps",
                                                    0400,
-                                                  client->debugfs_dir,
-                                                  client,
+                                                  fsc->client->debugfs_dir,
+                                                  fsc,
                                                    &caps_show_fops);
-       if (!client->debugfs_caps)
+       if (!fsc->debugfs_caps)
                 goto out;
  
-       client->debugfs_congestion_kb =
-               debugfs_create_file("writeback_congestion_kb",
-                                   0600,
-                                   client->debugfs_dir,
-                                   client,
-                                   &congestion_kb_fops);
-       if (!client->debugfs_congestion_kb)
+       dout("ea\n");
+       fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
+                                       0600,
+                                       fsc->client->debugfs_dir,
+                                       fsc,
+                                       &dentry_lru_show_fops);
+       if (!fsc->debugfs_dentry_lru)
                 goto out;
  
-       sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
-       client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
-                                                    name);
-
         return 0;
  
  out:
-       ceph_debugfs_client_cleanup(client);
-       return ret;
+       ceph_fs_debugfs_cleanup(fsc);
+       return err;
  }
  
-void ceph_debugfs_client_cleanup(struct ceph_client *client)
-{
-       debugfs_remove(client->debugfs_bdi);
-       debugfs_remove(client->debugfs_caps);
-       debugfs_remove(client->debugfs_dentry_lru);
-       debugfs_remove(client->debugfs_osdmap);
-       debugfs_remove(client->debugfs_mdsmap);
-       debugfs_remove(client->debugfs_monmap);
-       debugfs_remove(client->osdc.debugfs_file);
-       debugfs_remove(client->mdsc.debugfs_file);
-       debugfs_remove(client->monc.debugfs_file);
-       debugfs_remove(client->debugfs_congestion_kb);
-       debugfs_remove(client->debugfs_dir);
-}
  
  #else  /* CONFIG_DEBUG_FS */
  
-int __init ceph_debugfs_init(void)
-{
-       return 0;
-}
-
-void ceph_debugfs_cleanup(void)
-{
-}
-
-int ceph_debugfs_client_init(struct ceph_client *client)
+int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
  {
         return 0;
  }
  
-void ceph_debugfs_client_cleanup(struct ceph_client *client)
+void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
  {
  }
  
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c

index a1986eb52045b8184d6ab31e46b8a2497295e9ba..e0a2dc6fcafcb62266909c5ec71329e58083d1d7 100644 (file)
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/spinlock.h>
  #include <linux/fs_struct.h>
@@ -7,6 +7,7 @@
  #include <linux/sched.h>
  
  #include "super.h"
+#include "mds_client.h"
  
  /*
   * Directory operations: readdir, lookup, create, link, unlink,
@@ -94,10 +95,7 @@ static unsigned fpos_off(loff_t p)
   */
  static int __dcache_readdir(struct file *filp,
                             void *dirent, filldir_t filldir)
-               __releases(inode->i_lock)
-               __acquires(inode->i_lock)
  {
-       struct inode *inode = filp->f_dentry->d_inode;
         struct ceph_file_info *fi = filp->private_data;
         struct dentry *parent = filp->f_dentry;
         struct inode *dir = parent->d_inode;
@@ -153,7 +151,6 @@ more:
  
         atomic_inc(&dentry->d_count);
         spin_unlock(&dcache_lock);
-       spin_unlock(&inode->i_lock);
  
         dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
              dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
@@ -171,35 +168,30 @@ more:
                 } else {
                         dput(last);
                 }
-               last = NULL;
         }
-
-       spin_lock(&inode->i_lock);
-       spin_lock(&dcache_lock);
-
         last = dentry;
  
         if (err < 0)
-               goto out_unlock;
+               goto out;
  
-       p = p->prev;
         filp->f_pos++;
  
         /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
-       if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
-               goto more;
-       dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
-       err = -EAGAIN;
+       if (!ceph_i_test(dir, CEPH_I_COMPLETE)) {
+               dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
+               err = -EAGAIN;
+               goto out;
+       }
+
+       spin_lock(&dcache_lock);
+       p = p->prev;    /* advance to next dentry */
+       goto more;
  
  out_unlock:
         spin_unlock(&dcache_lock);
-
-       if (last) {
-               spin_unlock(&inode->i_lock);
+out:
+       if (last)
                 dput(last);
-               spin_lock(&inode->i_lock);
-       }
-
         return err;
  }
  
@@ -227,15 +219,15 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
         struct ceph_file_info *fi = filp->private_data;
         struct inode *inode = filp->f_dentry->d_inode;
         struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_client *client = ceph_inode_to_client(inode);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         unsigned frag = fpos_frag(filp->f_pos);
         int off = fpos_off(filp->f_pos);
         int err;
         u32 ftype;
         struct ceph_mds_reply_info_parsed *rinfo;
-       const int max_entries = client->mount_args->max_readdir;
-       const int max_bytes = client->mount_args->max_readdir_bytes;
+       const int max_entries = fsc->mount_options->max_readdir;
+       const int max_bytes = fsc->mount_options->max_readdir_bytes;
  
         dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
         if (fi->at_end)
@@ -267,17 +259,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
         /* can we use the dcache? */
         spin_lock(&inode->i_lock);
         if ((filp->f_pos == 2 || fi->dentry) &&
-           !ceph_test_opt(client, NOASYNCREADDIR) &&
+           !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
             ceph_snap(inode) != CEPH_SNAPDIR &&
             (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
             __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
+               spin_unlock(&inode->i_lock);
                 err = __dcache_readdir(filp, dirent, filldir);
-               if (err != -EAGAIN) {
-                       spin_unlock(&inode->i_lock);
+               if (err != -EAGAIN)
                         return err;
-               }
+       } else {
+               spin_unlock(&inode->i_lock);
         }
-       spin_unlock(&inode->i_lock);
         if (fi->dentry) {
                 err = note_last_dentry(fi, fi->dentry->d_name.name,
                                        fi->dentry->d_name.len);
@@ -487,14 +479,13 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
  struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
                                   struct dentry *dentry, int err)
  {
-       struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
+       struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
         struct inode *parent = dentry->d_parent->d_inode;
  
         /* .snap dir? */
         if (err == -ENOENT &&
-           ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
             strcmp(dentry->d_name.name,
-                  client->mount_args->snapdir_name) == 0) {
+                  fsc->mount_options->snapdir_name) == 0) {
                 struct inode *inode = ceph_get_snapdir(parent);
                 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
                      dentry, dentry->d_name.len, dentry->d_name.name, inode);
@@ -539,8 +530,8 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
  static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
                                   struct nameidata *nd)
  {
-       struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_mds_request *req;
         int op;
         int err;
@@ -572,7 +563,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
                 spin_lock(&dir->i_lock);
                 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
                 if (strncmp(dentry->d_name.name,
-                           client->mount_args->snapdir_name,
+                           fsc->mount_options->snapdir_name,
                             dentry->d_name.len) &&
                     !is_root_ceph_dentry(dir, dentry) &&
                     (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
@@ -629,8 +620,8 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
  static int ceph_mknod(struct inode *dir, struct dentry *dentry,
                       int mode, dev_t rdev)
  {
-       struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_mds_request *req;
         int err;
  
@@ -685,8 +676,8 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
  static int ceph_symlink(struct inode *dir, struct dentry *dentry,
                             const char *dest)
  {
-       struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_mds_request *req;
         int err;
  
@@ -716,8 +707,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
  
  static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
  {
-       struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_mds_request *req;
         int err = -EROFS;
         int op;
@@ -758,8 +749,8 @@ out:
  static int ceph_link(struct dentry *old_dentry, struct inode *dir,
                      struct dentry *dentry)
  {
-       struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_mds_request *req;
         int err;
  
@@ -813,8 +804,8 @@ static int drop_caps_for_unlink(struct inode *inode)
   */
  static int ceph_unlink(struct inode *dir, struct dentry *dentry)
  {
-       struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct inode *inode = dentry->d_inode;
         struct ceph_mds_request *req;
         int err = -EROFS;
@@ -854,8 +845,8 @@ out:
  static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
                        struct inode *new_dir, struct dentry *new_dentry)
  {
-       struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_mds_request *req;
         int err;
  
@@ -1076,7 +1067,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
         struct ceph_inode_info *ci = ceph_inode(inode);
         int left;
  
-       if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
+       if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
                 return -EISDIR;
  
         if (!cf->dir_info) {
@@ -1177,7 +1168,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
         dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
              dn->d_name.len, dn->d_name.name);
         if (di) {
-               mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+               mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
                 spin_lock(&mdsc->dentry_lru_lock);
                 list_add_tail(&di->lru, &mdsc->dentry_lru);
                 mdsc->num_dentry++;
@@ -1193,7 +1184,7 @@ void ceph_dentry_lru_touch(struct dentry *dn)
         dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
              dn->d_name.len, dn->d_name.name, di->offset);
         if (di) {
-               mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+               mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
                 spin_lock(&mdsc->dentry_lru_lock);
                 list_move_tail(&di->lru, &mdsc->dentry_lru);
                 spin_unlock(&mdsc->dentry_lru_lock);
@@ -1208,7 +1199,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
         dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
              dn->d_name.len, dn->d_name.name);
         if (di) {
-               mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+               mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
                 spin_lock(&mdsc->dentry_lru_lock);
                 list_del_init(&di->lru);
                 mdsc->num_dentry--;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c

index e38423e82f2ebe3f311a3f173964181c5ebe1860..2297d9426992b0132b991e0d4e129982c4ab4a01 100644 (file)
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -1,10 +1,11 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/exportfs.h>
  #include <linux/slab.h>
  #include <asm/unaligned.h>
  
  #include "super.h"
+#include "mds_client.h"
  
  /*
   * NFS export support
@@ -120,7 +121,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
  static struct dentry *__cfh_to_dentry(struct super_block *sb,
                                       struct ceph_nfs_confh *cfh)
  {
-       struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
         struct inode *inode;
         struct dentry *dentry;
         struct ceph_vino vino;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c

index 66e4da6dba229716ff3349fe87f5dfe62fe62db1..e77c28cf369059112dd06ee3b7c78c5146f0ffdc 100644 (file)
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1,5 +1,6 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
+#include <linux/module.h>
  #include <linux/sched.h>
  #include <linux/slab.h>
  #include <linux/file.h>
@@ -38,8 +39,8 @@
  static struct ceph_mds_request *
  prepare_open_request(struct super_block *sb, int flags, int create_mode)
  {
-       struct ceph_client *client = ceph_sb_to_client(sb);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_mds_request *req;
         int want_auth = USE_ANY_MDS;
         int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
@@ -117,8 +118,8 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
  int ceph_open(struct inode *inode, struct file *file)
  {
         struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_mds_request *req;
         struct ceph_file_info *cf = file->private_data;
         struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
@@ -216,8 +217,8 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
                                 struct nameidata *nd, int mode,
                                 int locked_dir)
  {
-       struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct file *file = nd->intent.open.file;
         struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
         struct ceph_mds_request *req;
@@ -269,163 +270,6 @@ int ceph_release(struct inode *inode, struct file *file)
         return 0;
  }
  
-/*
- * build a vector of user pages
- */
-static struct page **get_direct_page_vector(const char __user *data,
-                                           int num_pages,
-                                           loff_t off, size_t len)
-{
-       struct page **pages;
-       int rc;
-
-       pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
-       if (!pages)
-               return ERR_PTR(-ENOMEM);
-
-       down_read(&current->mm->mmap_sem);
-       rc = get_user_pages(current, current->mm, (unsigned long)data,
-                           num_pages, 0, 0, pages, NULL);
-       up_read(&current->mm->mmap_sem);
-       if (rc < 0)
-               goto fail;
-       return pages;
-
-fail:
-       kfree(pages);
-       return ERR_PTR(rc);
-}
-
-static void put_page_vector(struct page **pages, int num_pages)
-{
-       int i;
-
-       for (i = 0; i < num_pages; i++)
-               put_page(pages[i]);
-       kfree(pages);
-}
-
-void ceph_release_page_vector(struct page **pages, int num_pages)
-{
-       int i;
-
-       for (i = 0; i < num_pages; i++)
-               __free_pages(pages[i], 0);
-       kfree(pages);
-}
-
-/*
- * allocate a vector new pages
- */
-static struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
-{
-       struct page **pages;
-       int i;
-
-       pages = kmalloc(sizeof(*pages) * num_pages, flags);
-       if (!pages)
-               return ERR_PTR(-ENOMEM);
-       for (i = 0; i < num_pages; i++) {
-               pages[i] = __page_cache_alloc(flags);
-               if (pages[i] == NULL) {
-                       ceph_release_page_vector(pages, i);
-                       return ERR_PTR(-ENOMEM);
-               }
-       }
-       return pages;
-}
-
-/*
- * copy user data into a page vector
- */
-static int copy_user_to_page_vector(struct page **pages,
-                                   const char __user *data,
-                                   loff_t off, size_t len)
-{
-       int i = 0;
-       int po = off & ~PAGE_CACHE_MASK;
-       int left = len;
-       int l, bad;
-
-       while (left > 0) {
-               l = min_t(int, PAGE_CACHE_SIZE-po, left);
-               bad = copy_from_user(page_address(pages[i]) + po, data, l);
-               if (bad == l)
-                       return -EFAULT;
-               data += l - bad;
-               left -= l - bad;
-               po += l - bad;
-               if (po == PAGE_CACHE_SIZE) {
-                       po = 0;
-                       i++;
-               }
-       }
-       return len;
-}
-
-/*
- * copy user data from a page vector into a user pointer
- */
-static int copy_page_vector_to_user(struct page **pages, char __user *data,
-                                   loff_t off, size_t len)
-{
-       int i = 0;
-       int po = off & ~PAGE_CACHE_MASK;
-       int left = len;
-       int l, bad;
-
-       while (left > 0) {
-               l = min_t(int, left, PAGE_CACHE_SIZE-po);
-               bad = copy_to_user(data, page_address(pages[i]) + po, l);
-               if (bad == l)
-                       return -EFAULT;
-               data += l - bad;
-               left -= l - bad;
-               if (po) {
-                       po += l - bad;
-                       if (po == PAGE_CACHE_SIZE)
-                               po = 0;
-               }
-               i++;
-       }
-       return len;
-}
-
-/*
- * Zero an extent within a page vector.  Offset is relative to the
- * start of the first page.
- */
-static void zero_page_vector_range(int off, int len, struct page **pages)
-{
-       int i = off >> PAGE_CACHE_SHIFT;
-
-       off &= ~PAGE_CACHE_MASK;
-
-       dout("zero_page_vector_page %u~%u\n", off, len);
-
-       /* leading partial page? */
-       if (off) {
-               int end = min((int)PAGE_CACHE_SIZE, off + len);
-               dout("zeroing %d %p head from %d\n", i, pages[i],
-                    (int)off);
-               zero_user_segment(pages[i], off, end);
-               len -= (end - off);
-               i++;
-       }
-       while (len >= PAGE_CACHE_SIZE) {
-               dout("zeroing %d %p len=%d\n", i, pages[i], len);
-               zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
-               len -= PAGE_CACHE_SIZE;
-               i++;
-       }
-       /* trailing partial page? */
-       if (len) {
-               dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
-               zero_user_segment(pages[i], 0, len);
-       }
-}
-
-
  /*
   * Read a range of bytes striped over one or more objects.  Iterate over
   * objects we stripe over.  (That's not atomic, but good enough for now.)
@@ -438,7 +282,7 @@ static int striped_read(struct inode *inode,
                         struct page **pages, int num_pages,
                         int *checkeof)
  {
-       struct ceph_client *client = ceph_inode_to_client(inode);
+       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
         struct ceph_inode_info *ci = ceph_inode(inode);
         u64 pos, this_len;
         int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
@@ -459,7 +303,7 @@ static int striped_read(struct inode *inode,
  
  more:
         this_len = left;
-       ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
+       ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
                                   &ci->i_layout, pos, &this_len,
                                   ci->i_truncate_seq,
                                   ci->i_truncate_size,
@@ -477,8 +321,8 @@ more:
  
                 if (read < pos - off) {
                         dout(" zero gap %llu to %llu\n", off + read, pos);
-                       zero_page_vector_range(page_off + read,
-                                              pos - off - read, pages);
+                       ceph_zero_page_vector_range(page_off + read,
+                                                   pos - off - read, pages);
                 }
                 pos += ret;
                 read = pos - off;
@@ -495,8 +339,8 @@ more:
                 /* was original extent fully inside i_size? */
                 if (pos + left <= inode->i_size) {
                         dout("zero tail\n");
-                       zero_page_vector_range(page_off + read, len - read,
-                                              pages);
+                       ceph_zero_page_vector_range(page_off + read, len - read,
+                                                   pages);
                         read = len;
                         goto out;
                 }
@@ -531,7 +375,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
              (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
  
         if (file->f_flags & O_DIRECT) {
-               pages = get_direct_page_vector(data, num_pages, off, len);
+               pages = ceph_get_direct_page_vector(data, num_pages, off, len);
  
                 /*
                  * flush any page cache pages in this range.  this
@@ -552,13 +396,13 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
         ret = striped_read(inode, off, len, pages, num_pages, checkeof);
  
         if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
-               ret = copy_page_vector_to_user(pages, data, off, ret);
+               ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
         if (ret >= 0)
                 *poff = off + ret;
  
  done:
         if (file->f_flags & O_DIRECT)
-               put_page_vector(pages, num_pages);
+               ceph_put_page_vector(pages, num_pages);
         else
                 ceph_release_page_vector(pages, num_pages);
         dout("sync_read result %d\n", ret);
@@ -594,7 +438,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
  {
         struct inode *inode = file->f_dentry->d_inode;
         struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_client *client = ceph_inode_to_client(inode);
+       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
         struct ceph_osd_request *req;
         struct page **pages;
         int num_pages;
@@ -642,7 +486,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
          */
  more:
         len = left;
-       req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
+       req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
                                     ceph_vino(inode), pos, &len,
                                     CEPH_OSD_OP_WRITE, flags,
                                     ci->i_snap_realm->cached_context,
@@ -655,7 +499,7 @@ more:
         num_pages = calc_pages_for(pos, len);
  
         if (file->f_flags & O_DIRECT) {
-               pages = get_direct_page_vector(data, num_pages, pos, len);
+               pages = ceph_get_direct_page_vector(data, num_pages, pos, len);
                 if (IS_ERR(pages)) {
                         ret = PTR_ERR(pages);
                         goto out;
@@ -673,7 +517,7 @@ more:
                         ret = PTR_ERR(pages);
                         goto out;
                 }
-               ret = copy_user_to_page_vector(pages, data, pos, len);
+               ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
                 if (ret < 0) {
                         ceph_release_page_vector(pages, num_pages);
                         goto out;
@@ -689,7 +533,7 @@ more:
         req->r_num_pages = num_pages;
         req->r_inode = inode;
  
-       ret = ceph_osdc_start_request(&client->osdc, req, false);
+       ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
         if (!ret) {
                 if (req->r_safe_callback) {
                         /*
@@ -701,11 +545,11 @@ more:
                         spin_unlock(&ci->i_unsafe_lock);
                         ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
                 }
-               ret = ceph_osdc_wait_request(&client->osdc, req);
+               ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
         }
  
         if (file->f_flags & O_DIRECT)
-               put_page_vector(pages, num_pages);
+               ceph_put_page_vector(pages, num_pages);
         else if (file->f_flags & O_SYNC)
                 ceph_release_page_vector(pages, num_pages);
  
@@ -814,7 +658,8 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
         struct ceph_file_info *fi = file->private_data;
         struct inode *inode = file->f_dentry->d_inode;
         struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
+       struct ceph_osd_client *osdc =
+               &ceph_sb_to_client(inode->i_sb)->client->osdc;
         loff_t endoff = pos + iov->iov_len;
         int want, got = 0;
         int ret, err;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c

index 62377ec37edf05c14cf8567c0d22e19e02762253..1d6a45b5a04c696591879d141165627746d6a476 100644 (file)
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/module.h>
  #include <linux/fs.h>
@@ -13,7 +13,8 @@
  #include <linux/pagevec.h>
  
  #include "super.h"
-#include "decode.h"
+#include "mds_client.h"
+#include <linux/ceph/decode.h>
  
  /*
   * Ceph inode operations
@@ -384,7 +385,7 @@ void ceph_destroy_inode(struct inode *inode)
          */
         if (ci->i_snap_realm) {
                 struct ceph_mds_client *mdsc =
-                       &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+                       ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
                 struct ceph_snap_realm *realm = ci->i_snap_realm;
  
                 dout(" dropping residual ref to snap realm %p\n", realm);
@@ -685,7 +686,7 @@ static int fill_inode(struct inode *inode,
                 }
  
                 /* it may be better to set st_size in getattr instead? */
-               if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
+               if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
                         inode->i_size = ci->i_rbytes;
                 break;
         default:
@@ -901,7 +902,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
         struct inode *in = NULL;
         struct ceph_mds_reply_inode *ininfo;
         struct ceph_vino vino;
-       struct ceph_client *client = ceph_sb_to_client(sb);
+       struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
         int i = 0;
         int err = 0;
  
@@ -965,7 +966,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
          */
         if (rinfo->head->is_dentry && !req->r_aborted &&
             (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
-                                              client->mount_args->snapdir_name,
+                                              fsc->mount_options->snapdir_name,
                                                req->r_dentry->d_name.len))) {
                 /*
                  * lookup link rename   : null -> possibly existing inode
@@ -1533,7 +1534,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
         struct inode *parent_inode = dentry->d_parent->d_inode;
         const unsigned int ia_valid = attr->ia_valid;
         struct ceph_mds_request *req;
-       struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc;
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
         int issued;
         int release = 0, dirtied = 0;
         int mask = 0;
@@ -1728,8 +1729,8 @@ out:
   */
  int ceph_do_getattr(struct inode *inode, int mask)
  {
-       struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_mds_request *req;
         int err;
  
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c

index 76e307d2aba16868c9b1e8136496d929e0cd971f..8888c9ba68dbfec194e06f06142547ee2d35c8bc 100644 (file)
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -1,8 +1,10 @@
  #include <linux/in.h>
  
-#include "ioctl.h"
  #include "super.h"
-#include "ceph_debug.h"
+#include "mds_client.h"
+#include <linux/ceph/ceph_debug.h>
+
+#include "ioctl.h"
  
  
  /*
@@ -37,7 +39,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
  {
         struct inode *inode = file->f_dentry->d_inode;
         struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
-       struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
         struct ceph_mds_request *req;
         struct ceph_ioctl_layout l;
         int err, i;
@@ -89,6 +91,68 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
         return err;
  }
  
+/*
+ * Set a layout policy on a directory inode. All items in the tree
+ * rooted at this inode will inherit this layout on creation,
+ * (It doesn't apply retroactively )
+ * unless a subdirectory has its own layout policy.
+ */
+static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+       struct ceph_mds_request *req;
+       struct ceph_ioctl_layout l;
+       int err, i;
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+
+       /* copy and validate */
+       if (copy_from_user(&l, arg, sizeof(l)))
+               return -EFAULT;
+
+       if ((l.object_size & ~PAGE_MASK) ||
+           (l.stripe_unit & ~PAGE_MASK) ||
+           !l.stripe_unit ||
+           (l.object_size &&
+               (unsigned)l.object_size % (unsigned)l.stripe_unit))
+               return -EINVAL;
+
+       /* make sure it's a valid data pool */
+       if (l.data_pool > 0) {
+               mutex_lock(&mdsc->mutex);
+               err = -EINVAL;
+               for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
+                       if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
+                               err = 0;
+                               break;
+                       }
+               mutex_unlock(&mdsc->mutex);
+               if (err)
+                       return err;
+       }
+
+       req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT,
+                                      USE_AUTH_MDS);
+
+       if (IS_ERR(req))
+               return PTR_ERR(req);
+       req->r_inode = igrab(inode);
+
+       req->r_args.setlayout.layout.fl_stripe_unit =
+                       cpu_to_le32(l.stripe_unit);
+       req->r_args.setlayout.layout.fl_stripe_count =
+                       cpu_to_le32(l.stripe_count);
+       req->r_args.setlayout.layout.fl_object_size =
+                       cpu_to_le32(l.object_size);
+       req->r_args.setlayout.layout.fl_pg_pool =
+                       cpu_to_le32(l.data_pool);
+       req->r_args.setlayout.layout.fl_pg_preferred =
+                       cpu_to_le32(l.preferred_osd);
+
+       err = ceph_mdsc_do_request(mdsc, inode, req);
+       ceph_mdsc_put_request(req);
+       return err;
+}
+
  /*
   * Return object name, size/offset information, and location (OSD
   * number, network address) for a given file offset.
@@ -98,7 +162,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
         struct ceph_ioctl_dataloc dl;
         struct inode *inode = file->f_dentry->d_inode;
         struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
+       struct ceph_osd_client *osdc =
+               &ceph_sb_to_client(inode->i_sb)->client->osdc;
         u64 len = 1, olen;
         u64 tmp;
         struct ceph_object_layout ol;
@@ -174,11 +239,15 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
         case CEPH_IOC_SET_LAYOUT:
                 return ceph_ioctl_set_layout(file, (void __user *)arg);
  
+       case CEPH_IOC_SET_LAYOUT_POLICY:
+               return ceph_ioctl_set_layout_policy(file, (void __user *)arg);
+
         case CEPH_IOC_GET_DATALOC:
                 return ceph_ioctl_get_dataloc(file, (void __user *)arg);
  
         case CEPH_IOC_LAZYIO:
                 return ceph_ioctl_lazyio(file);
         }
+
         return -ENOTTY;
  }
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h

index 88451a3b6857d14bc7b21f418bcd82e56e9fdc64..a6ce54e94eb5ab435670093cd6ad789e72cc627b 100644 (file)
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -4,7 +4,7 @@
  #include <linux/ioctl.h>
  #include <linux/types.h>
  
-#define CEPH_IOCTL_MAGIC 0x97
+#define CEPH_IOCTL_MAGIC 0x98
  
  /* just use u64 to align sanely on all archs */
  struct ceph_ioctl_layout {
@@ -17,6 +17,8 @@ struct ceph_ioctl_layout {
                                    struct ceph_ioctl_layout)
  #define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2,          \
                                    struct ceph_ioctl_layout)
+#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5,   \
+                                  struct ceph_ioctl_layout)
  
  /*
   * Extract identity, address of the OSD and object storing a given
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c

index ff4e753aae929d37d414567d22fd6afef7316c7e..40abde93c345d054279fd51cbca998525c4931c7 100644 (file)
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -1,11 +1,11 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/file.h>
  #include <linux/namei.h>
  
  #include "super.h"
  #include "mds_client.h"
-#include "pagelist.h"
+#include <linux/ceph/pagelist.h>
  
  /**
   * Implement fcntl and flock locking functions.
@@ -16,7 +16,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
  {
         struct inode *inode = file->f_dentry->d_inode;
         struct ceph_mds_client *mdsc =
-               &ceph_sb_to_client(inode->i_sb)->mdsc;
+               ceph_sb_to_client(inode->i_sb)->mdsc;
         struct ceph_mds_request *req;
         int err;
  
@@ -181,8 +181,9 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
   * Encode the flock and fcntl locks for the given inode into the pagelist.
   * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
   * sequential flock locks.
- * Must be called with BLK already held, and the lock numbers should have
- * been gathered under the same lock holding window.
+ * Must be called with lock_flocks() already held.
+ * If we encounter more of a specific lock type than expected,
+ * we return the value 1.
   */
  int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
                       int num_fcntl_locks, int num_flock_locks)
@@ -190,6 +191,8 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
         struct file_lock *lock;
         struct ceph_filelock cephlock;
         int err = 0;
+       int seen_fcntl = 0;
+       int seen_flock = 0;
  
         dout("encoding %d flock and %d fcntl locks", num_flock_locks,
              num_fcntl_locks);
@@ -198,6 +201,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
                 goto fail;
         for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
                 if (lock->fl_flags & FL_POSIX) {
+                       ++seen_fcntl;
+                       if (seen_fcntl > num_fcntl_locks) {
+                               err = -ENOSPC;
+                               goto fail;
+                       }
                         err = lock_to_ceph_filelock(lock, &cephlock);
                         if (err)
                                 goto fail;
@@ -213,6 +221,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
                 goto fail;
         for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
                 if (lock->fl_flags & FL_FLOCK) {
+                       ++seen_flock;
+                       if (seen_flock > num_flock_locks) {
+                               err = -ENOSPC;
+                               goto fail;
+                       }
                         err = lock_to_ceph_filelock(lock, &cephlock);
                         if (err)
                                 goto fail;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c

index fad95f8f2608bc4e86c0876bedd81cb5be46a4ef..3142b15940c25656a43ec3a5d72af3e1ee1cece9 100644 (file)
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1,17 +1,21 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
+#include <linux/fs.h>
  #include <linux/wait.h>
  #include <linux/slab.h>
  #include <linux/sched.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
  #include <linux/smp_lock.h>
  
-#include "mds_client.h"
-#include "mon_client.h"
  #include "super.h"
-#include "messenger.h"
-#include "decode.h"
-#include "auth.h"
-#include "pagelist.h"
+#include "mds_client.h"
+
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/pagelist.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
  
  /*
   * A cluster of MDS (metadata server) daemons is responsible for
@@ -286,8 +290,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
              atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
         if (atomic_dec_and_test(&s->s_ref)) {
                 if (s->s_authorizer)
-                       s->s_mdsc->client->monc.auth->ops->destroy_authorizer(
-                               s->s_mdsc->client->monc.auth, s->s_authorizer);
+                    s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer(
+                            s->s_mdsc->fsc->client->monc.auth,
+                            s->s_authorizer);
                 kfree(s);
         }
  }
@@ -344,7 +349,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
         s->s_seq = 0;
         mutex_init(&s->s_mutex);
  
-       ceph_con_init(mdsc->client->msgr, &s->s_con);
+       ceph_con_init(mdsc->fsc->client->msgr, &s->s_con);
         s->s_con.private = s;
         s->s_con.ops = &mds_con_ops;
         s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
@@ -599,7 +604,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
         } else if (req->r_dentry) {
                 struct inode *dir = req->r_dentry->d_parent->d_inode;
  
-               if (dir->i_sb != mdsc->client->sb) {
+               if (dir->i_sb != mdsc->fsc->sb) {
                         /* not this fs! */
                         inode = req->r_dentry->d_inode;
                 } else if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -884,7 +889,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
         __ceph_remove_cap(cap);
         if (!__ceph_is_any_real_caps(ci)) {
                 struct ceph_mds_client *mdsc =
-                       &ceph_sb_to_client(inode->i_sb)->mdsc;
+                       ceph_sb_to_client(inode->i_sb)->mdsc;
  
                 spin_lock(&mdsc->cap_dirty_lock);
                 if (!list_empty(&ci->i_dirty_item)) {
@@ -1146,7 +1151,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
         struct ceph_msg *msg, *partial = NULL;
         struct ceph_mds_cap_release *head;
         int err = -ENOMEM;
-       int extra = mdsc->client->mount_args->cap_release_safety;
+       int extra = mdsc->fsc->mount_options->cap_release_safety;
         int num;
  
         dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
@@ -2085,7 +2090,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
  
         /* insert trace into our cache */
         mutex_lock(&req->r_fill_mutex);
-       err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
+       err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
         if (err == 0) {
                 if (result == 0 && rinfo->dir_nr)
                         ceph_readdir_prepopulate(req, req->r_session);
@@ -2361,19 +2366,35 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
  
         if (recon_state->flock) {
                 int num_fcntl_locks, num_flock_locks;
-
-               lock_kernel();
-               ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
-               rec.v2.flock_len = (2*sizeof(u32) +
-                                   (num_fcntl_locks+num_flock_locks) *
-                                   sizeof(struct ceph_filelock));
-
-               err = ceph_pagelist_append(pagelist, &rec, reclen);
-               if (!err)
-                       err = ceph_encode_locks(inode, pagelist,
-                                               num_fcntl_locks,
-                                               num_flock_locks);
-               unlock_kernel();
+               struct ceph_pagelist_cursor trunc_point;
+
+               ceph_pagelist_set_cursor(pagelist, &trunc_point);
+               do {
+                       lock_flocks();
+                       ceph_count_locks(inode, &num_fcntl_locks,
+                                        &num_flock_locks);
+                       rec.v2.flock_len = (2*sizeof(u32) +
+                                           (num_fcntl_locks+num_flock_locks) *
+                                           sizeof(struct ceph_filelock));
+                       unlock_flocks();
+
+                       /* pre-alloc pagelist */
+                       ceph_pagelist_truncate(pagelist, &trunc_point);
+                       err = ceph_pagelist_append(pagelist, &rec, reclen);
+                       if (!err)
+                               err = ceph_pagelist_reserve(pagelist,
+                                                           rec.v2.flock_len);
+
+                       /* encode locks */
+                       if (!err) {
+                               lock_flocks();
+                               err = ceph_encode_locks(inode,
+                                                       pagelist,
+                                                       num_fcntl_locks,
+                                                       num_flock_locks);
+                               unlock_flocks();
+                       }
+               } while (err == -ENOSPC);
         } else {
                 err = ceph_pagelist_append(pagelist, &rec, reclen);
         }
@@ -2613,7 +2634,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
                          struct ceph_mds_session *session,
                          struct ceph_msg *msg)
  {
-       struct super_block *sb = mdsc->client->sb;
+       struct super_block *sb = mdsc->fsc->sb;
         struct inode *inode;
         struct ceph_inode_info *ci;
         struct dentry *parent, *dentry;
@@ -2891,10 +2912,16 @@ static void delayed_work(struct work_struct *work)
         schedule_delayed(mdsc);
  }
  
+int ceph_mdsc_init(struct ceph_fs_client *fsc)
  
-int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
  {
-       mdsc->client = client;
+       struct ceph_mds_client *mdsc;
+
+       mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
+       if (!mdsc)
+               return -ENOMEM;
+       mdsc->fsc = fsc;
+       fsc->mdsc = mdsc;
         mutex_init(&mdsc->mutex);
         mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
         if (mdsc->mdsmap == NULL)
@@ -2927,7 +2954,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
         INIT_LIST_HEAD(&mdsc->dentry_lru);
  
         ceph_caps_init(mdsc);
-       ceph_adjust_min_caps(mdsc, client->min_caps);
+       ceph_adjust_min_caps(mdsc, fsc->min_caps);
  
         return 0;
  }
@@ -2939,7 +2966,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
  static void wait_requests(struct ceph_mds_client *mdsc)
  {
         struct ceph_mds_request *req;
-       struct ceph_client *client = mdsc->client;
+       struct ceph_fs_client *fsc = mdsc->fsc;
  
         mutex_lock(&mdsc->mutex);
         if (__get_oldest_req(mdsc)) {
@@ -2947,7 +2974,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
  
                 dout("wait_requests waiting for requests\n");
                 wait_for_completion_timeout(&mdsc->safe_umount_waiters,
-                                   client->mount_args->mount_timeout * HZ);
+                                   fsc->client->options->mount_timeout * HZ);
  
                 /* tear down remaining requests */
                 mutex_lock(&mdsc->mutex);
@@ -3030,7 +3057,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
  {
         u64 want_tid, want_flush;
  
-       if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+       if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
                 return;
  
         dout("sync\n");
@@ -3053,7 +3080,7 @@ bool done_closing_sessions(struct ceph_mds_client *mdsc)
  {
         int i, n = 0;
  
-       if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+       if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
                 return true;
  
         mutex_lock(&mdsc->mutex);
@@ -3071,8 +3098,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
  {
         struct ceph_mds_session *session;
         int i;
-       struct ceph_client *client = mdsc->client;
-       unsigned long timeout = client->mount_args->mount_timeout * HZ;
+       struct ceph_fs_client *fsc = mdsc->fsc;
+       unsigned long timeout = fsc->client->options->mount_timeout * HZ;
  
         dout("close_sessions\n");
  
@@ -3119,7 +3146,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
         dout("stopped\n");
  }
  
-void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
+static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
  {
         dout("stop\n");
         cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
@@ -3129,6 +3156,15 @@ void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
         ceph_caps_finalize(mdsc);
  }
  
+void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
+{
+       struct ceph_mds_client *mdsc = fsc->mdsc;
+
+       ceph_mdsc_stop(mdsc);
+       fsc->mdsc = NULL;
+       kfree(mdsc);
+}
+
  
  /*
   * handle mds map update.
@@ -3145,14 +3181,14 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
  
         ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
         ceph_decode_copy(&p, &fsid, sizeof(fsid));
-       if (ceph_check_fsid(mdsc->client, &fsid) < 0)
+       if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
                 return;
         epoch = ceph_decode_32(&p);
         maplen = ceph_decode_32(&p);
         dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
  
         /* do we need it? */
-       ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
+       ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
         mutex_lock(&mdsc->mutex);
         if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
                 dout("handle_map epoch %u <= our %u\n",
@@ -3176,7 +3212,7 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
         } else {
                 mdsc->mdsmap = newmap;  /* first mds map */
         }
-       mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
+       mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
  
         __wake_requests(mdsc, &mdsc->waiting_for_map);
  
@@ -3277,7 +3313,7 @@ static int get_authorizer(struct ceph_connection *con,
  {
         struct ceph_mds_session *s = con->private;
         struct ceph_mds_client *mdsc = s->s_mdsc;
-       struct ceph_auth_client *ac = mdsc->client->monc.auth;
+       struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
         int ret = 0;
  
         if (force_new && s->s_authorizer) {
@@ -3311,7 +3347,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
  {
         struct ceph_mds_session *s = con->private;
         struct ceph_mds_client *mdsc = s->s_mdsc;
-       struct ceph_auth_client *ac = mdsc->client->monc.auth;
+       struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
  
         return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
  }
@@ -3320,12 +3356,12 @@ static int invalidate_authorizer(struct ceph_connection *con)
  {
         struct ceph_mds_session *s = con->private;
         struct ceph_mds_client *mdsc = s->s_mdsc;
-       struct ceph_auth_client *ac = mdsc->client->monc.auth;
+       struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
  
         if (ac->ops->invalidate_authorizer)
                 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
  
-       return ceph_monc_validate_auth(&mdsc->client->monc);
+       return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
  }
  
  static const struct ceph_connection_operations mds_con_ops = {
@@ -3338,7 +3374,4 @@ static const struct ceph_connection_operations mds_con_ops = {
         .peer_reset = peer_reset,
  };
  
-
-
-
  /* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h

index c98267ce6d2ad97e1d9c86bc0660e2d82d39366c..d66d63c7235526ef63d16ff0ea1e9ba3899df586 100644 (file)
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -8,9 +8,9 @@
  #include <linux/rbtree.h>
  #include <linux/spinlock.h>
  
-#include "types.h"
-#include "messenger.h"
-#include "mdsmap.h"
+#include <linux/ceph/types.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/mdsmap.h>
  
  /*
   * Some lock dependencies:
@@ -26,7 +26,7 @@
   *
   */
  
-struct ceph_client;
+struct ceph_fs_client;
  struct ceph_cap;
  
  /*
@@ -230,7 +230,7 @@ struct ceph_mds_request {
   * mds client state
   */
  struct ceph_mds_client {
-       struct ceph_client      *client;
+       struct ceph_fs_client  *fsc;
         struct mutex            mutex;         /* all nested structures */
  
         struct ceph_mdsmap      *mdsmap;
@@ -289,11 +289,6 @@ struct ceph_mds_client {
         int             caps_avail_count;    /* unused, unreserved */
         int             caps_min_count;      /* keep at least this many
                                                 (unreserved) */
-
-#ifdef CONFIG_DEBUG_FS
-       struct dentry     *debugfs_file;
-#endif
-
         spinlock_t        dentry_lru_lock;
         struct list_head  dentry_lru;
         int               num_dentry;
@@ -316,10 +311,9 @@ extern void ceph_put_mds_session(struct ceph_mds_session *s);
  extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
                              struct ceph_msg *msg, int mds);
  
-extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
-                          struct ceph_client *client);
+extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
  extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
-extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
  
  extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
  
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c

index 040be6d1150be5ace2be71e955ac3b3525b8fd76..73b7d44e8a354264e3f08f66e8cb788851328029 100644 (file)
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/bug.h>
  #include <linux/err.h>
@@ -6,9 +6,9 @@
  #include <linux/slab.h>
  #include <linux/types.h>
  
-#include "mdsmap.h"
-#include "messenger.h"
-#include "decode.h"
+#include <linux/ceph/mdsmap.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
  
  #include "super.h"
  
@@ -117,7 +117,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
                 }
  
                 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
-                    i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
+                    i+1, n, global_id, mds, inc,
+                    ceph_pr_addr(&addr.in_addr),
                      ceph_mds_state_name(state));
                 if (mds >= 0 && mds < m->m_max_mds && state > 0) {
                         m->m_info[mds].global_id = global_id;
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c

deleted file mode 100644 (file)

index 46a368b..0000000
--- a/fs/ceph/pagelist.c
+++ /dev/null
@@ -1,63 +0,0 @@
-
-#include <linux/gfp.h>
-#include <linux/pagemap.h>
-#include <linux/highmem.h>
-
-#include "pagelist.h"
-
-static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
-{
-       struct page *page = list_entry(pl->head.prev, struct page,
-                                      lru);
-       kunmap(page);
-}
-
-int ceph_pagelist_release(struct ceph_pagelist *pl)
-{
-       if (pl->mapped_tail)
-               ceph_pagelist_unmap_tail(pl);
-
-       while (!list_empty(&pl->head)) {
-               struct page *page = list_first_entry(&pl->head, struct page,
-                                                    lru);
-               list_del(&page->lru);
-               __free_page(page);
-       }
-       return 0;
-}
-
-static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
-{
-       struct page *page = __page_cache_alloc(GFP_NOFS);
-       if (!page)
-               return -ENOMEM;
-       pl->room += PAGE_SIZE;
-       list_add_tail(&page->lru, &pl->head);
-       if (pl->mapped_tail)
-               ceph_pagelist_unmap_tail(pl);
-       pl->mapped_tail = kmap(page);
-       return 0;
-}
-
-int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
-{
-       while (pl->room < len) {
-               size_t bit = pl->room;
-               int ret;
-
-               memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
-                      buf, bit);
-               pl->length += bit;
-               pl->room -= bit;
-               buf += bit;
-               len -= bit;
-               ret = ceph_pagelist_addpage(pl);
-               if (ret)
-                       return ret;
-       }
-
-       memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
-       pl->length += len;
-       pl->room -= len;
-       return 0;
-}
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c

index 190b6c4a6f2b91aace658f8de7664f75cc0bdcc4..39c243acd062c810d33e60da27af51cbcfe058e8 100644 (file)
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1,10 +1,12 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/sort.h>
  #include <linux/slab.h>
  
  #include "super.h"
-#include "decode.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
  
  /*
   * Snapshots in ceph are driven in large part by cooperation from the
@@ -526,7 +528,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
                             struct ceph_cap_snap *capsnap)
  {
         struct inode *inode = &ci->vfs_inode;
-       struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
  
         BUG_ON(capsnap->writing);
         capsnap->size = inode->i_size;
@@ -747,7 +749,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
                       struct ceph_mds_session *session,
                       struct ceph_msg *msg)
  {
-       struct super_block *sb = mdsc->client->sb;
+       struct super_block *sb = mdsc->fsc->sb;
         int mds = session->s_mds;
         u64 split;
         int op;
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/strings.c

similarity index 59%

rename from fs/ceph/ceph_strings.c

rename to fs/ceph/strings.c

index c6179d3a26a216cb7b8f6ee0119884162887c778..cd5097d7c804e5897eeed60716c962d1bbd59b5c 100644 (file)
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/strings.c
@@ -1,71 +1,9 @@
  /*
- * Ceph string constants
+ * Ceph fs string constants
   */
-#include "types.h"
+#include <linux/module.h>
+#include <linux/ceph/types.h>
  
-const char *ceph_entity_type_name(int type)
-{
-       switch (type) {
-       case CEPH_ENTITY_TYPE_MDS: return "mds";
-       case CEPH_ENTITY_TYPE_OSD: return "osd";
-       case CEPH_ENTITY_TYPE_MON: return "mon";
-       case CEPH_ENTITY_TYPE_CLIENT: return "client";
-       case CEPH_ENTITY_TYPE_AUTH: return "auth";
-       default: return "unknown";
-       }
-}
-
-const char *ceph_osd_op_name(int op)
-{
-       switch (op) {
-       case CEPH_OSD_OP_READ: return "read";
-       case CEPH_OSD_OP_STAT: return "stat";
-
-       case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
-
-       case CEPH_OSD_OP_WRITE: return "write";
-       case CEPH_OSD_OP_DELETE: return "delete";
-       case CEPH_OSD_OP_TRUNCATE: return "truncate";
-       case CEPH_OSD_OP_ZERO: return "zero";
-       case CEPH_OSD_OP_WRITEFULL: return "writefull";
-       case CEPH_OSD_OP_ROLLBACK: return "rollback";
-
-       case CEPH_OSD_OP_APPEND: return "append";
-       case CEPH_OSD_OP_STARTSYNC: return "startsync";
-       case CEPH_OSD_OP_SETTRUNC: return "settrunc";
-       case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
-
-       case CEPH_OSD_OP_TMAPUP: return "tmapup";
-       case CEPH_OSD_OP_TMAPGET: return "tmapget";
-       case CEPH_OSD_OP_TMAPPUT: return "tmapput";
-
-       case CEPH_OSD_OP_GETXATTR: return "getxattr";
-       case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
-       case CEPH_OSD_OP_SETXATTR: return "setxattr";
-       case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
-       case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
-       case CEPH_OSD_OP_RMXATTR: return "rmxattr";
-       case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
-
-       case CEPH_OSD_OP_PULL: return "pull";
-       case CEPH_OSD_OP_PUSH: return "push";
-       case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
-       case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
-       case CEPH_OSD_OP_SCRUB: return "scrub";
-
-       case CEPH_OSD_OP_WRLOCK: return "wrlock";
-       case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
-       case CEPH_OSD_OP_RDLOCK: return "rdlock";
-       case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
-       case CEPH_OSD_OP_UPLOCK: return "uplock";
-       case CEPH_OSD_OP_DNLOCK: return "dnlock";
-
-       case CEPH_OSD_OP_CALL: return "call";
-
-       case CEPH_OSD_OP_PGLS: return "pgls";
-       }
-       return "???";
-}
  
  const char *ceph_mds_state_name(int s)
  {
@@ -177,17 +115,3 @@ const char *ceph_snap_op_name(int o)
         }
         return "???";
  }
-
-const char *ceph_pool_op_name(int op)
-{
-       switch (op) {
-       case POOL_OP_CREATE: return "create";
-       case POOL_OP_DELETE: return "delete";
-       case POOL_OP_AUID_CHANGE: return "auid change";
-       case POOL_OP_CREATE_SNAP: return "create snap";
-       case POOL_OP_DELETE_SNAP: return "delete snap";
-       case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
-       case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
-       }
-       return "???";
-}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c

index 9922628532b2c649dee6761710405adc552ca248..d6e0e042189184183b4ccf82da8787622e7f7d11 100644 (file)
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1,5 +1,5 @@
  
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/backing-dev.h>
  #include <linux/ctype.h>
@@ -15,10 +15,13 @@
  #include <linux/statfs.h>
  #include <linux/string.h>
  
-#include "decode.h"
  #include "super.h"
-#include "mon_client.h"
-#include "auth.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
  
  /*
   * Ceph superblock operations
@@ -26,36 +29,22 @@
   * Handle the basics of mounting, unmounting.
   */
  
-
-/*
- * find filename portion of a path (/foo/bar/baz -> baz)
- */
-const char *ceph_file_part(const char *s, int len)
-{
-       const char *e = s + len;
-
-       while (e != s && *(e-1) != '/')
-               e--;
-       return e;
-}
-
-
  /*
   * super ops
   */
  static void ceph_put_super(struct super_block *s)
  {
-       struct ceph_client *client = ceph_sb_to_client(s);
+       struct ceph_fs_client *fsc = ceph_sb_to_client(s);
  
         dout("put_super\n");
-       ceph_mdsc_close_sessions(&client->mdsc);
+       ceph_mdsc_close_sessions(fsc->mdsc);
  
         /*
          * ensure we release the bdi before put_anon_super releases
          * the device name.
          */
-       if (s->s_bdi == &client->backing_dev_info) {
-               bdi_unregister(&client->backing_dev_info);
+       if (s->s_bdi == &fsc->backing_dev_info) {
+               bdi_unregister(&fsc->backing_dev_info);
                 s->s_bdi = NULL;
         }
  
@@ -64,14 +53,14 @@ static void ceph_put_super(struct super_block *s)
  
  static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
  {
-       struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
-       struct ceph_monmap *monmap = client->monc.monmap;
+       struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode);
+       struct ceph_monmap *monmap = fsc->client->monc.monmap;
         struct ceph_statfs st;
         u64 fsid;
         int err;
  
         dout("statfs\n");
-       err = ceph_monc_do_statfs(&client->monc, &st);
+       err = ceph_monc_do_statfs(&fsc->client->monc, &st);
         if (err < 0)
                 return err;
  
@@ -104,238 +93,28 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
  
  static int ceph_sync_fs(struct super_block *sb, int wait)
  {
-       struct ceph_client *client = ceph_sb_to_client(sb);
+       struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
  
         if (!wait) {
                 dout("sync_fs (non-blocking)\n");
-               ceph_flush_dirty_caps(&client->mdsc);
+               ceph_flush_dirty_caps(fsc->mdsc);
                 dout("sync_fs (non-blocking) done\n");
                 return 0;
         }
  
         dout("sync_fs (blocking)\n");
-       ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
-       ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
+       ceph_osdc_sync(&fsc->client->osdc);
+       ceph_mdsc_sync(fsc->mdsc);
         dout("sync_fs (blocking) done\n");
         return 0;
  }
  
-static int default_congestion_kb(void)
-{
-       int congestion_kb;
-
-       /*
-        * Copied from NFS
-        *
-        * congestion size, scale with available memory.
-        *
-        *  64MB:    8192k
-        * 128MB:   11585k
-        * 256MB:   16384k
-        * 512MB:   23170k
-        *   1GB:   32768k
-        *   2GB:   46340k
-        *   4GB:   65536k
-        *   8GB:   92681k
-        *  16GB:  131072k
-        *
-        * This allows larger machines to have larger/more transfers.
-        * Limit the default to 256M
-        */
-       congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
-       if (congestion_kb > 256*1024)
-               congestion_kb = 256*1024;
-
-       return congestion_kb;
-}
-
-/**
- * ceph_show_options - Show mount options in /proc/mounts
- * @m: seq_file to write to
- * @mnt: mount descriptor
- */
-static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
-{
-       struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
-       struct ceph_mount_args *args = client->mount_args;
-
-       if (args->flags & CEPH_OPT_FSID)
-               seq_printf(m, ",fsid=%pU", &args->fsid);
-       if (args->flags & CEPH_OPT_NOSHARE)
-               seq_puts(m, ",noshare");
-       if (args->flags & CEPH_OPT_DIRSTAT)
-               seq_puts(m, ",dirstat");
-       if ((args->flags & CEPH_OPT_RBYTES) == 0)
-               seq_puts(m, ",norbytes");
-       if (args->flags & CEPH_OPT_NOCRC)
-               seq_puts(m, ",nocrc");
-       if (args->flags & CEPH_OPT_NOASYNCREADDIR)
-               seq_puts(m, ",noasyncreaddir");
-
-       if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
-               seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
-       if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
-               seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
-       if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
-               seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
-       if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
-               seq_printf(m, ",osdkeepalivetimeout=%d",
-                        args->osd_keepalive_timeout);
-       if (args->wsize)
-               seq_printf(m, ",wsize=%d", args->wsize);
-       if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
-               seq_printf(m, ",rsize=%d", args->rsize);
-       if (args->congestion_kb != default_congestion_kb())
-               seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
-       if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
-               seq_printf(m, ",caps_wanted_delay_min=%d",
-                        args->caps_wanted_delay_min);
-       if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
-               seq_printf(m, ",caps_wanted_delay_max=%d",
-                          args->caps_wanted_delay_max);
-       if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
-               seq_printf(m, ",cap_release_safety=%d",
-                          args->cap_release_safety);
-       if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
-               seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
-       if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
-               seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
-       if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
-               seq_printf(m, ",snapdirname=%s", args->snapdir_name);
-       if (args->name)
-               seq_printf(m, ",name=%s", args->name);
-       if (args->secret)
-               seq_puts(m, ",secret=<hidden>");
-       return 0;
-}
-
-/*
- * caches
- */
-struct kmem_cache *ceph_inode_cachep;
-struct kmem_cache *ceph_cap_cachep;
-struct kmem_cache *ceph_dentry_cachep;
-struct kmem_cache *ceph_file_cachep;
-
-static void ceph_inode_init_once(void *foo)
-{
-       struct ceph_inode_info *ci = foo;
-       inode_init_once(&ci->vfs_inode);
-}
-
-static int __init init_caches(void)
-{
-       ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
-                                     sizeof(struct ceph_inode_info),
-                                     __alignof__(struct ceph_inode_info),
-                                     (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
-                                     ceph_inode_init_once);
-       if (ceph_inode_cachep == NULL)
-               return -ENOMEM;
-
-       ceph_cap_cachep = KMEM_CACHE(ceph_cap,
-                                    SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-       if (ceph_cap_cachep == NULL)
-               goto bad_cap;
-
-       ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
-                                       SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-       if (ceph_dentry_cachep == NULL)
-               goto bad_dentry;
-
-       ceph_file_cachep = KMEM_CACHE(ceph_file_info,
-                                     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-       if (ceph_file_cachep == NULL)
-               goto bad_file;
-
-       return 0;
-
-bad_file:
-       kmem_cache_destroy(ceph_dentry_cachep);
-bad_dentry:
-       kmem_cache_destroy(ceph_cap_cachep);
-bad_cap:
-       kmem_cache_destroy(ceph_inode_cachep);
-       return -ENOMEM;
-}
-
-static void destroy_caches(void)
-{
-       kmem_cache_destroy(ceph_inode_cachep);
-       kmem_cache_destroy(ceph_cap_cachep);
-       kmem_cache_destroy(ceph_dentry_cachep);
-       kmem_cache_destroy(ceph_file_cachep);
-}
-
-
-/*
- * ceph_umount_begin - initiate forced umount.  Tear down down the
- * mount, skipping steps that may hang while waiting for server(s).
- */
-static void ceph_umount_begin(struct super_block *sb)
-{
-       struct ceph_client *client = ceph_sb_to_client(sb);
-
-       dout("ceph_umount_begin - starting forced umount\n");
-       if (!client)
-               return;
-       client->mount_state = CEPH_MOUNT_SHUTDOWN;
-       return;
-}
-
-static const struct super_operations ceph_super_ops = {
-       .alloc_inode    = ceph_alloc_inode,
-       .destroy_inode  = ceph_destroy_inode,
-       .write_inode    = ceph_write_inode,
-       .sync_fs        = ceph_sync_fs,
-       .put_super      = ceph_put_super,
-       .show_options   = ceph_show_options,
-       .statfs         = ceph_statfs,
-       .umount_begin   = ceph_umount_begin,
-};
-
-
-const char *ceph_msg_type_name(int type)
-{
-       switch (type) {
-       case CEPH_MSG_SHUTDOWN: return "shutdown";
-       case CEPH_MSG_PING: return "ping";
-       case CEPH_MSG_AUTH: return "auth";
-       case CEPH_MSG_AUTH_REPLY: return "auth_reply";
-       case CEPH_MSG_MON_MAP: return "mon_map";
-       case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
-       case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
-       case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
-       case CEPH_MSG_STATFS: return "statfs";
-       case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
-       case CEPH_MSG_MDS_MAP: return "mds_map";
-       case CEPH_MSG_CLIENT_SESSION: return "client_session";
-       case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
-       case CEPH_MSG_CLIENT_REQUEST: return "client_request";
-       case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
-       case CEPH_MSG_CLIENT_REPLY: return "client_reply";
-       case CEPH_MSG_CLIENT_CAPS: return "client_caps";
-       case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
-       case CEPH_MSG_CLIENT_SNAP: return "client_snap";
-       case CEPH_MSG_CLIENT_LEASE: return "client_lease";
-       case CEPH_MSG_OSD_MAP: return "osd_map";
-       case CEPH_MSG_OSD_OP: return "osd_op";
-       case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
-       default: return "unknown";
-       }
-}
-
-
  /*
   * mount options
   */
  enum {
         Opt_wsize,
         Opt_rsize,
-       Opt_osdtimeout,
-       Opt_osdkeepalivetimeout,
-       Opt_mount_timeout,
-       Opt_osd_idle_ttl,
         Opt_caps_wanted_delay_min,
         Opt_caps_wanted_delay_max,
         Opt_cap_release_safety,
@@ -344,29 +123,19 @@ enum {
         Opt_congestion_kb,
         Opt_last_int,
         /* int args above */
-       Opt_fsid,
         Opt_snapdirname,
-       Opt_name,
-       Opt_secret,
         Opt_last_string,
         /* string args above */
-       Opt_ip,
-       Opt_noshare,
         Opt_dirstat,
         Opt_nodirstat,
         Opt_rbytes,
         Opt_norbytes,
-       Opt_nocrc,
         Opt_noasyncreaddir,
  };
  
-static match_table_t arg_tokens = {
+static match_table_t fsopt_tokens = {
         {Opt_wsize, "wsize=%d"},
         {Opt_rsize, "rsize=%d"},
-       {Opt_osdtimeout, "osdtimeout=%d"},
-       {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
-       {Opt_mount_timeout, "mount_timeout=%d"},
-       {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
         {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
         {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
         {Opt_cap_release_safety, "cap_release_safety=%d"},
@@ -374,403 +143,459 @@ static match_table_t arg_tokens = {
         {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
         {Opt_congestion_kb, "write_congestion_kb=%d"},
         /* int args above */
-       {Opt_fsid, "fsid=%s"},
         {Opt_snapdirname, "snapdirname=%s"},
-       {Opt_name, "name=%s"},
-       {Opt_secret, "secret=%s"},
         /* string args above */
-       {Opt_ip, "ip=%s"},
-       {Opt_noshare, "noshare"},
         {Opt_dirstat, "dirstat"},
         {Opt_nodirstat, "nodirstat"},
         {Opt_rbytes, "rbytes"},
         {Opt_norbytes, "norbytes"},
-       {Opt_nocrc, "nocrc"},
         {Opt_noasyncreaddir, "noasyncreaddir"},
         {-1, NULL}
  };
  
-static int parse_fsid(const char *str, struct ceph_fsid *fsid)
+static int parse_fsopt_token(char *c, void *private)
  {
-       int i = 0;
-       char tmp[3];
-       int err = -EINVAL;
-       int d;
-
-       dout("parse_fsid '%s'\n", str);
-       tmp[2] = 0;
-       while (*str && i < 16) {
-               if (ispunct(*str)) {
-                       str++;
-                       continue;
+       struct ceph_mount_options *fsopt = private;
+       substring_t argstr[MAX_OPT_ARGS];
+       int token, intval, ret;
+
+       token = match_token((char *)c, fsopt_tokens, argstr);
+       if (token < 0)
+               return -EINVAL;
+
+       if (token < Opt_last_int) {
+               ret = match_int(&argstr[0], &intval);
+               if (ret < 0) {
+                       pr_err("bad mount option arg (not int) "
+                              "at '%s'\n", c);
+                       return ret;
                 }
-               if (!isxdigit(str[0]) || !isxdigit(str[1]))
-                       break;
-               tmp[0] = str[0];
-               tmp[1] = str[1];
-               if (sscanf(tmp, "%x", &d) < 1)
-                       break;
-               fsid->fsid[i] = d & 0xff;
-               i++;
-               str += 2;
+               dout("got int token %d val %d\n", token, intval);
+       } else if (token > Opt_last_int && token < Opt_last_string) {
+               dout("got string token %d val %s\n", token,
+                    argstr[0].from);
+       } else {
+               dout("got token %d\n", token);
         }
  
-       if (i == 16)
-               err = 0;
-       dout("parse_fsid ret %d got fsid %pU", err, fsid);
-       return err;
+       switch (token) {
+       case Opt_snapdirname:
+               kfree(fsopt->snapdir_name);
+               fsopt->snapdir_name = kstrndup(argstr[0].from,
+                                              argstr[0].to-argstr[0].from,
+                                              GFP_KERNEL);
+               if (!fsopt->snapdir_name)
+                       return -ENOMEM;
+               break;
+
+               /* misc */
+       case Opt_wsize:
+               fsopt->wsize = intval;
+               break;
+       case Opt_rsize:
+               fsopt->rsize = intval;
+               break;
+       case Opt_caps_wanted_delay_min:
+               fsopt->caps_wanted_delay_min = intval;
+               break;
+       case Opt_caps_wanted_delay_max:
+               fsopt->caps_wanted_delay_max = intval;
+               break;
+       case Opt_readdir_max_entries:
+               fsopt->max_readdir = intval;
+               break;
+       case Opt_readdir_max_bytes:
+               fsopt->max_readdir_bytes = intval;
+               break;
+       case Opt_congestion_kb:
+               fsopt->congestion_kb = intval;
+               break;
+       case Opt_dirstat:
+               fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
+               break;
+       case Opt_nodirstat:
+               fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
+               break;
+       case Opt_rbytes:
+               fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
+               break;
+       case Opt_norbytes:
+               fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
+               break;
+       case Opt_noasyncreaddir:
+               fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
+               break;
+       default:
+               BUG_ON(token);
+       }
+       return 0;
  }
  
-static struct ceph_mount_args *parse_mount_args(int flags, char *options,
-                                               const char *dev_name,
-                                               const char **path)
+static void destroy_mount_options(struct ceph_mount_options *args)
  {
-       struct ceph_mount_args *args;
-       const char *c;
-       int err = -ENOMEM;
-       substring_t argstr[MAX_OPT_ARGS];
+       dout("destroy_mount_options %p\n", args);
+       kfree(args->snapdir_name);
+       kfree(args);
+}
  
-       args = kzalloc(sizeof(*args), GFP_KERNEL);
-       if (!args)
-               return ERR_PTR(-ENOMEM);
-       args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
-                                GFP_KERNEL);
-       if (!args->mon_addr)
-               goto out;
+static int strcmp_null(const char *s1, const char *s2)
+{
+       if (!s1 && !s2)
+               return 0;
+       if (s1 && !s2)
+               return -1;
+       if (!s1 && s2)
+               return 1;
+       return strcmp(s1, s2);
+}
  
-       dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
-
-       /* start with defaults */
-       args->sb_flags = flags;
-       args->flags = CEPH_OPT_DEFAULT;
-       args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
-       args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
-       args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
-       args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;   /* seconds */
-       args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
-       args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
-       args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
-       args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
-       args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
-       args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
-       args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
-       args->congestion_kb = default_congestion_kb();
-
-       /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
-       err = -EINVAL;
-       if (!dev_name)
-               goto out;
-       *path = strstr(dev_name, ":/");
-       if (*path == NULL) {
-               pr_err("device name is missing path (no :/ in %s)\n",
-                      dev_name);
-               goto out;
-       }
+static int compare_mount_options(struct ceph_mount_options *new_fsopt,
+                                struct ceph_options *new_opt,
+                                struct ceph_fs_client *fsc)
+{
+       struct ceph_mount_options *fsopt1 = new_fsopt;
+       struct ceph_mount_options *fsopt2 = fsc->mount_options;
+       int ofs = offsetof(struct ceph_mount_options, snapdir_name);
+       int ret;
  
-       /* get mon ip(s) */
-       err = ceph_parse_ips(dev_name, *path, args->mon_addr,
-                            CEPH_MAX_MON, &args->num_mon);
-       if (err < 0)
-               goto out;
+       ret = memcmp(fsopt1, fsopt2, ofs);
+       if (ret)
+               return ret;
+
+       ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
+       if (ret)
+               return ret;
+
+       return ceph_compare_options(new_opt, fsc->client);
+}
+
+static int parse_mount_options(struct ceph_mount_options **pfsopt,
+                              struct ceph_options **popt,
+                              int flags, char *options,
+                              const char *dev_name,
+                              const char **path)
+{
+       struct ceph_mount_options *fsopt;
+       const char *dev_name_end;
+       int err = -ENOMEM;
+
+       fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
+       if (!fsopt)
+               return -ENOMEM;
+
+       dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
+
+        fsopt->sb_flags = flags;
+        fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
+
+        fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
+        fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
+        fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
+        fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
+        fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
+        fsopt->congestion_kb = default_congestion_kb();
+       
+        /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
+        err = -EINVAL;
+        if (!dev_name)
+                goto out;
+        *path = strstr(dev_name, ":/");
+        if (*path == NULL) {
+                pr_err("device name is missing path (no :/ in %s)\n",
+                       dev_name);
+                goto out;
+        }
+       dev_name_end = *path;
+       dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
  
         /* path on server */
         *path += 2;
         dout("server path '%s'\n", *path);
  
-       /* parse mount options */
-       while ((c = strsep(&options, ",")) != NULL) {
-               int token, intval, ret;
-               if (!*c)
-                       continue;
-               err = -EINVAL;
-               token = match_token((char *)c, arg_tokens, argstr);
-               if (token < 0) {
-                       pr_err("bad mount option at '%s'\n", c);
-                       goto out;
-               }
-               if (token < Opt_last_int) {
-                       ret = match_int(&argstr[0], &intval);
-                       if (ret < 0) {
-                               pr_err("bad mount option arg (not int) "
-                                      "at '%s'\n", c);
-                               continue;
-                       }
-                       dout("got int token %d val %d\n", token, intval);
-               } else if (token > Opt_last_int && token < Opt_last_string) {
-                       dout("got string token %d val %s\n", token,
-                            argstr[0].from);
-               } else {
-                       dout("got token %d\n", token);
-               }
-               switch (token) {
-               case Opt_ip:
-                       err = ceph_parse_ips(argstr[0].from,
-                                            argstr[0].to,
-                                            &args->my_addr,
-                                            1, NULL);
-                       if (err < 0)
-                               goto out;
-                       args->flags |= CEPH_OPT_MYIP;
-                       break;
-
-               case Opt_fsid:
-                       err = parse_fsid(argstr[0].from, &args->fsid);
-                       if (err == 0)
-                               args->flags |= CEPH_OPT_FSID;
-                       break;
-               case Opt_snapdirname:
-                       kfree(args->snapdir_name);
-                       args->snapdir_name = kstrndup(argstr[0].from,
-                                             argstr[0].to-argstr[0].from,
-                                             GFP_KERNEL);
-                       break;
-               case Opt_name:
-                       args->name = kstrndup(argstr[0].from,
-                                             argstr[0].to-argstr[0].from,
-                                             GFP_KERNEL);
-                       break;
-               case Opt_secret:
-                       args->secret = kstrndup(argstr[0].from,
-                                               argstr[0].to-argstr[0].from,
-                                               GFP_KERNEL);
-                       break;
-
-                       /* misc */
-               case Opt_wsize:
-                       args->wsize = intval;
-                       break;
-               case Opt_rsize:
-                       args->rsize = intval;
-                       break;
-               case Opt_osdtimeout:
-                       args->osd_timeout = intval;
-                       break;
-               case Opt_osdkeepalivetimeout:
-                       args->osd_keepalive_timeout = intval;
-                       break;
-               case Opt_osd_idle_ttl:
-                       args->osd_idle_ttl = intval;
-                       break;
-               case Opt_mount_timeout:
-                       args->mount_timeout = intval;
-                       break;
-               case Opt_caps_wanted_delay_min:
-                       args->caps_wanted_delay_min = intval;
-                       break;
-               case Opt_caps_wanted_delay_max:
-                       args->caps_wanted_delay_max = intval;
-                       break;
-               case Opt_readdir_max_entries:
-                       args->max_readdir = intval;
-                       break;
-               case Opt_readdir_max_bytes:
-                       args->max_readdir_bytes = intval;
-                       break;
-               case Opt_congestion_kb:
-                       args->congestion_kb = intval;
-                       break;
-
-               case Opt_noshare:
-                       args->flags |= CEPH_OPT_NOSHARE;
-                       break;
-
-               case Opt_dirstat:
-                       args->flags |= CEPH_OPT_DIRSTAT;
-                       break;
-               case Opt_nodirstat:
-                       args->flags &= ~CEPH_OPT_DIRSTAT;
-                       break;
-               case Opt_rbytes:
-                       args->flags |= CEPH_OPT_RBYTES;
-                       break;
-               case Opt_norbytes:
-                       args->flags &= ~CEPH_OPT_RBYTES;
-                       break;
-               case Opt_nocrc:
-                       args->flags |= CEPH_OPT_NOCRC;
-                       break;
-               case Opt_noasyncreaddir:
-                       args->flags |= CEPH_OPT_NOASYNCREADDIR;
-                       break;
-
-               default:
-                       BUG_ON(token);
-               }
-       }
-       return args;
+       err = ceph_parse_options(popt, options, dev_name, dev_name_end,
+                                parse_fsopt_token, (void *)fsopt);
+       if (err)
+               goto out;
+
+       /* success */
+       *pfsopt = fsopt;
+       return 0;
  
  out:
-       kfree(args->mon_addr);
-       kfree(args);
-       return ERR_PTR(err);
+       destroy_mount_options(fsopt);
+       return err;
  }
  
-static void destroy_mount_args(struct ceph_mount_args *args)
+/**
+ * ceph_show_options - Show mount options in /proc/mounts
+ * @m: seq_file to write to
+ * @mnt: mount descriptor
+ */
+static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
  {
-       dout("destroy_mount_args %p\n", args);
-       kfree(args->snapdir_name);
-       args->snapdir_name = NULL;
-       kfree(args->name);
-       args->name = NULL;
-       kfree(args->secret);
-       args->secret = NULL;
-       kfree(args);
+       struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb);
+       struct ceph_mount_options *fsopt = fsc->mount_options;
+       struct ceph_options *opt = fsc->client->options;
+
+       if (opt->flags & CEPH_OPT_FSID)
+               seq_printf(m, ",fsid=%pU", &opt->fsid);
+       if (opt->flags & CEPH_OPT_NOSHARE)
+               seq_puts(m, ",noshare");
+       if (opt->flags & CEPH_OPT_NOCRC)
+               seq_puts(m, ",nocrc");
+
+       if (opt->name)
+               seq_printf(m, ",name=%s", opt->name);
+       if (opt->secret)
+               seq_puts(m, ",secret=<hidden>");
+
+       if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
+               seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
+       if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
+               seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
+       if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
+               seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
+       if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
+               seq_printf(m, ",osdkeepalivetimeout=%d",
+                          opt->osd_keepalive_timeout);
+
+       if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
+               seq_puts(m, ",dirstat");
+       if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0)
+               seq_puts(m, ",norbytes");
+       if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
+               seq_puts(m, ",noasyncreaddir");
+
+       if (fsopt->wsize)
+               seq_printf(m, ",wsize=%d", fsopt->wsize);
+       if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
+               seq_printf(m, ",rsize=%d", fsopt->rsize);
+       if (fsopt->congestion_kb != default_congestion_kb())
+               seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
+       if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
+               seq_printf(m, ",caps_wanted_delay_min=%d",
+                        fsopt->caps_wanted_delay_min);
+       if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
+               seq_printf(m, ",caps_wanted_delay_max=%d",
+                          fsopt->caps_wanted_delay_max);
+       if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
+               seq_printf(m, ",cap_release_safety=%d",
+                          fsopt->cap_release_safety);
+       if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
+               seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
+       if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
+               seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
+       if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
+               seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
+       return 0;
  }
  
  /*
- * create a fresh client instance
+ * handle any mon messages the standard library doesn't understand.
+ * return error if we don't either.
   */
-static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
+static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
  {
-       struct ceph_client *client;
+       struct ceph_fs_client *fsc = client->private;
+       int type = le16_to_cpu(msg->hdr.type);
+
+       switch (type) {
+       case CEPH_MSG_MDS_MAP:
+               ceph_mdsc_handle_map(fsc->mdsc, msg);
+               return 0;
+
+       default:
+               return -1;
+       }
+}
+
+/*
+ * create a new fs client
+ */
+struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
+                                       struct ceph_options *opt)
+{
+       struct ceph_fs_client *fsc;
         int err = -ENOMEM;
  
-       client = kzalloc(sizeof(*client), GFP_KERNEL);
-       if (client == NULL)
+       fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
+       if (!fsc)
                 return ERR_PTR(-ENOMEM);
  
-       mutex_init(&client->mount_mutex);
-
-       init_waitqueue_head(&client->auth_wq);
+       fsc->client = ceph_create_client(opt, fsc);
+       if (IS_ERR(fsc->client)) {
+               err = PTR_ERR(fsc->client);
+               goto fail;
+       }
+       fsc->client->extra_mon_dispatch = extra_mon_dispatch;
+       fsc->client->supported_features |= CEPH_FEATURE_FLOCK;
+       fsc->client->monc.want_mdsmap = 1;
  
-       client->sb = NULL;
-       client->mount_state = CEPH_MOUNT_MOUNTING;
-       client->mount_args = args;
+       fsc->mount_options = fsopt;
  
-       client->msgr = NULL;
+       fsc->sb = NULL;
+       fsc->mount_state = CEPH_MOUNT_MOUNTING;
  
-       client->auth_err = 0;
-       atomic_long_set(&client->writeback_count, 0);
+       atomic_long_set(&fsc->writeback_count, 0);
  
-       err = bdi_init(&client->backing_dev_info);
+       err = bdi_init(&fsc->backing_dev_info);
         if (err < 0)
-               goto fail;
+               goto fail_client;
  
         err = -ENOMEM;
-       client->wb_wq = create_workqueue("ceph-writeback");
-       if (client->wb_wq == NULL)
+       fsc->wb_wq = create_workqueue("ceph-writeback");
+       if (fsc->wb_wq == NULL)
                 goto fail_bdi;
-       client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
-       if (client->pg_inv_wq == NULL)
+       fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
+       if (fsc->pg_inv_wq == NULL)
                 goto fail_wb_wq;
-       client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
-       if (client->trunc_wq == NULL)
+       fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc");
+       if (fsc->trunc_wq == NULL)
                 goto fail_pg_inv_wq;
  
         /* set up mempools */
         err = -ENOMEM;
-       client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
-                             client->mount_args->wsize >> PAGE_CACHE_SHIFT);
-       if (!client->wb_pagevec_pool)
+       fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
+                             fsc->mount_options->wsize >> PAGE_CACHE_SHIFT);
+       if (!fsc->wb_pagevec_pool)
                 goto fail_trunc_wq;
  
         /* caps */
-       client->min_caps = args->max_readdir;
+       fsc->min_caps = fsopt->max_readdir;
+
+       return fsc;
  
-       /* subsystems */
-       err = ceph_monc_init(&client->monc, client);
-       if (err < 0)
-               goto fail_mempool;
-       err = ceph_osdc_init(&client->osdc, client);
-       if (err < 0)
-               goto fail_monc;
-       err = ceph_mdsc_init(&client->mdsc, client);
-       if (err < 0)
-               goto fail_osdc;
-       return client;
-
-fail_osdc:
-       ceph_osdc_stop(&client->osdc);
-fail_monc:
-       ceph_monc_stop(&client->monc);
-fail_mempool:
-       mempool_destroy(client->wb_pagevec_pool);
  fail_trunc_wq:
-       destroy_workqueue(client->trunc_wq);
+       destroy_workqueue(fsc->trunc_wq);
  fail_pg_inv_wq:
-       destroy_workqueue(client->pg_inv_wq);
+       destroy_workqueue(fsc->pg_inv_wq);
  fail_wb_wq:
-       destroy_workqueue(client->wb_wq);
+       destroy_workqueue(fsc->wb_wq);
  fail_bdi:
-       bdi_destroy(&client->backing_dev_info);
+       bdi_destroy(&fsc->backing_dev_info);
+fail_client:
+       ceph_destroy_client(fsc->client);
  fail:
-       kfree(client);
+       kfree(fsc);
         return ERR_PTR(err);
  }
  
-static void ceph_destroy_client(struct ceph_client *client)
+void destroy_fs_client(struct ceph_fs_client *fsc)
  {
-       dout("destroy_client %p\n", client);
+       dout("destroy_fs_client %p\n", fsc);
  
-       /* unmount */
-       ceph_mdsc_stop(&client->mdsc);
-       ceph_osdc_stop(&client->osdc);
+       destroy_workqueue(fsc->wb_wq);
+       destroy_workqueue(fsc->pg_inv_wq);
+       destroy_workqueue(fsc->trunc_wq);
  
-       /*
-        * make sure mds and osd connections close out before destroying
-        * the auth module, which is needed to free those connections'
-        * ceph_authorizers.
-        */
-       ceph_msgr_flush();
-
-       ceph_monc_stop(&client->monc);
+       bdi_destroy(&fsc->backing_dev_info);
  
-       ceph_debugfs_client_cleanup(client);
-       destroy_workqueue(client->wb_wq);
-       destroy_workqueue(client->pg_inv_wq);
-       destroy_workqueue(client->trunc_wq);
+       mempool_destroy(fsc->wb_pagevec_pool);
  
-       bdi_destroy(&client->backing_dev_info);
+       destroy_mount_options(fsc->mount_options);
  
-       if (client->msgr)
-               ceph_messenger_destroy(client->msgr);
-       mempool_destroy(client->wb_pagevec_pool);
+       ceph_fs_debugfs_cleanup(fsc);
  
-       destroy_mount_args(client->mount_args);
+       ceph_destroy_client(fsc->client);
  
-       kfree(client);
-       dout("destroy_client %p done\n", client);
+       kfree(fsc);
+       dout("destroy_fs_client %p done\n", fsc);
  }
  
  /*
- * Initially learn our fsid, or verify an fsid matches.
+ * caches
   */
-int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
+struct kmem_cache *ceph_inode_cachep;
+struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_dentry_cachep;
+struct kmem_cache *ceph_file_cachep;
+
+static void ceph_inode_init_once(void *foo)
  {
-       if (client->have_fsid) {
-               if (ceph_fsid_compare(&client->fsid, fsid)) {
-                       pr_err("bad fsid, had %pU got %pU",
-                              &client->fsid, fsid);
-                       return -1;
-               }
-       } else {
-               pr_info("client%lld fsid %pU\n", client->monc.auth->global_id,
-                       fsid);
-               memcpy(&client->fsid, fsid, sizeof(*fsid));
-               ceph_debugfs_client_init(client);
-               client->have_fsid = true;
-       }
+       struct ceph_inode_info *ci = foo;
+       inode_init_once(&ci->vfs_inode);
+}
+
+static int __init init_caches(void)
+{
+       ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
+                                     sizeof(struct ceph_inode_info),
+                                     __alignof__(struct ceph_inode_info),
+                                     (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+                                     ceph_inode_init_once);
+       if (ceph_inode_cachep == NULL)
+               return -ENOMEM;
+
+       ceph_cap_cachep = KMEM_CACHE(ceph_cap,
+                                    SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+       if (ceph_cap_cachep == NULL)
+               goto bad_cap;
+
+       ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
+                                       SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+       if (ceph_dentry_cachep == NULL)
+               goto bad_dentry;
+
+       ceph_file_cachep = KMEM_CACHE(ceph_file_info,
+                                     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+       if (ceph_file_cachep == NULL)
+               goto bad_file;
+
         return 0;
+
+bad_file:
+       kmem_cache_destroy(ceph_dentry_cachep);
+bad_dentry:
+       kmem_cache_destroy(ceph_cap_cachep);
+bad_cap:
+       kmem_cache_destroy(ceph_inode_cachep);
+       return -ENOMEM;
  }
  
+static void destroy_caches(void)
+{
+       kmem_cache_destroy(ceph_inode_cachep);
+       kmem_cache_destroy(ceph_cap_cachep);
+       kmem_cache_destroy(ceph_dentry_cachep);
+       kmem_cache_destroy(ceph_file_cachep);
+}
+
+
  /*
- * true if we have the mon map (and have thus joined the cluster)
+ * ceph_umount_begin - initiate forced umount.  Tear down down the
+ * mount, skipping steps that may hang while waiting for server(s).
   */
-static int have_mon_and_osd_map(struct ceph_client *client)
+static void ceph_umount_begin(struct super_block *sb)
  {
-       return client->monc.monmap && client->monc.monmap->epoch &&
-              client->osdc.osdmap && client->osdc.osdmap->epoch;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+
+       dout("ceph_umount_begin - starting forced umount\n");
+       if (!fsc)
+               return;
+       fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
+       return;
  }
  
+static const struct super_operations ceph_super_ops = {
+       .alloc_inode    = ceph_alloc_inode,
+       .destroy_inode  = ceph_destroy_inode,
+       .write_inode    = ceph_write_inode,
+       .sync_fs        = ceph_sync_fs,
+       .put_super      = ceph_put_super,
+       .show_options   = ceph_show_options,
+       .statfs         = ceph_statfs,
+       .umount_begin   = ceph_umount_begin,
+};
+
  /*
   * Bootstrap mount by opening the root directory.  Note the mount
   * @started time from caller, and time out if this takes too long.
   */
-static struct dentry *open_root_dentry(struct ceph_client *client,
+static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
                                        const char *path,
                                        unsigned long started)
  {
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_mds_request *req = NULL;
         int err;
         struct dentry *root;
@@ -784,14 +609,14 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
         req->r_ino1.ino = CEPH_INO_ROOT;
         req->r_ino1.snap = CEPH_NOSNAP;
         req->r_started = started;
-       req->r_timeout = client->mount_args->mount_timeout * HZ;
+       req->r_timeout = fsc->client->options->mount_timeout * HZ;
         req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
         req->r_num_caps = 2;
         err = ceph_mdsc_do_request(mdsc, NULL, req);
         if (err == 0) {
                 dout("open_root_inode success\n");
                 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
-                   client->sb->s_root == NULL)
+                   fsc->sb->s_root == NULL)
                         root = d_alloc_root(req->r_target_inode);
                 else
                         root = d_obtain_alias(req->r_target_inode);
@@ -804,105 +629,86 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
         return root;
  }
  
+
+
+
  /*
   * mount: join the ceph cluster, and open root directory.
   */
-static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
+static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt,
                       const char *path)
  {
-       struct ceph_entity_addr *myaddr = NULL;
         int err;
-       unsigned long timeout = client->mount_args->mount_timeout * HZ;
         unsigned long started = jiffies;  /* note the start time */
         struct dentry *root;
+       int first = 0;   /* first vfsmount for this super_block */
  
         dout("mount start\n");
-       mutex_lock(&client->mount_mutex);
-
-       /* initialize the messenger */
-       if (client->msgr == NULL) {
-               if (ceph_test_opt(client, MYIP))
-                       myaddr = &client->mount_args->my_addr;
-               client->msgr = ceph_messenger_create(myaddr);
-               if (IS_ERR(client->msgr)) {
-                       err = PTR_ERR(client->msgr);
-                       client->msgr = NULL;
-                       goto out;
-               }
-               client->msgr->nocrc = ceph_test_opt(client, NOCRC);
-       }
+       mutex_lock(&fsc->client->mount_mutex);
  
-       /* open session, and wait for mon, mds, and osd maps */
-       err = ceph_monc_open_session(&client->monc);
+       err = __ceph_open_session(fsc->client, started);
         if (err < 0)
                 goto out;
  
-       while (!have_mon_and_osd_map(client)) {
-               err = -EIO;
-               if (timeout && time_after_eq(jiffies, started + timeout))
-                       goto out;
-
-               /* wait */
-               dout("mount waiting for mon_map\n");
-               err = wait_event_interruptible_timeout(client->auth_wq,
-                      have_mon_and_osd_map(client) || (client->auth_err < 0),
-                      timeout);
-               if (err == -EINTR || err == -ERESTARTSYS)
-                       goto out;
-               if (client->auth_err < 0) {
-                       err = client->auth_err;
-                       goto out;
-               }
-       }
-
         dout("mount opening root\n");
-       root = open_root_dentry(client, "", started);
+       root = open_root_dentry(fsc, "", started);
         if (IS_ERR(root)) {
                 err = PTR_ERR(root);
                 goto out;
         }
-       if (client->sb->s_root)
+       if (fsc->sb->s_root) {
                 dput(root);
-       else
-               client->sb->s_root = root;
+       } else {
+               fsc->sb->s_root = root;
+               first = 1;
+
+               err = ceph_fs_debugfs_init(fsc);
+               if (err < 0)
+                       goto fail;
+       }
  
         if (path[0] == 0) {
                 dget(root);
         } else {
                 dout("mount opening base mountpoint\n");
-               root = open_root_dentry(client, path, started);
+               root = open_root_dentry(fsc, path, started);
                 if (IS_ERR(root)) {
                         err = PTR_ERR(root);
-                       dput(client->sb->s_root);
-                       client->sb->s_root = NULL;
-                       goto out;
+                       goto fail;
                 }
         }
  
         mnt->mnt_root = root;
-       mnt->mnt_sb = client->sb;
+       mnt->mnt_sb = fsc->sb;
  
-       client->mount_state = CEPH_MOUNT_MOUNTED;
+       fsc->mount_state = CEPH_MOUNT_MOUNTED;
         dout("mount success\n");
         err = 0;
  
  out:
-       mutex_unlock(&client->mount_mutex);
+       mutex_unlock(&fsc->client->mount_mutex);
         return err;
+
+fail:
+       if (first) {
+               dput(fsc->sb->s_root);
+               fsc->sb->s_root = NULL;
+       }
+       goto out;
  }
  
  static int ceph_set_super(struct super_block *s, void *data)
  {
-       struct ceph_client *client = data;
+       struct ceph_fs_client *fsc = data;
         int ret;
  
         dout("set_super %p data %p\n", s, data);
  
-       s->s_flags = client->mount_args->sb_flags;
+       s->s_flags = fsc->mount_options->sb_flags;
         s->s_maxbytes = 1ULL << 40;  /* temp value until we get mdsmap */
  
-       s->s_fs_info = client;
-       client->sb = s;
+       s->s_fs_info = fsc;
+       fsc->sb = s;
  
         s->s_op = &ceph_super_ops;
         s->s_export_op = &ceph_export_ops;
@@ -917,7 +723,7 @@ static int ceph_set_super(struct super_block *s, void *data)
  
  fail:
         s->s_fs_info = NULL;
-       client->sb = NULL;
+       fsc->sb = NULL;
         return ret;
  }
  
@@ -926,30 +732,23 @@ fail:
   */
  static int ceph_compare_super(struct super_block *sb, void *data)
  {
-       struct ceph_client *new = data;
-       struct ceph_mount_args *args = new->mount_args;
-       struct ceph_client *other = ceph_sb_to_client(sb);
-       int i;
+       struct ceph_fs_client *new = data;
+       struct ceph_mount_options *fsopt = new->mount_options;
+       struct ceph_options *opt = new->client->options;
+       struct ceph_fs_client *other = ceph_sb_to_client(sb);
  
         dout("ceph_compare_super %p\n", sb);
-       if (args->flags & CEPH_OPT_FSID) {
-               if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
-                       dout("fsid doesn't match\n");
-                       return 0;
-               }
-       } else {
-               /* do we share (a) monitor? */
-               for (i = 0; i < new->monc.monmap->num_mon; i++)
-                       if (ceph_monmap_contains(other->monc.monmap,
-                                        &new->monc.monmap->mon_inst[i].addr))
-                               break;
-               if (i == new->monc.monmap->num_mon) {
-                       dout("mon ip not part of monmap\n");
-                       return 0;
-               }
-               dout("mon ip matches existing sb %p\n", sb);
+
+       if (compare_mount_options(fsopt, opt, other)) {
+               dout("monitor(s)/mount options don't match\n");
+               return 0;
         }
-       if (args->sb_flags != other->mount_args->sb_flags) {
+       if ((opt->flags & CEPH_OPT_FSID) &&
+           ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
+               dout("fsid doesn't match\n");
+               return 0;
+       }
+       if (fsopt->sb_flags != other->mount_options->sb_flags) {
                 dout("flags differ\n");
                 return 0;
         }
@@ -961,19 +760,20 @@ static int ceph_compare_super(struct super_block *sb, void *data)
   */
  static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
  
-static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
+static int ceph_register_bdi(struct super_block *sb,
+                            struct ceph_fs_client *fsc)
  {
         int err;
  
         /* set ra_pages based on rsize mount option? */
-       if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
-               client->backing_dev_info.ra_pages =
-                       (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
+       if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
+               fsc->backing_dev_info.ra_pages =
+                       (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
                         >> PAGE_SHIFT;
-       err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d",
+       err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
                            atomic_long_inc_return(&bdi_seq));
         if (!err)
-               sb->s_bdi = &client->backing_dev_info;
+               sb->s_bdi = &fsc->backing_dev_info;
         return err;
  }
  
@@ -982,46 +782,52 @@ static int ceph_get_sb(struct file_system_type *fs_type,
                        struct vfsmount *mnt)
  {
         struct super_block *sb;
-       struct ceph_client *client;
+       struct ceph_fs_client *fsc;
         int err;
         int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
         const char *path = NULL;
-       struct ceph_mount_args *args;
+       struct ceph_mount_options *fsopt = NULL;
+       struct ceph_options *opt = NULL;
  
         dout("ceph_get_sb\n");
-       args = parse_mount_args(flags, data, dev_name, &path);
-       if (IS_ERR(args)) {
-               err = PTR_ERR(args);
+       err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
+       if (err < 0)
                 goto out_final;
-       }
  
         /* create client (which we may/may not use) */
-       client = ceph_create_client(args);
-       if (IS_ERR(client)) {
-               err = PTR_ERR(client);
+       fsc = create_fs_client(fsopt, opt);
+       if (IS_ERR(fsc)) {
+               err = PTR_ERR(fsc);
+               kfree(fsopt);
+               kfree(opt);
                 goto out_final;
         }
  
-       if (client->mount_args->flags & CEPH_OPT_NOSHARE)
+       err = ceph_mdsc_init(fsc);
+       if (err < 0)
+               goto out;
+
+       if (ceph_test_opt(fsc->client, NOSHARE))
                 compare_super = NULL;
-       sb = sget(fs_type, compare_super, ceph_set_super, client);
+       sb = sget(fs_type, compare_super, ceph_set_super, fsc);
         if (IS_ERR(sb)) {
                 err = PTR_ERR(sb);
                 goto out;
         }
  
-       if (ceph_sb_to_client(sb) != client) {
-               ceph_destroy_client(client);
-               client = ceph_sb_to_client(sb);
-               dout("get_sb got existing client %p\n", client);
+       if (ceph_sb_to_client(sb) != fsc) {
+               ceph_mdsc_destroy(fsc);
+               destroy_fs_client(fsc);
+               fsc = ceph_sb_to_client(sb);
+               dout("get_sb got existing client %p\n", fsc);
         } else {
-               dout("get_sb using new client %p\n", client);
-               err = ceph_register_bdi(sb, client);
+               dout("get_sb using new client %p\n", fsc);
+               err = ceph_register_bdi(sb, fsc);
                 if (err < 0)
                         goto out_splat;
         }
  
-       err = ceph_mount(client, mnt, path);
+       err = ceph_mount(fsc, mnt, path);
         if (err < 0)
                 goto out_splat;
         dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
@@ -1029,12 +835,13 @@ static int ceph_get_sb(struct file_system_type *fs_type,
         return 0;
  
  out_splat:
-       ceph_mdsc_close_sessions(&client->mdsc);
+       ceph_mdsc_close_sessions(fsc->mdsc);
         deactivate_locked_super(sb);
         goto out_final;
  
  out:
-       ceph_destroy_client(client);
+       ceph_mdsc_destroy(fsc);
+       destroy_fs_client(fsc);
  out_final:
         dout("ceph_get_sb fail %d\n", err);
         return err;
@@ -1042,11 +849,12 @@ out_final:
  
  static void ceph_kill_sb(struct super_block *s)
  {
-       struct ceph_client *client = ceph_sb_to_client(s);
+       struct ceph_fs_client *fsc = ceph_sb_to_client(s);
         dout("kill_sb %p\n", s);
-       ceph_mdsc_pre_umount(&client->mdsc);
+       ceph_mdsc_pre_umount(fsc->mdsc);
         kill_anon_super(s);    /* will call put_super after sb is r/o */
-       ceph_destroy_client(client);
+       ceph_mdsc_destroy(fsc);
+       destroy_fs_client(fsc);
  }
  
  static struct file_system_type ceph_fs_type = {
@@ -1062,36 +870,20 @@ static struct file_system_type ceph_fs_type = {
  
  static int __init init_ceph(void)
  {
-       int ret = 0;
-
-       ret = ceph_debugfs_init();
-       if (ret < 0)
-               goto out;
-
-       ret = ceph_msgr_init();
-       if (ret < 0)
-               goto out_debugfs;
-
-       ret = init_caches();
+       int ret = init_caches();
         if (ret)
-               goto out_msgr;
+               goto out;
  
         ret = register_filesystem(&ceph_fs_type);
         if (ret)
                 goto out_icache;
  
-       pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n",
-               CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL,
-               CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
-               CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
+       pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
+
         return 0;
  
  out_icache:
         destroy_caches();
-out_msgr:
-       ceph_msgr_exit();
-out_debugfs:
-       ceph_debugfs_cleanup();
  out:
         return ret;
  }
@@ -1101,8 +893,6 @@ static void __exit exit_ceph(void)
         dout("exit_ceph\n");
         unregister_filesystem(&ceph_fs_type);
         destroy_caches();
-       ceph_msgr_exit();
-       ceph_debugfs_cleanup();
  }
  
  module_init(init_ceph);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h

index b87638e84c4bc266e9b325f12e7fb98fc78e8986..1886294e12f7a3106c00f3376d92e43330514081 100644 (file)
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1,7 +1,7 @@
  #ifndef _FS_CEPH_SUPER_H
  #define _FS_CEPH_SUPER_H
  
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <asm/unaligned.h>
  #include <linux/backing-dev.h>
@@ -14,13 +14,7 @@
  #include <linux/writeback.h>
  #include <linux/slab.h>
  
-#include "types.h"
-#include "messenger.h"
-#include "msgpool.h"
-#include "mon_client.h"
-#include "mds_client.h"
-#include "osd_client.h"
-#include "ceph_fs.h"
+#include <linux/ceph/libceph.h>
  
  /* f_type in struct statfs */
  #define CEPH_SUPER_MAGIC 0x00c36400
@@ -30,42 +24,25 @@
  #define CEPH_BLOCK_SHIFT   20  /* 1 MB */
  #define CEPH_BLOCK         (1 << CEPH_BLOCK_SHIFT)
  
-/*
- * Supported features
- */
-#define CEPH_FEATURE_SUPPORTED CEPH_FEATURE_NOSRCADDR | CEPH_FEATURE_FLOCK
-#define CEPH_FEATURE_REQUIRED  CEPH_FEATURE_NOSRCADDR
+#define CEPH_MOUNT_OPT_DIRSTAT         (1<<4) /* `cat dirname` for stats */
+#define CEPH_MOUNT_OPT_RBYTES          (1<<5) /* dir st_bytes = rbytes */
+#define CEPH_MOUNT_OPT_NOASYNCREADDIR  (1<<7) /* no dcache readdir */
  
-/*
- * mount options
- */
-#define CEPH_OPT_FSID             (1<<0)
-#define CEPH_OPT_NOSHARE          (1<<1) /* don't share client with other sbs */
-#define CEPH_OPT_MYIP             (1<<2) /* specified my ip */
-#define CEPH_OPT_DIRSTAT          (1<<4) /* funky `cat dirname` for stats */
-#define CEPH_OPT_RBYTES           (1<<5) /* dir st_bytes = rbytes */
-#define CEPH_OPT_NOCRC            (1<<6) /* no data crc on writes */
-#define CEPH_OPT_NOASYNCREADDIR   (1<<7) /* no dcache readdir */
+#define CEPH_MOUNT_OPT_DEFAULT    (CEPH_MOUNT_OPT_RBYTES)
  
-#define CEPH_OPT_DEFAULT   (CEPH_OPT_RBYTES)
+#define ceph_set_mount_opt(fsc, opt) \
+       (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
+#define ceph_test_mount_opt(fsc, opt) \
+       (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
  
-#define ceph_set_opt(client, opt) \
-       (client)->mount_args->flags |= CEPH_OPT_##opt;
-#define ceph_test_opt(client, opt) \
-       (!!((client)->mount_args->flags & CEPH_OPT_##opt))
+#define CEPH_MAX_READDIR_DEFAULT        1024
+#define CEPH_MAX_READDIR_BYTES_DEFAULT  (512*1024)
+#define CEPH_SNAPDIRNAME_DEFAULT        ".snap"
  
-
-struct ceph_mount_args {
-       int sb_flags;
+struct ceph_mount_options {
         int flags;
-       struct ceph_fsid fsid;
-       struct ceph_entity_addr my_addr;
-       int num_mon;
-       struct ceph_entity_addr *mon_addr;
-       int mount_timeout;
-       int osd_idle_ttl;
-       int osd_timeout;
-       int osd_keepalive_timeout;
+       int sb_flags;
+
         int wsize;
         int rsize;            /* max readahead */
         int congestion_kb;    /* max writeback in flight */
@@ -73,82 +50,25 @@ struct ceph_mount_args {
         int cap_release_safety;
         int max_readdir;       /* max readdir result (entires) */
         int max_readdir_bytes; /* max readdir result (bytes) */
-       char *snapdir_name;   /* default ".snap" */
-       char *name;
-       char *secret;
-};
  
-/*
- * defaults
- */
-#define CEPH_MOUNT_TIMEOUT_DEFAULT  60
-#define CEPH_OSD_TIMEOUT_DEFAULT    60  /* seconds */
-#define CEPH_OSD_KEEPALIVE_DEFAULT  5
-#define CEPH_OSD_IDLE_TTL_DEFAULT    60
-#define CEPH_MOUNT_RSIZE_DEFAULT    (512*1024) /* readahead */
-#define CEPH_MAX_READDIR_DEFAULT    1024
-#define CEPH_MAX_READDIR_BYTES_DEFAULT    (512*1024)
-
-#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
-#define CEPH_MSG_MAX_DATA_LEN  (16*1024*1024)
-
-#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
-#define CEPH_AUTH_NAME_DEFAULT   "guest"
-/*
- * Delay telling the MDS we no longer want caps, in case we reopen
- * the file.  Delay a minimum amount of time, even if we send a cap
- * message for some other reason.  Otherwise, take the oppotunity to
- * update the mds to avoid sending another message later.
- */
-#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT      5  /* cap release delay */
-#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT     60  /* cap release delay */
-
-#define CEPH_CAP_RELEASE_SAFETY_DEFAULT        (CEPH_CAPS_PER_RELEASE * 4)
-
-/* mount state */
-enum {
-       CEPH_MOUNT_MOUNTING,
-       CEPH_MOUNT_MOUNTED,
-       CEPH_MOUNT_UNMOUNTING,
-       CEPH_MOUNT_UNMOUNTED,
-       CEPH_MOUNT_SHUTDOWN,
-};
-
-/*
- * subtract jiffies
- */
-static inline unsigned long time_sub(unsigned long a, unsigned long b)
-{
-       BUG_ON(time_after(b, a));
-       return (long)a - (long)b;
-}
-
-/*
- * per-filesystem client state
- *
- * possibly shared by multiple mount points, if they are
- * mounting the same ceph filesystem/cluster.
- */
-struct ceph_client {
-       struct ceph_fsid fsid;
-       bool have_fsid;
+       /*
+        * everything above this point can be memcmp'd; everything below
+        * is handled in compare_mount_options()
+        */
  
-       struct mutex mount_mutex;       /* serialize mount attempts */
-       struct ceph_mount_args *mount_args;
+       char *snapdir_name;   /* default ".snap" */
+};
  
+struct ceph_fs_client {
         struct super_block *sb;
  
-       unsigned long mount_state;
-       wait_queue_head_t auth_wq;
-
-       int auth_err;
+       struct ceph_mount_options *mount_options;
+       struct ceph_client *client;
  
+       unsigned long mount_state;
         int min_caps;                  /* min caps i added */
  
-       struct ceph_messenger *msgr;   /* messenger instance */
-       struct ceph_mon_client monc;
-       struct ceph_mds_client mdsc;
-       struct ceph_osd_client osdc;
+       struct ceph_mds_client *mdsc;
  
         /* writeback */
         mempool_t *wb_pagevec_pool;
@@ -160,14 +80,14 @@ struct ceph_client {
         struct backing_dev_info backing_dev_info;
  
  #ifdef CONFIG_DEBUG_FS
-       struct dentry *debugfs_monmap;
-       struct dentry *debugfs_mdsmap, *debugfs_osdmap;
-       struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
+       struct dentry *debugfs_dentry_lru, *debugfs_caps;
         struct dentry *debugfs_congestion_kb;
         struct dentry *debugfs_bdi;
+       struct dentry *debugfs_mdsc, *debugfs_mdsmap;
  #endif
  };
  
+
  /*
   * File i/o capability.  This tracks shared state with the metadata
   * server that allows us to cache or writeback attributes or to read
@@ -275,6 +195,20 @@ struct ceph_inode_xattr {
         int should_free_val;
  };
  
+/*
+ * Ceph dentry state
+ */
+struct ceph_dentry_info {
+       struct ceph_mds_session *lease_session;
+       u32 lease_gen, lease_shared_gen;
+       u32 lease_seq;
+       unsigned long lease_renew_after, lease_renew_from;
+       struct list_head lru;
+       struct dentry *dentry;
+       u64 time;
+       u64 offset;
+};
+
  struct ceph_inode_xattrs_info {
         /*
          * (still encoded) xattr blob. we avoid the overhead of parsing
@@ -296,11 +230,6 @@ struct ceph_inode_xattrs_info {
  /*
   * Ceph inode.
   */
-#define CEPH_I_COMPLETE  1  /* we have complete directory cached */
-#define CEPH_I_NODELAY   4  /* do not delay cap release */
-#define CEPH_I_FLUSH     8  /* do not delay flush of dirty metadata */
-#define CEPH_I_NOFLUSH  16  /* do not flush dirty caps */
-
  struct ceph_inode_info {
         struct ceph_vino i_vino;   /* ceph ino + snap */
  
@@ -391,6 +320,63 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
         return container_of(inode, struct ceph_inode_info, vfs_inode);
  }
  
+static inline struct ceph_vino ceph_vino(struct inode *inode)
+{
+       return ceph_inode(inode)->i_vino;
+}
+
+/*
+ * ino_t is <64 bits on many architectures, blech.
+ *
+ * don't include snap in ino hash, at least for now.
+ */
+static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
+{
+       ino_t ino = (ino_t)vino.ino;  /* ^ (vino.snap << 20); */
+#if BITS_PER_LONG == 32
+       ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
+       if (!ino)
+               ino = 1;
+#endif
+       return ino;
+}
+
+/* for printf-style formatting */
+#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
+
+static inline u64 ceph_ino(struct inode *inode)
+{
+       return ceph_inode(inode)->i_vino.ino;
+}
+static inline u64 ceph_snap(struct inode *inode)
+{
+       return ceph_inode(inode)->i_vino.snap;
+}
+
+static inline int ceph_ino_compare(struct inode *inode, void *data)
+{
+       struct ceph_vino *pvino = (struct ceph_vino *)data;
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       return ci->i_vino.ino == pvino->ino &&
+               ci->i_vino.snap == pvino->snap;
+}
+
+static inline struct inode *ceph_find_inode(struct super_block *sb,
+                                           struct ceph_vino vino)
+{
+       ino_t t = ceph_vino_to_ino(vino);
+       return ilookup5(sb, t, ceph_ino_compare, &vino);
+}
+
+
+/*
+ * Ceph inode.
+ */
+#define CEPH_I_COMPLETE  1  /* we have complete directory cached */
+#define CEPH_I_NODELAY   4  /* do not delay cap release */
+#define CEPH_I_FLUSH     8  /* do not delay flush of dirty metadata */
+#define CEPH_I_NOFLUSH  16  /* do not flush dirty caps */
+
  static inline void ceph_i_clear(struct inode *inode, unsigned mask)
  {
         struct ceph_inode_info *ci = ceph_inode(inode);
@@ -414,8 +400,9 @@ static inline bool ceph_i_test(struct inode *inode, unsigned mask)
         struct ceph_inode_info *ci = ceph_inode(inode);
         bool r;
  
-       smp_mb();
+       spin_lock(&inode->i_lock);
         r = (ci->i_ceph_flags & mask) == mask;
+       spin_unlock(&inode->i_lock);
         return r;
  }
  
@@ -432,20 +419,6 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
                             struct ceph_inode_frag *pfrag,
                             int *found);
  
-/*
- * Ceph dentry state
- */
-struct ceph_dentry_info {
-       struct ceph_mds_session *lease_session;
-       u32 lease_gen, lease_shared_gen;
-       u32 lease_seq;
-       unsigned long lease_renew_after, lease_renew_from;
-       struct list_head lru;
-       struct dentry *dentry;
-       u64 time;
-       u64 offset;
-};
-
  static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
  {
         return (struct ceph_dentry_info *)dentry->d_fsdata;
@@ -456,22 +429,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
         return ((loff_t)frag << 32) | (loff_t)off;
  }
  
-/*
- * ino_t is <64 bits on many architectures, blech.
- *
- * don't include snap in ino hash, at least for now.
- */
-static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
-{
-       ino_t ino = (ino_t)vino.ino;  /* ^ (vino.snap << 20); */
-#if BITS_PER_LONG == 32
-       ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
-       if (!ino)
-               ino = 1;
-#endif
-       return ino;
-}
-
  static inline int ceph_set_ino_cb(struct inode *inode, void *data)
  {
         ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
@@ -479,39 +436,6 @@ static inline int ceph_set_ino_cb(struct inode *inode, void *data)
         return 0;
  }
  
-static inline struct ceph_vino ceph_vino(struct inode *inode)
-{
-       return ceph_inode(inode)->i_vino;
-}
-
-/* for printf-style formatting */
-#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
-
-static inline u64 ceph_ino(struct inode *inode)
-{
-       return ceph_inode(inode)->i_vino.ino;
-}
-static inline u64 ceph_snap(struct inode *inode)
-{
-       return ceph_inode(inode)->i_vino.snap;
-}
-
-static inline int ceph_ino_compare(struct inode *inode, void *data)
-{
-       struct ceph_vino *pvino = (struct ceph_vino *)data;
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       return ci->i_vino.ino == pvino->ino &&
-               ci->i_vino.snap == pvino->snap;
-}
-
-static inline struct inode *ceph_find_inode(struct super_block *sb,
-                                           struct ceph_vino vino)
-{
-       ino_t t = ceph_vino_to_ino(vino);
-       return ilookup5(sb, t, ceph_ino_compare, &vino);
-}
-
-
  /*
   * caps helpers
   */
@@ -576,18 +500,18 @@ extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
                              struct ceph_cap_reservation *ctx, int need);
  extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
                                struct ceph_cap_reservation *ctx);
-extern void ceph_reservation_status(struct ceph_client *client,
+extern void ceph_reservation_status(struct ceph_fs_client *client,
                                     int *total, int *avail, int *used,
                                     int *reserved, int *min);
  
-static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
+static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
  {
-       return (struct ceph_client *)inode->i_sb->s_fs_info;
+       return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
  }
  
-static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
+static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
  {
-       return (struct ceph_client *)sb->s_fs_info;
+       return (struct ceph_fs_client *)sb->s_fs_info;
  }
  
  
@@ -616,51 +540,6 @@ struct ceph_file_info {
  
  
  
-/*
- * snapshots
- */
-
-/*
- * A "snap context" is the set of existing snapshots when we
- * write data.  It is used by the OSD to guide its COW behavior.
- *
- * The ceph_snap_context is refcounted, and attached to each dirty
- * page, indicating which context the dirty data belonged when it was
- * dirtied.
- */
-struct ceph_snap_context {
-       atomic_t nref;
-       u64 seq;
-       int num_snaps;
-       u64 snaps[];
-};
-
-static inline struct ceph_snap_context *
-ceph_get_snap_context(struct ceph_snap_context *sc)
-{
-       /*
-       printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
-              atomic_read(&sc->nref)+1);
-       */
-       if (sc)
-               atomic_inc(&sc->nref);
-       return sc;
-}
-
-static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
-{
-       if (!sc)
-               return;
-       /*
-       printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
-              atomic_read(&sc->nref)-1);
-       */
-       if (atomic_dec_and_test(&sc->nref)) {
-               /*printk(" deleting snap_context %p\n", sc);*/
-               kfree(sc);
-       }
-}
-
  /*
   * A "snap realm" describes a subset of the file hierarchy sharing
   * the same set of snapshots that apply to it.  The realms themselves
@@ -699,16 +578,33 @@ struct ceph_snap_realm {
         spinlock_t inodes_with_caps_lock;
  };
  
-
-
-/*
- * calculate the number of pages a given length and offset map onto,
- * if we align the data.
- */
-static inline int calc_pages_for(u64 off, u64 len)
+static inline int default_congestion_kb(void)
  {
-       return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
-               (off >> PAGE_CACHE_SHIFT);
+       int congestion_kb;
+
+       /*
+        * Copied from NFS
+        *
+        * congestion size, scale with available memory.
+        *
+        *  64MB:    8192k
+        * 128MB:   11585k
+        * 256MB:   16384k
+        * 512MB:   23170k
+        *   1GB:   32768k
+        *   2GB:   46340k
+        *   4GB:   65536k
+        *   8GB:   92681k
+        *  16GB:  131072k
+        *
+        * This allows larger machines to have larger/more transfers.
+        * Limit the default to 256M
+        */
+       congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+       if (congestion_kb > 256*1024)
+               congestion_kb = 256*1024;
+
+       return congestion_kb;
  }
  
  
@@ -741,16 +637,6 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
                            ci_item)->writing;
  }
  
-
-/* super.c */
-extern struct kmem_cache *ceph_inode_cachep;
-extern struct kmem_cache *ceph_cap_cachep;
-extern struct kmem_cache *ceph_dentry_cachep;
-extern struct kmem_cache *ceph_file_cachep;
-
-extern const char *ceph_msg_type_name(int type);
-extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
-
  /* inode.c */
  extern const struct inode_operations ceph_file_iops;
  
@@ -857,12 +743,18 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
  /* file.c */
  extern const struct file_operations ceph_file_fops;
  extern const struct address_space_operations ceph_aops;
+extern int ceph_copy_to_page_vector(struct page **pages,
+                                   const char *data,
+                                   loff_t off, size_t len);
+extern int ceph_copy_from_page_vector(struct page **pages,
+                                   char *data,
+                                   loff_t off, size_t len);
+extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
  extern int ceph_open(struct inode *inode, struct file *file);
  extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
                                        struct nameidata *nd, int mode,
                                        int locked_dir);
  extern int ceph_release(struct inode *inode, struct file *filp);
-extern void ceph_release_page_vector(struct page **pages, int num_pages);
  
  /* dir.c */
  extern const struct file_operations ceph_dir_fops;
@@ -892,12 +784,6 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
  /* export.c */
  extern const struct export_operations ceph_export_ops;
  
-/* debugfs.c */
-extern int ceph_debugfs_init(void);
-extern void ceph_debugfs_cleanup(void);
-extern int ceph_debugfs_client_init(struct ceph_client *client);
-extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
-
  /* locks.c */
  extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
  extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
@@ -914,4 +800,8 @@ static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
         return NULL;
  }
  
+/* debugfs.c */
+extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
+extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
+
  #endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c

index 9578af610b73fb48b69872ddeed222e58c8340f0..6e12a6ba5f79daabc1bc455a3a4db464b240c00a 100644 (file)
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1,6 +1,9 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
+
  #include "super.h"
-#include "decode.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
  
  #include <linux/xattr.h>
  #include <linux/slab.h>
@@ -620,12 +623,12 @@ out:
  static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
                               const char *value, size_t size, int flags)
  {
-       struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
+       struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
         struct inode *inode = dentry->d_inode;
         struct ceph_inode_info *ci = ceph_inode(inode);
         struct inode *parent_inode = dentry->d_parent->d_inode;
         struct ceph_mds_request *req;
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         int err;
         int i, nr_pages;
         struct page **pages = NULL;
@@ -713,10 +716,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
  
         /* preallocate memory for xattr name, value, index node */
         err = -ENOMEM;
-       newname = kmalloc(name_len + 1, GFP_NOFS);
+       newname = kmemdup(name, name_len + 1, GFP_NOFS);
         if (!newname)
                 goto out;
-       memcpy(newname, name, name_len + 1);
  
         if (val_len) {
                 newval = kmalloc(val_len + 1, GFP_NOFS);
@@ -777,8 +779,8 @@ out:
  
  static int ceph_send_removexattr(struct dentry *dentry, const char *name)
  {
-       struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct inode *inode = dentry->d_inode;
         struct inode *parent_inode = dentry->d_parent->d_inode;
         struct ceph_mds_request *req;
diff --git a/fs/exec.c b/fs/exec.c

index 828dd2461d6beb7c37ed7df74ef11177c2bcba6d..6d2b6f93685813ba2b2119dc71c14a941061cf46 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -2014,3 +2014,43 @@ fail_creds:
  fail:
         return;
  }
+
+/*
+ * Core dumping helper functions.  These are the only things you should
+ * do on a core-file: use only these functions to write out all the
+ * necessary info.
+ */
+int dump_write(struct file *file, const void *addr, int nr)
+{
+       return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
+}
+EXPORT_SYMBOL(dump_write);
+
+int dump_seek(struct file *file, loff_t off)
+{
+       int ret = 1;
+
+       if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
+               if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
+                       return 0;
+       } else {
+               char *buf = (char *)get_zeroed_page(GFP_KERNEL);
+
+               if (!buf)
+                       return 0;
+               while (off > 0) {
+                       unsigned long n = off;
+
+                       if (n > PAGE_SIZE)
+                               n = PAGE_SIZE;
+                       if (!dump_write(file, buf, n)) {
+                               ret = 0;
+                               break;
+                       }
+                       off -= n;
+               }
+               free_page((unsigned long)buf);
+       }
+       return ret;
+}
+EXPORT_SYMBOL(dump_seek);
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig

index cc9665522148a730b010953596cc24edefc00ed6..c465ae066c62c6392ee22047d2be0ec650b89c98 100644 (file)
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
  config GFS2_FS
         tristate "GFS2 file system support"
-       depends on EXPERIMENTAL && (64BIT || LBDAF)
+       depends on (64BIT || LBDAF)
         select DLM if GFS2_FS_LOCKING_DLM
         select CONFIGFS_FS if GFS2_FS_LOCKING_DLM
         select SYSFS if GFS2_FS_LOCKING_DLM
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c

index 194fe16d8418a332a274a74769b15277ff2d6858..6b24afb96aaedade304b48bb427e664eae8e6e53 100644 (file)
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -36,8 +36,8 @@
  #include "glops.h"
  
  
-static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
-                                  unsigned int from, unsigned int to)
+void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
+                           unsigned int from, unsigned int to)
  {
         struct buffer_head *head = page_buffers(page);
         unsigned int bsize = head->b_size;
@@ -615,7 +615,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
         unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
         int alloc_required;
         int error = 0;
-       struct gfs2_alloc *al;
+       struct gfs2_alloc *al = NULL;
         pgoff_t index = pos >> PAGE_CACHE_SHIFT;
         unsigned from = pos & (PAGE_CACHE_SIZE - 1);
         unsigned to = from + len;
@@ -663,6 +663,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
                 rblocks += RES_STATFS + RES_QUOTA;
         if (&ip->i_inode == sdp->sd_rindex)
                 rblocks += 2 * RES_STATFS;
+       if (alloc_required)
+               rblocks += gfs2_rg_blocks(al);
  
         error = gfs2_trans_begin(sdp, rblocks,
                                  PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
@@ -696,13 +698,11 @@ out:
  
         page_cache_release(page);
  
-       /*
-        * XXX(truncate): the call below should probably be replaced with
-        * a call to the gfs2-specific truncate blocks helper to actually
-        * release disk blocks..
-        */
+       gfs2_trans_end(sdp);
         if (pos + len > ip->i_inode.i_size)
-               truncate_setsize(&ip->i_inode, ip->i_inode.i_size);
+               gfs2_trim_blocks(&ip->i_inode);
+       goto out_trans_fail;
+
  out_endtrans:
         gfs2_trans_end(sdp);
  out_trans_fail:
@@ -802,10 +802,8 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
         page_cache_release(page);
  
         if (copied) {
-               if (inode->i_size < to) {
+               if (inode->i_size < to)
                         i_size_write(inode, to);
-                       ip->i_disksize = inode->i_size;
-               }
                 gfs2_dinode_out(ip, di);
                 mark_inode_dirty(inode);
         }
@@ -876,8 +874,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
  
         ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
         if (ret > 0) {
-               if (inode->i_size > ip->i_disksize)
-                       ip->i_disksize = inode->i_size;
                 gfs2_dinode_out(ip, dibh->b_data);
                 mark_inode_dirty(inode);
         }
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c

index 6f482809d1a35b4787e9cb62357958d532aeaa30..5476c066d4ee336733445eda2f804561179ecb41 100644 (file)
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -50,7 +50,7 @@ struct strip_mine {
   * @ip: the inode
   * @dibh: the dinode buffer
   * @block: the block number that was allocated
- * @private: any locked page held by the caller process
+ * @page: The (optional) page. This is looked up if @page is NULL
   *
   * Returns: errno
   */
@@ -109,8 +109,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
  /**
   * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
   * @ip: The GFS2 inode to unstuff
- * @unstuffer: the routine that handles unstuffing a non-zero length file
- * @private: private data for the unstuffer
+ * @page: The (optional) page. This is looked up if the @page is NULL
   *
   * This routine unstuffs a dinode and returns it to a "normal" state such
   * that the height can be grown in the traditional way.
@@ -132,7 +131,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
         if (error)
                 goto out;
  
-       if (ip->i_disksize) {
+       if (i_size_read(&ip->i_inode)) {
                 /* Get a free block, fill it with the stuffed data,
                    and write it out to disk */
  
@@ -161,7 +160,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
         di = (struct gfs2_dinode *)dibh->b_data;
         gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
  
-       if (ip->i_disksize) {
+       if (i_size_read(&ip->i_inode)) {
                 *(__be64 *)(di + 1) = cpu_to_be64(block);
                 gfs2_add_inode_blocks(&ip->i_inode, 1);
                 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
@@ -884,84 +883,15 @@ out:
         return error;
  }
  
-/**
- * do_grow - Make a file look bigger than it is
- * @ip: the inode
- * @size: the size to set the file to
- *
- * Called with an exclusive lock on @ip.
- *
- * Returns: errno
- */
-
-static int do_grow(struct gfs2_inode *ip, u64 size)
-{
-       struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-       struct gfs2_alloc *al;
-       struct buffer_head *dibh;
-       int error;
-
-       al = gfs2_alloc_get(ip);
-       if (!al)
-               return -ENOMEM;
-
-       error = gfs2_quota_lock_check(ip);
-       if (error)
-               goto out;
-
-       al->al_requested = sdp->sd_max_height + RES_DATA;
-
-       error = gfs2_inplace_reserve(ip);
-       if (error)
-               goto out_gunlock_q;
-
-       error = gfs2_trans_begin(sdp,
-                       sdp->sd_max_height + al->al_rgd->rd_length +
-                       RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
-       if (error)
-               goto out_ipres;
-
-       error = gfs2_meta_inode_buffer(ip, &dibh);
-       if (error)
-               goto out_end_trans;
-
-       if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
-               if (gfs2_is_stuffed(ip)) {
-                       error = gfs2_unstuff_dinode(ip, NULL);
-                       if (error)
-                               goto out_brelse;
-               }
-       }
-
-       ip->i_disksize = size;
-       ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-       gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-       gfs2_dinode_out(ip, dibh->b_data);
-
-out_brelse:
-       brelse(dibh);
-out_end_trans:
-       gfs2_trans_end(sdp);
-out_ipres:
-       gfs2_inplace_release(ip);
-out_gunlock_q:
-       gfs2_quota_unlock(ip);
-out:
-       gfs2_alloc_put(ip);
-       return error;
-}
-
-
  /**
   * gfs2_block_truncate_page - Deal with zeroing out data for truncate
   *
   * This is partly borrowed from ext3.
   */
-static int gfs2_block_truncate_page(struct address_space *mapping)
+static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
  {
         struct inode *inode = mapping->host;
         struct gfs2_inode *ip = GFS2_I(inode);
-       loff_t from = inode->i_size;
         unsigned long index = from >> PAGE_CACHE_SHIFT;
         unsigned offset = from & (PAGE_CACHE_SIZE-1);
         unsigned blocksize, iblock, length, pos;
@@ -1023,9 +953,11 @@ unlock:
         return err;
  }
  
-static int trunc_start(struct gfs2_inode *ip, u64 size)
+static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
  {
-       struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+       struct gfs2_inode *ip = GFS2_I(inode);
+       struct gfs2_sbd *sdp = GFS2_SB(inode);
+       struct address_space *mapping = inode->i_mapping;
         struct buffer_head *dibh;
         int journaled = gfs2_is_jdata(ip);
         int error;
@@ -1039,31 +971,26 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
         if (error)
                 goto out;
  
+       gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+
         if (gfs2_is_stuffed(ip)) {
-               u64 dsize = size + sizeof(struct gfs2_dinode);
-               ip->i_disksize = size;
-               ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-               gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-               gfs2_dinode_out(ip, dibh->b_data);
-               if (dsize > dibh->b_size)
-                       dsize = dibh->b_size;
-               gfs2_buffer_clear_tail(dibh, dsize);
-               error = 1;
+               gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
         } else {
-               if (size & (u64)(sdp->sd_sb.sb_bsize - 1))
-                       error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
-
-               if (!error) {
-                       ip->i_disksize = size;
-                       ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-                       ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
-                       gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-                       gfs2_dinode_out(ip, dibh->b_data);
+               if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) {
+                       error = gfs2_block_truncate_page(mapping, newsize);
+                       if (error)
+                               goto out_brelse;
                 }
+               ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
         }
  
-       brelse(dibh);
+       i_size_write(inode, newsize);
+       ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+       gfs2_dinode_out(ip, dibh->b_data);
  
+       truncate_pagecache(inode, oldsize, newsize);
+out_brelse:
+       brelse(dibh);
  out:
         gfs2_trans_end(sdp);
         return error;
@@ -1123,7 +1050,7 @@ static int trunc_end(struct gfs2_inode *ip)
         if (error)
                 goto out;
  
-       if (!ip->i_disksize) {
+       if (!i_size_read(&ip->i_inode)) {
                 ip->i_height = 0;
                 ip->i_goal = ip->i_no_addr;
                 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
@@ -1143,92 +1070,154 @@ out:
  
  /**
   * do_shrink - make a file smaller
- * @ip: the inode
- * @size: the size to make the file
- * @truncator: function to truncate the last partial block
+ * @inode: the inode
+ * @oldsize: the current inode size
+ * @newsize: the size to make the file
   *
- * Called with an exclusive lock on @ip.
+ * Called with an exclusive lock on @inode. The @size must
+ * be equal to or smaller than the current inode size.
   *
   * Returns: errno
   */
  
-static int do_shrink(struct gfs2_inode *ip, u64 size)
+static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize)
  {
+       struct gfs2_inode *ip = GFS2_I(inode);
         int error;
  
-       error = trunc_start(ip, size);
+       error = trunc_start(inode, oldsize, newsize);
         if (error < 0)
                 return error;
-       if (error > 0)
+       if (gfs2_is_stuffed(ip))
                 return 0;
  
-       error = trunc_dealloc(ip, size);
-       if (!error)
+       error = trunc_dealloc(ip, newsize);
+       if (error == 0)
                 error = trunc_end(ip);
  
         return error;
  }
  
-static int do_touch(struct gfs2_inode *ip, u64 size)
+void gfs2_trim_blocks(struct inode *inode)
  {
-       struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+       u64 size = inode->i_size;
+       int ret;
+
+       ret = do_shrink(inode, size, size);
+       WARN_ON(ret != 0);
+}
+
+/**
+ * do_grow - Touch and update inode size
+ * @inode: The inode
+ * @size: The new size
+ *
+ * This function updates the timestamps on the inode and
+ * may also increase the size of the inode. This function
+ * must not be called with @size any smaller than the current
+ * inode size.
+ *
+ * Although it is not strictly required to unstuff files here,
+ * earlier versions of GFS2 have a bug in the stuffed file reading
+ * code which will result in a buffer overrun if the size is larger
+ * than the max stuffed file size. In order to prevent this from
+ * occuring, such files are unstuffed, but in other cases we can
+ * just update the inode size directly.
+ *
+ * Returns: 0 on success, or -ve on error
+ */
+
+static int do_grow(struct inode *inode, u64 size)
+{
+       struct gfs2_inode *ip = GFS2_I(inode);
+       struct gfs2_sbd *sdp = GFS2_SB(inode);
         struct buffer_head *dibh;
+       struct gfs2_alloc *al = NULL;
         int error;
  
-       error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+       if (gfs2_is_stuffed(ip) &&
+           (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
+               al = gfs2_alloc_get(ip);
+               if (al == NULL)
+                       return -ENOMEM;
+
+               error = gfs2_quota_lock_check(ip);
+               if (error)
+                       goto do_grow_alloc_put;
+
+               al->al_requested = 1;
+               error = gfs2_inplace_reserve(ip);
+               if (error)
+                       goto do_grow_qunlock;
+       }
+
+       error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0);
         if (error)
-               return error;
+               goto do_grow_release;
  
-       down_write(&ip->i_rw_mutex);
+       if (al) {
+               error = gfs2_unstuff_dinode(ip, NULL);
+               if (error)
+                       goto do_end_trans;
+       }
  
         error = gfs2_meta_inode_buffer(ip, &dibh);
         if (error)
-               goto do_touch_out;
+               goto do_end_trans;
  
+       i_size_write(inode, size);
         ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
         gfs2_trans_add_bh(ip->i_gl, dibh, 1);
         gfs2_dinode_out(ip, dibh->b_data);
         brelse(dibh);
  
-do_touch_out:
-       up_write(&ip->i_rw_mutex);
+do_end_trans:
         gfs2_trans_end(sdp);
+do_grow_release:
+       if (al) {
+               gfs2_inplace_release(ip);
+do_grow_qunlock:
+               gfs2_quota_unlock(ip);
+do_grow_alloc_put:
+               gfs2_alloc_put(ip);
+       }
         return error;
  }
  
  /**
- * gfs2_truncatei - make a file a given size
- * @ip: the inode
- * @size: the size to make the file
- * @truncator: function to truncate the last partial block
+ * gfs2_setattr_size - make a file a given size
+ * @inode: the inode
+ * @newsize: the size to make the file
   *
- * The file size can grow, shrink, or stay the same size.
+ * The file size can grow, shrink, or stay the same size. This
+ * is called holding i_mutex and an exclusive glock on the inode
+ * in question.
   *
   * Returns: errno
   */
  
-int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
+int gfs2_setattr_size(struct inode *inode, u64 newsize)
  {
-       int error;
+       int ret;
+       u64 oldsize;
  
-       if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode)))
-               return -EINVAL;
+       BUG_ON(!S_ISREG(inode->i_mode));
  
-       if (size > ip->i_disksize)
-               error = do_grow(ip, size);
-       else if (size < ip->i_disksize)
-               error = do_shrink(ip, size);
-       else
-               /* update time stamps */
-               error = do_touch(ip, size);
+       ret = inode_newsize_ok(inode, newsize);
+       if (ret)
+               return ret;
  
-       return error;
+       oldsize = inode->i_size;
+       if (newsize >= oldsize)
+               return do_grow(inode, newsize);
+
+       return do_shrink(inode, oldsize, newsize);
  }
  
  int gfs2_truncatei_resume(struct gfs2_inode *ip)
  {
         int error;
-       error = trunc_dealloc(ip, ip->i_disksize);
+       error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
         if (!error)
                 error = trunc_end(ip);
         return error;
@@ -1269,7 +1258,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
  
         shift = sdp->sd_sb.sb_bsize_shift;
         BUG_ON(gfs2_is_dir(ip));
-       end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift;
+       end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
         lblock = offset >> shift;
         lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
         if (lblock_stop > end_of_file)
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h

index a20a5213135a50ae9293da676eb533e9c04063db..42fea03e2bd962b6674747967ac00499009d923f 100644 (file)
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -44,14 +44,16 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
         }
  }
  
-int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
-int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create);
-int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
-
-int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
-int gfs2_truncatei_resume(struct gfs2_inode *ip);
-int gfs2_file_dealloc(struct gfs2_inode *ip);
-int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
-                             unsigned int len);
+extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
+extern int gfs2_block_map(struct inode *inode, sector_t lblock,
+                         struct buffer_head *bh, int create);
+extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new,
+                          u64 *dblock, unsigned *extlen);
+extern int gfs2_setattr_size(struct inode *inode, u64 size);
+extern void gfs2_trim_blocks(struct inode *inode);
+extern int gfs2_truncatei_resume(struct gfs2_inode *ip);
+extern int gfs2_file_dealloc(struct gfs2_inode *ip);
+extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
+                                    unsigned int len);
  
  #endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c

index bb7907bde3d81b63b6ae5b198283a52db36e97b0..6798755b3858685b611da5e439393bcffb332563 100644 (file)
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -49,7 +49,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
                 ip = GFS2_I(inode);
         }
  
-       if (sdp->sd_args.ar_localcaching)
+       if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
                 goto valid;
  
         had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c

index b9dd88a78dd47073e3af1645a1e0fa3bcb94d1bb..5c356d09c321c10133afc7cf93aba2eddd1cb3c1 100644 (file)
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -79,6 +79,9 @@
  #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
  #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
  
+struct qstr gfs2_qdot __read_mostly;
+struct qstr gfs2_qdotdot __read_mostly;
+
  typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len,
                             u64 leaf_no, void *data);
  typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
@@ -127,8 +130,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
  
         gfs2_trans_add_bh(ip->i_gl, dibh, 1);
         memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
-       if (ip->i_disksize < offset + size)
-               ip->i_disksize = offset + size;
+       if (ip->i_inode.i_size < offset + size)
+               i_size_write(&ip->i_inode, offset + size);
         ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
         gfs2_dinode_out(ip, dibh->b_data);
  
@@ -225,8 +228,8 @@ out:
         if (error)
                 return error;
  
-       if (ip->i_disksize < offset + copied)
-               ip->i_disksize = offset + copied;
+       if (ip->i_inode.i_size < offset + copied)
+               i_size_write(&ip->i_inode, offset + copied);
         ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
  
         gfs2_trans_add_bh(ip->i_gl, dibh, 1);
@@ -275,12 +278,13 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
         unsigned int o;
         int copied = 0;
         int error = 0;
+       u64 disksize = i_size_read(&ip->i_inode);
  
-       if (offset >= ip->i_disksize)
+       if (offset >= disksize)
                 return 0;
  
-       if (offset + size > ip->i_disksize)
-               size = ip->i_disksize - offset;
+       if (offset + size > disksize)
+               size = disksize - offset;
  
         if (!size)
                 return 0;
@@ -727,7 +731,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
                 unsigned hsize = 1 << ip->i_depth;
                 unsigned index;
                 u64 ln;
-               if (hsize * sizeof(u64) != ip->i_disksize) {
+               if (hsize * sizeof(u64) != i_size_read(inode)) {
                         gfs2_consist_inode(ip);
                         return ERR_PTR(-EIO);
                 }
@@ -879,7 +883,7 @@ static int dir_make_exhash(struct inode *inode)
         for (x = sdp->sd_hash_ptrs; x--; lp++)
                 *lp = cpu_to_be64(bn);
  
-       dip->i_disksize = sdp->sd_sb.sb_bsize / 2;
+       i_size_write(inode, sdp->sd_sb.sb_bsize / 2);
         gfs2_add_inode_blocks(&dip->i_inode, 1);
         dip->i_diskflags |= GFS2_DIF_EXHASH;
  
@@ -1057,11 +1061,12 @@ static int dir_double_exhash(struct gfs2_inode *dip)
         u64 *buf;
         u64 *from, *to;
         u64 block;
+       u64 disksize = i_size_read(&dip->i_inode);
         int x;
         int error = 0;
  
         hsize = 1 << dip->i_depth;
-       if (hsize * sizeof(u64) != dip->i_disksize) {
+       if (hsize * sizeof(u64) != disksize) {
                 gfs2_consist_inode(dip);
                 return -EIO;
         }
@@ -1072,7 +1077,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
         if (!buf)
                 return -ENOMEM;
  
-       for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) {
+       for (block = disksize >> sdp->sd_hash_bsize_shift; block--;) {
                 error = gfs2_dir_read_data(dip, (char *)buf,
                                             block * sdp->sd_hash_bsize,
                                             sdp->sd_hash_bsize, 1);
@@ -1370,7 +1375,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
         unsigned depth = 0;
  
         hsize = 1 << dip->i_depth;
-       if (hsize * sizeof(u64) != dip->i_disksize) {
+       if (hsize * sizeof(u64) != i_size_read(inode)) {
                 gfs2_consist_inode(dip);
                 return -EIO;
         }
@@ -1784,7 +1789,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
         int error = 0;
  
         hsize = 1 << dip->i_depth;
-       if (hsize * sizeof(u64) != dip->i_disksize) {
+       if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) {
                 gfs2_consist_inode(dip);
                 return -EIO;
         }
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h

index 4f919440c3be3e20ed49c2acb742db6758096bea..a98f644bd3df33596cf2382767b89ca0cdd08161 100644 (file)
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -17,23 +17,24 @@ struct inode;
  struct gfs2_inode;
  struct gfs2_inum;
  
-struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename);
-int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
-                  const struct gfs2_inode *ip);
-int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
-                const struct gfs2_inode *ip, unsigned int type);
-int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
-int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
-                 filldir_t filldir);
-int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
-                  const struct gfs2_inode *nip, unsigned int new_type);
+extern struct inode *gfs2_dir_search(struct inode *dir,
+                                    const struct qstr *filename);
+extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
+                         const struct gfs2_inode *ip);
+extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
+                       const struct gfs2_inode *ip, unsigned int type);
+extern int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
+extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
+                        filldir_t filldir);
+extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
+                         const struct gfs2_inode *nip, unsigned int new_type);
  
-int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
+extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
  
-int gfs2_diradd_alloc_required(struct inode *dir,
-                              const struct qstr *filename);
-int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
-                           struct buffer_head **bhp);
+extern int gfs2_diradd_alloc_required(struct inode *dir,
+                                     const struct qstr *filename);
+extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
+                                  struct buffer_head **bhp);
  
  static inline u32 gfs2_disk_hash(const char *data, int len)
  {
@@ -61,4 +62,7 @@ static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct
         memcpy(dent + 1, name->name, name->len);
  }
  
+extern struct qstr gfs2_qdot;
+extern struct qstr gfs2_qdotdot;
+
  #endif /* __DIR_DOT_H__ */
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c

index dfe237a3f8ad9e2a0f11bae403ff1f1d8687cbd0..06d582732d3427a058864d03ca6cb667c5063481 100644 (file)
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -126,16 +126,9 @@ static int gfs2_get_name(struct dentry *parent, char *name,
  
  static struct dentry *gfs2_get_parent(struct dentry *child)
  {
-       struct qstr dotdot;
         struct dentry *dentry;
  
-       /*
-        * XXX(hch): it would be a good idea to keep this around as a
-        *           static variable.
-        */
-       gfs2_str2qstr(&dotdot, "..");
-
-       dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &dotdot, 1));
+       dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
         if (!IS_ERR(dentry))
                 dentry->d_op = &gfs2_dops;
         return dentry;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c

index 4edd662c8232b2c24d1f1767b937f258071f8211..237ee6a940df23ab0720ed99252657611d0be7ec 100644 (file)
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -382,8 +382,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
         rblocks = RES_DINODE + ind_blocks;
         if (gfs2_is_jdata(ip))
                 rblocks += data_blocks ? data_blocks : 1;
-       if (ind_blocks || data_blocks)
+       if (ind_blocks || data_blocks) {
                 rblocks += RES_STATFS + RES_QUOTA;
+               rblocks += gfs2_rg_blocks(al);
+       }
         ret = gfs2_trans_begin(sdp, rblocks, 0);
         if (ret)
                 goto out_trans_fail;
@@ -491,7 +493,7 @@ static int gfs2_open(struct inode *inode, struct file *file)
                         goto fail;
  
                 if (!(file->f_flags & O_LARGEFILE) &&
-                   ip->i_disksize > MAX_NON_LFS) {
+                   i_size_read(inode) > MAX_NON_LFS) {
                         error = -EOVERFLOW;
                         goto fail_gunlock;
                 }
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c

index 9adf8f924e08991c32d12938702e570f163c3215..87778857f0994fa504224d93c7c5611d10bc1e8c 100644 (file)
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -441,6 +441,8 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
                 else
                         gfs2_glock_put_nolock(gl);
         }
+       if (held1 && held2 && list_empty(&gl->gl_holders))
+               clear_bit(GLF_QUEUED, &gl->gl_flags);
  
         gl->gl_state = new_state;
         gl->gl_tchange = jiffies;
@@ -1012,6 +1014,7 @@ fail:
                 if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt))
                         insert_pt = &gh2->gh_list;
         }
+       set_bit(GLF_QUEUED, &gl->gl_flags);
         if (likely(insert_pt == NULL)) {
                 list_add_tail(&gh->gh_list, &gl->gl_holders);
                 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
@@ -1310,10 +1313,12 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
  
         gfs2_glock_hold(gl);
         holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
-       if (time_before(now, holdtime))
-               delay = holdtime - now;
-       if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
-               delay = gl->gl_ops->go_min_hold_time;
+       if (test_bit(GLF_QUEUED, &gl->gl_flags)) {
+               if (time_before(now, holdtime))
+                       delay = holdtime - now;
+               if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
+                       delay = gl->gl_ops->go_min_hold_time;
+       }
  
         spin_lock(&gl->gl_spin);
         handle_callback(gl, state, delay);
@@ -1512,7 +1517,7 @@ static void clear_glock(struct gfs2_glock *gl)
         spin_unlock(&lru_lock);
  
         spin_lock(&gl->gl_spin);
-       if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED)
+       if (gl->gl_state != LM_ST_UNLOCKED)
                 handle_callback(gl, LM_ST_UNLOCKED, 0);
         spin_unlock(&gl->gl_spin);
         gfs2_glock_hold(gl);
@@ -1660,6 +1665,8 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
                 *p++ = 'I';
         if (test_bit(GLF_FROZEN, gflags))
                 *p++ = 'F';
+       if (test_bit(GLF_QUEUED, gflags))
+               *p++ = 'q';
         *p = 0;
         return buf;
  }
@@ -1776,10 +1783,12 @@ int __init gfs2_glock_init(void)
         }
  #endif
  
-       glock_workqueue = create_workqueue("glock_workqueue");
+       glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER |
+                                         WQ_HIGHPRI | WQ_FREEZEABLE, 0);
         if (IS_ERR(glock_workqueue))
                 return PTR_ERR(glock_workqueue);
-       gfs2_delete_workqueue = create_workqueue("delete_workqueue");
+       gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER |
+                                               WQ_FREEZEABLE, 0);
         if (IS_ERR(gfs2_delete_workqueue)) {
                 destroy_workqueue(glock_workqueue);
                 return PTR_ERR(gfs2_delete_workqueue);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h

index 2bda1911b1563347b52d4d2da7892768d93bd947..db1c26d6d2206c8f9e9b68396380ed8791f3c720 100644 (file)
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -215,7 +215,7 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
  void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
  
  /**
- * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock
+ * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
   * @gl: the glock
   * @state: the state we're requesting
   * @flags: the modifier flags
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c

index 49f97d3bb690c512cb0f338d85938e622d501a49..0d149dcc04e515adfaaeb632a6677e5e3b555f45 100644 (file)
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -262,13 +262,12 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
         const struct gfs2_inode *ip = gl->gl_object;
         if (ip == NULL)
                 return 0;
-       gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n",
+       gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n",
                   (unsigned long long)ip->i_no_formal_ino,
                   (unsigned long long)ip->i_no_addr,
                   IF2DT(ip->i_inode.i_mode), ip->i_flags,
                   (unsigned int)ip->i_diskflags,
-                 (unsigned long long)ip->i_inode.i_size,
-                 (unsigned long long)ip->i_disksize);
+                 (unsigned long long)i_size_read(&ip->i_inode));
         return 0;
  }
  
@@ -453,7 +452,6 @@ const struct gfs2_glock_operations *gfs2_glops_list[] = {
         [LM_TYPE_META] = &gfs2_meta_glops,
         [LM_TYPE_INODE] = &gfs2_inode_glops,
         [LM_TYPE_RGRP] = &gfs2_rgrp_glops,
-       [LM_TYPE_NONDISK] = &gfs2_trans_glops,
         [LM_TYPE_IOPEN] = &gfs2_iopen_glops,
         [LM_TYPE_FLOCK] = &gfs2_flock_glops,
         [LM_TYPE_NONDISK] = &gfs2_nondisk_glops,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h

index fdbf4b366fa540d295dcbd73b298099b50418f8c..764fbb49efc8e3adbdeda7f83f178b0fd6ea70f8 100644 (file)
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -196,6 +196,7 @@ enum {
         GLF_REPLY_PENDING               = 9,
         GLF_INITIAL                     = 10,
         GLF_FROZEN                      = 11,
+       GLF_QUEUED                      = 12,
  };
  
  struct gfs2_glock {
@@ -267,7 +268,6 @@ struct gfs2_inode {
         u64 i_no_formal_ino;
         u64 i_generation;
         u64 i_eattr;
-       loff_t i_disksize;
         unsigned long i_flags;          /* GIF_... */
         struct gfs2_glock *i_gl; /* Move into i_gh? */
         struct gfs2_holder i_iopen_gh;
@@ -416,11 +416,8 @@ struct gfs2_args {
         char ar_locktable[GFS2_LOCKNAME_LEN];   /* Name of the Lock Table */
         char ar_hostdata[GFS2_LOCKNAME_LEN];    /* Host specific data */
         unsigned int ar_spectator:1;            /* Don't get a journal */
-       unsigned int ar_ignore_local_fs:1;      /* Ignore optimisations */
         unsigned int ar_localflocks:1;          /* Let the VFS do flock|fcntl */
-       unsigned int ar_localcaching:1;         /* Local caching */
         unsigned int ar_debug:1;                /* Oops on errors */
-       unsigned int ar_upgrade:1;              /* Upgrade ondisk format */
         unsigned int ar_posix_acl:1;            /* Enable posix acls */
         unsigned int ar_quota:2;                /* off/account/on */
         unsigned int ar_suiddir:1;              /* suiddir support */
@@ -497,7 +494,7 @@ struct gfs2_sb_host {
   */
  
  struct lm_lockstruct {
-       unsigned int ls_jid;
+       int ls_jid;
         unsigned int ls_first;
         unsigned int ls_first_done;
         unsigned int ls_nodir;
@@ -572,6 +569,7 @@ struct gfs2_sbd {
         struct list_head sd_rindex_mru_list;
         struct gfs2_rgrpd *sd_rindex_forward;
         unsigned int sd_rgrps;
+       unsigned int sd_max_rg_data;
  
         /* Journal index stuff */
  
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c

index 08140f185a3792153e23bab24f03ac3107d04757..06370f8bd8cf4aafa95fd4df64e93ec8d657328d 100644 (file)
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -359,8 +359,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
          * to do that.
          */
         ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
-       ip->i_disksize = be64_to_cpu(str->di_size);
-       i_size_write(&ip->i_inode, ip->i_disksize);
+       i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
         gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
         atime.tv_sec = be64_to_cpu(str->di_atime);
         atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
@@ -1055,7 +1054,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
         str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
         str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
         str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
-       str->di_size = cpu_to_be64(ip->i_disksize);
+       str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));
         str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
         str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
         str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
@@ -1085,8 +1084,8 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
                (unsigned long long)ip->i_no_formal_ino);
         printk(KERN_INFO "  no_addr = %llu\n",
                (unsigned long long)ip->i_no_addr);
-       printk(KERN_INFO "  i_disksize = %llu\n",
-              (unsigned long long)ip->i_disksize);
+       printk(KERN_INFO "  i_size = %llu\n",
+              (unsigned long long)i_size_read(&ip->i_inode));
         printk(KERN_INFO "  blocks = %llu\n",
                (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
         printk(KERN_INFO "  i_goal = %llu\n",
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h

index 300ada3f21de0cf5fc22677283343a76caf8200e..6720d7d5fbc6aac91083b399b95ba6c978922c67 100644 (file)
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -19,6 +19,8 @@ extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
  extern int gfs2_internal_read(struct gfs2_inode *ip,
                               struct file_ra_state *ra_state,
                               char *buf, loff_t *pos, unsigned size);
+extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
+                                  unsigned int from, unsigned int to);
  extern void gfs2_set_aops(struct inode *inode);
  
  static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
@@ -80,6 +82,19 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
         dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr);
  }
  
+static inline int gfs2_check_internal_file_size(struct inode *inode,
+                                               u64 minsize, u64 maxsize)
+{
+       u64 size = i_size_read(inode);
+       if (size < minsize || size > maxsize)
+               goto err;
+       if (size & ((1 << inode->i_blkbits) - 1))
+               goto err;
+       return 0;
+err:
+       gfs2_consist_inode(GFS2_I(inode));
+       return -EIO;
+}
  
  extern void gfs2_set_iop(struct inode *inode);
  extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c

index 0e0470ed34c273a341aed5860191bfadf42b41e5..1c09425b45fd728ba52c1f5f49c3feac187640a2 100644 (file)
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -42,9 +42,9 @@ static void gdlm_ast(void *arg)
                 ret |= LM_OUT_CANCELED;
                 goto out;
         case -EAGAIN: /* Try lock fails */
+       case -EDEADLK: /* Deadlock detected */
                 goto out;
-       case -EINVAL: /* Invalid */
-       case -ENOMEM: /* Out of memory */
+       case -ETIMEDOUT: /* Canceled due to timeout */
                 ret |= LM_OUT_ERROR;
                 goto out;
         case 0: /* Success */
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c

index b1e9630eb46a8d0338fef57ffa15caf23ab1cf0f..d7eb1e209aa899561f7168a6d2964460ae73a89f 100644 (file)
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -24,6 +24,7 @@
  #include "glock.h"
  #include "quota.h"
  #include "recovery.h"
+#include "dir.h"
  
  static struct shrinker qd_shrinker = {
         .shrink = gfs2_shrink_qd_memory,
@@ -78,6 +79,9 @@ static int __init init_gfs2_fs(void)
  {
         int error;
  
+       gfs2_str2qstr(&gfs2_qdot, ".");
+       gfs2_str2qstr(&gfs2_qdotdot, "..");
+
         error = gfs2_sys_init();
         if (error)
                 return error;
@@ -140,7 +144,7 @@ static int __init init_gfs2_fs(void)
  
         error = -ENOMEM;
         gfs_recovery_wq = alloc_workqueue("gfs_recovery",
-                                         WQ_NON_REENTRANT | WQ_RESCUER, 0);
+                                         WQ_RESCUER | WQ_FREEZEABLE, 0);
         if (!gfs_recovery_wq)
                 goto fail_wq;
  
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c

index 4d4b1e8ac64c02ef64ffd216a71ec801e2fdb625..aeafc233dc897fdb102df29bf052fc040b61faef 100644 (file)
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -38,14 +38,6 @@
  #define DO 0
  #define UNDO 1
  
-static const u32 gfs2_old_fs_formats[] = {
-        0
-};
-
-static const u32 gfs2_old_multihost_formats[] = {
-        0
-};
-
  /**
   * gfs2_tune_init - Fill a gfs2_tune structure with default values
   * @gt: tune
@@ -135,8 +127,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
  
  static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
  {
-       unsigned int x;
-
         if (sb->sb_magic != GFS2_MAGIC ||
             sb->sb_type != GFS2_METATYPE_SB) {
                 if (!silent)
@@ -150,55 +140,9 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int sile
             sb->sb_multihost_format == GFS2_FORMAT_MULTI)
                 return 0;
  
-       if (sb->sb_fs_format != GFS2_FORMAT_FS) {
-               for (x = 0; gfs2_old_fs_formats[x]; x++)
-                       if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
-                               break;
+       fs_warn(sdp, "Unknown on-disk format, unable to mount\n");
  
-               if (!gfs2_old_fs_formats[x]) {
-                       printk(KERN_WARNING
-                              "GFS2: code version (%u, %u) is incompatible "
-                              "with ondisk format (%u, %u)\n",
-                              GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-                              sb->sb_fs_format, sb->sb_multihost_format);
-                       printk(KERN_WARNING
-                              "GFS2: I don't know how to upgrade this FS\n");
-                       return -EINVAL;
-               }
-       }
-
-       if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
-               for (x = 0; gfs2_old_multihost_formats[x]; x++)
-                       if (gfs2_old_multihost_formats[x] ==
-                           sb->sb_multihost_format)
-                               break;
-
-               if (!gfs2_old_multihost_formats[x]) {
-                       printk(KERN_WARNING
-                              "GFS2: code version (%u, %u) is incompatible "
-                              "with ondisk format (%u, %u)\n",
-                              GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-                              sb->sb_fs_format, sb->sb_multihost_format);
-                       printk(KERN_WARNING
-                              "GFS2: I don't know how to upgrade this FS\n");
-                       return -EINVAL;
-               }
-       }
-
-       if (!sdp->sd_args.ar_upgrade) {
-               printk(KERN_WARNING
-                      "GFS2: code version (%u, %u) is incompatible "
-                      "with ondisk format (%u, %u)\n",
-                      GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-                      sb->sb_fs_format, sb->sb_multihost_format);
-               printk(KERN_INFO
-                      "GFS2: Use the \"upgrade\" mount option to upgrade "
-                      "the FS\n");
-               printk(KERN_INFO "GFS2: See the manual for more details\n");
-               return -EINVAL;
-       }
-
-       return 0;
+       return -EINVAL;
  }
  
  static void end_bio_io_page(struct bio *bio, int error)
@@ -586,7 +530,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
  
         prev_db = 0;
  
-       for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) {
+       for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) {
                 bh.b_state = 0;
                 bh.b_blocknr = 0;
                 bh.b_size = 1 << ip->i_inode.i_blkbits;
@@ -1022,7 +966,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
         if (!strcmp("lock_nolock", proto)) {
                 lm = &nolock_ops;
                 sdp->sd_args.ar_localflocks = 1;
-               sdp->sd_args.ar_localcaching = 1;
  #ifdef CONFIG_GFS2_FS_LOCKING_DLM
         } else if (!strcmp("lock_dlm", proto)) {
                 lm = &gfs2_dlm_ops;
@@ -1113,8 +1056,6 @@ static int gfs2_journalid_wait(void *word)
  
  static int wait_on_journal(struct gfs2_sbd *sdp)
  {
-       if (sdp->sd_args.ar_spectator)
-               return 0;
         if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
                 return 0;
  
@@ -1217,6 +1158,20 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
         if (error)
                 goto fail_sb;
  
+       /*
+        * If user space has failed to join the cluster or some similar
+        * failure has occurred, then the journal id will contain a
+        * negative (error) number. This will then be returned to the
+        * caller (of the mount syscall). We do this even for spectator
+        * mounts (which just write a jid of 0 to indicate "ok" even though
+        * the jid is unused in the spectator case)
+        */
+       if (sdp->sd_lockstruct.ls_jid < 0) {
+               error = sdp->sd_lockstruct.ls_jid;
+               sdp->sd_lockstruct.ls_jid = 0;
+               goto fail_sb;
+       }
+
         error = init_inodes(sdp, DO);
         if (error)
                 goto fail_sb;
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c

index 1009be2c9737687cdee8b48668a5f5d09752abeb..0534510200d5961887d3ee957d799e3b4669e76b 100644 (file)
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -18,6 +18,8 @@
  #include <linux/gfs2_ondisk.h>
  #include <linux/crc32.h>
  #include <linux/fiemap.h>
+#include <linux/swap.h>
+#include <linux/falloc.h>
  #include <asm/uaccess.h>
  
  #include "gfs2.h"
@@ -217,7 +219,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
                         goto out_gunlock_q;
  
                 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
-                                        al->al_rgd->rd_length +
+                                        gfs2_rg_blocks(al) +
                                          2 * RES_DINODE + RES_STATFS +
                                          RES_QUOTA, 0);
                 if (error)
@@ -406,7 +408,6 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
  
         ip = ghs[1].gh_gl->gl_object;
  
-       ip->i_disksize = size;
         i_size_write(inode, size);
  
         error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -461,7 +462,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
         ip = ghs[1].gh_gl->gl_object;
  
         ip->i_inode.i_nlink = 2;
-       ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
+       i_size_write(inode, sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode));
         ip->i_diskflags |= GFS2_DIF_JDATA;
         ip->i_entries = 2;
  
@@ -470,18 +471,15 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
         if (!gfs2_assert_withdraw(sdp, !error)) {
                 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
                 struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
-               struct qstr str;
  
-               gfs2_str2qstr(&str, ".");
                 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-               gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent);
+               gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent);
                 dent->de_inum = di->di_num; /* already GFS2 endian */
                 dent->de_type = cpu_to_be16(DT_DIR);
                 di->di_entries = cpu_to_be32(1);
  
-               gfs2_str2qstr(&str, "..");
                 dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
-               gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
+               gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
  
                 gfs2_inum_out(dip, dent);
                 dent->de_type = cpu_to_be16(DT_DIR);
@@ -522,7 +520,6 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
  static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
                        struct gfs2_inode *ip)
  {
-       struct qstr dotname;
         int error;
  
         if (ip->i_entries != 2) {
@@ -539,13 +536,11 @@ static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
         if (error)
                 return error;
  
-       gfs2_str2qstr(&dotname, ".");
-       error = gfs2_dir_del(ip, &dotname);
+       error = gfs2_dir_del(ip, &gfs2_qdot);
         if (error)
                 return error;
  
-       gfs2_str2qstr(&dotname, "..");
-       error = gfs2_dir_del(ip, &dotname);
+       error = gfs2_dir_del(ip, &gfs2_qdotdot);
         if (error)
                 return error;
  
@@ -694,11 +689,8 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
         struct inode *dir = &to->i_inode;
         struct super_block *sb = dir->i_sb;
         struct inode *tmp;
-       struct qstr dotdot;
         int error = 0;
  
-       gfs2_str2qstr(&dotdot, "..");
-
         igrab(dir);
  
         for (;;) {
@@ -711,7 +703,7 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
                         break;
                 }
  
-               tmp = gfs2_lookupi(dir, &dotdot, 1);
+               tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
                 if (IS_ERR(tmp)) {
                         error = PTR_ERR(tmp);
                         break;
@@ -744,7 +736,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
         struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
         struct gfs2_inode *nip = NULL;
         struct gfs2_sbd *sdp = GFS2_SB(odir);
-       struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, };
+       struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh;
         struct gfs2_rgrpd *nrgd;
         unsigned int num_gh;
         int dir_rename = 0;
@@ -758,6 +750,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
                         return 0;
         }
  
+       error = gfs2_rindex_hold(sdp, &ri_gh);
+       if (error)
+               return error;
  
         if (odip != ndip) {
                 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
@@ -887,12 +882,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
  
                 al->al_requested = sdp->sd_max_dirres;
  
-               error = gfs2_inplace_reserve(ndip);
+               error = gfs2_inplace_reserve_ri(ndip);
                 if (error)
                         goto out_gunlock_q;
  
                 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
-                                        al->al_rgd->rd_length +
+                                        gfs2_rg_blocks(al) +
                                          4 * RES_DINODE + 4 * RES_LEAF +
                                          RES_STATFS + RES_QUOTA + 4, 0);
                 if (error)
@@ -920,9 +915,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
         }
  
         if (dir_rename) {
-               struct qstr name;
-               gfs2_str2qstr(&name, "..");
-
                 error = gfs2_change_nlink(ndip, +1);
                 if (error)
                         goto out_end_trans;
@@ -930,7 +922,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
                 if (error)
                         goto out_end_trans;
  
-               error = gfs2_dir_mvino(ip, &name, ndip, DT_DIR);
+               error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
                 if (error)
                         goto out_end_trans;
         } else {
@@ -972,6 +964,7 @@ out_gunlock_r:
         if (r_gh.gh_gl)
                 gfs2_glock_dq_uninit(&r_gh);
  out:
+       gfs2_glock_dq_uninit(&ri_gh);
         return error;
  }
  
@@ -990,7 +983,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
         struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
         struct gfs2_holder i_gh;
         struct buffer_head *dibh;
-       unsigned int x;
+       unsigned int x, size;
         char *buf;
         int error;
  
@@ -1002,7 +995,8 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
                 return NULL;
         }
  
-       if (!ip->i_disksize) {
+       size = (unsigned int)i_size_read(&ip->i_inode);
+       if (size == 0) {
                 gfs2_consist_inode(ip);
                 buf = ERR_PTR(-EIO);
                 goto out;
@@ -1014,7 +1008,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
                 goto out;
         }
  
-       x = ip->i_disksize + 1;
+       x = size + 1;
         buf = kmalloc(x, GFP_NOFS);
         if (!buf)
                 buf = ERR_PTR(-ENOMEM);
@@ -1071,30 +1065,6 @@ int gfs2_permission(struct inode *inode, int mask)
         return error;
  }
  
-/*
- * XXX(truncate): the truncate_setsize calls should be moved to the end.
- */
-static int setattr_size(struct inode *inode, struct iattr *attr)
-{
-       struct gfs2_inode *ip = GFS2_I(inode);
-       struct gfs2_sbd *sdp = GFS2_SB(inode);
-       int error;
-
-       if (attr->ia_size != ip->i_disksize) {
-               error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
-               if (error)
-                       return error;
-               truncate_setsize(inode, attr->ia_size);
-               gfs2_trans_end(sdp);
-       }
-
-       error = gfs2_truncatei(ip, attr->ia_size);
-       if (error && (inode->i_size != ip->i_disksize))
-               i_size_write(inode, ip->i_disksize);
-
-       return error;
-}
-
  static int setattr_chown(struct inode *inode, struct iattr *attr)
  {
         struct gfs2_inode *ip = GFS2_I(inode);
@@ -1195,7 +1165,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
                 goto out;
  
         if (attr->ia_valid & ATTR_SIZE)
-               error = setattr_size(inode, attr);
+               error = gfs2_setattr_size(inode, attr->ia_size);
         else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
                 error = setattr_chown(inode, attr);
         else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
@@ -1301,6 +1271,257 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
         return ret;
  }
  
+static void empty_write_end(struct page *page, unsigned from,
+                          unsigned to)
+{
+       struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+
+       page_zero_new_buffers(page, from, to);
+       flush_dcache_page(page);
+       mark_page_accessed(page);
+
+       if (!gfs2_is_writeback(ip))
+               gfs2_page_add_databufs(ip, page, from, to);
+
+       block_commit_write(page, from, to);
+}
+
+
+static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
+{
+       unsigned start, end, next;
+       struct buffer_head *bh, *head;
+       int error;
+
+       if (!page_has_buffers(page)) {
+               error = block_prepare_write(page, from, to, gfs2_block_map);
+               if (unlikely(error))
+                       return error;
+
+               empty_write_end(page, from, to);
+               return 0;
+       }
+
+       bh = head = page_buffers(page);
+       next = end = 0;
+       while (next < from) {
+               next += bh->b_size;
+               bh = bh->b_this_page;
+       }
+       start = next;
+       do {
+               next += bh->b_size;
+               if (buffer_mapped(bh)) {
+                       if (end) {
+                               error = block_prepare_write(page, start, end,
+                                                           gfs2_block_map);
+                               if (unlikely(error))
+                                       return error;
+                               empty_write_end(page, start, end);
+                               end = 0;
+                       }
+                       start = next;
+               }
+               else
+                       end = next;
+               bh = bh->b_this_page;
+       } while (next < to);
+
+       if (end) {
+               error = block_prepare_write(page, start, end, gfs2_block_map);
+               if (unlikely(error))
+                       return error;
+               empty_write_end(page, start, end);
+       }
+
+       return 0;
+}
+
+static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
+                          int mode)
+{
+       struct gfs2_inode *ip = GFS2_I(inode);
+       struct buffer_head *dibh;
+       int error;
+       u64 start = offset >> PAGE_CACHE_SHIFT;
+       unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
+       u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+       pgoff_t curr;
+       struct page *page;
+       unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
+       unsigned int from, to;
+
+       if (!end_offset)
+               end_offset = PAGE_CACHE_SIZE;
+
+       error = gfs2_meta_inode_buffer(ip, &dibh);
+       if (unlikely(error))
+               goto out;
+
+       gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+
+       if (gfs2_is_stuffed(ip)) {
+               error = gfs2_unstuff_dinode(ip, NULL);
+               if (unlikely(error))
+                       goto out;
+       }
+
+       curr = start;
+       offset = start << PAGE_CACHE_SHIFT;
+       from = start_offset;
+       to = PAGE_CACHE_SIZE;
+       while (curr <= end) {
+               page = grab_cache_page_write_begin(inode->i_mapping, curr,
+                                                  AOP_FLAG_NOFS);
+               if (unlikely(!page)) {
+                       error = -ENOMEM;
+                       goto out;
+               }
+
+               if (curr == end)
+                       to = end_offset;
+               error = write_empty_blocks(page, from, to);
+               if (!error && offset + to > inode->i_size &&
+                   !(mode & FALLOC_FL_KEEP_SIZE)) {
+                       i_size_write(inode, offset + to);
+               }
+               unlock_page(page);
+               page_cache_release(page);
+               if (error)
+                       goto out;
+               curr++;
+               offset += PAGE_CACHE_SIZE;
+               from = 0;
+       }
+
+       gfs2_dinode_out(ip, dibh->b_data);
+       mark_inode_dirty(inode);
+
+       brelse(dibh);
+
+out:
+       return error;
+}
+
+static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
+                           unsigned int *data_blocks, unsigned int *ind_blocks)
+{
+       const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+       unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
+       unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
+
+       for (tmp = max_data; tmp > sdp->sd_diptrs;) {
+               tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
+               max_data -= tmp;
+       }
+       /* This calculation isn't the exact reverse of gfs2_write_calc_reserve,
+          so it might end up with fewer data blocks */
+       if (max_data <= *data_blocks)
+               return;
+       *data_blocks = max_data;
+       *ind_blocks = max_blocks - max_data;
+       *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
+       if (*len > max) {
+               *len = max;
+               gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
+       }
+}
+
+static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset,
+                          loff_t len)
+{
+       struct gfs2_sbd *sdp = GFS2_SB(inode);
+       struct gfs2_inode *ip = GFS2_I(inode);
+       unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
+       loff_t bytes, max_bytes;
+       struct gfs2_alloc *al;
+       int error;
+       loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
+       next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
+
+       offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
+                sdp->sd_sb.sb_bsize_shift;
+
+       len = next - offset;
+       bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
+       if (!bytes)
+               bytes = UINT_MAX;
+
+       gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
+       error = gfs2_glock_nq(&ip->i_gh);
+       if (unlikely(error))
+               goto out_uninit;
+
+       if (!gfs2_write_alloc_required(ip, offset, len))
+               goto out_unlock;
+
+       while (len > 0) {
+               if (len < bytes)
+                       bytes = len;
+               al = gfs2_alloc_get(ip);
+               if (!al) {
+                       error = -ENOMEM;
+                       goto out_unlock;
+               }
+
+               error = gfs2_quota_lock_check(ip);
+               if (error)
+                       goto out_alloc_put;
+
+retry:
+               gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
+
+               al->al_requested = data_blocks + ind_blocks;
+               error = gfs2_inplace_reserve(ip);
+               if (error) {
+                       if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
+                               bytes >>= 1;
+                               goto retry;
+                       }
+                       goto out_qunlock;
+               }
+               max_bytes = bytes;
+               calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
+               al->al_requested = data_blocks + ind_blocks;
+
+               rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
+                         RES_RG_HDR + gfs2_rg_blocks(al);
+               if (gfs2_is_jdata(ip))
+                       rblocks += data_blocks ? data_blocks : 1;
+
+               error = gfs2_trans_begin(sdp, rblocks,
+                                        PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
+               if (error)
+                       goto out_trans_fail;
+
+               error = fallocate_chunk(inode, offset, max_bytes, mode);
+               gfs2_trans_end(sdp);
+
+               if (error)
+                       goto out_trans_fail;
+
+               len -= max_bytes;
+               offset += max_bytes;
+               gfs2_inplace_release(ip);
+               gfs2_quota_unlock(ip);
+               gfs2_alloc_put(ip);
+       }
+       goto out_unlock;
+
+out_trans_fail:
+       gfs2_inplace_release(ip);
+out_qunlock:
+       gfs2_quota_unlock(ip);
+out_alloc_put:
+       gfs2_alloc_put(ip);
+out_unlock:
+       gfs2_glock_dq(&ip->i_gh);
+out_uninit:
+       gfs2_holder_uninit(&ip->i_gh);
+       return error;
+}
+
+
  static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        u64 start, u64 len)
  {
@@ -1351,6 +1572,7 @@ const struct inode_operations gfs2_file_iops = {
         .getxattr = gfs2_getxattr,
         .listxattr = gfs2_listxattr,
         .removexattr = gfs2_removexattr,
+       .fallocate = gfs2_fallocate,
         .fiemap = gfs2_fiemap,
  };
  
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c

index 1bc6b5695e6dfb34870810b87bd09c819f25c93f..58a9b9998b42d0d9603c7a49ffc746ab4a22ef87 100644 (file)
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -735,10 +735,8 @@ get_a_page:
                 goto out;
  
         size = loc + sizeof(struct gfs2_quota);
-       if (size > inode->i_size) {
-               ip->i_disksize = size;
+       if (size > inode->i_size)
                 i_size_write(inode, size);
-       }
         inode->i_mtime = inode->i_atime = CURRENT_TIME;
         gfs2_trans_add_bh(ip->i_gl, dibh, 1);
         gfs2_dinode_out(ip, dibh->b_data);
@@ -817,7 +815,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
                 goto out_alloc;
  
         if (nalloc)
-               blocks += al->al_rgd->rd_length + nalloc * ind_blocks + RES_STATFS;
+               blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS;
  
         error = gfs2_trans_begin(sdp, blocks, 0);
         if (error)
@@ -1190,18 +1188,17 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *
  int gfs2_quota_init(struct gfs2_sbd *sdp)
  {
         struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
-       unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
+       u64 size = i_size_read(sdp->sd_qc_inode);
+       unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift;
         unsigned int x, slot = 0;
         unsigned int found = 0;
         u64 dblock;
         u32 extlen = 0;
         int error;
  
-       if (!ip->i_disksize || ip->i_disksize > (64 << 20) ||
-           ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) {
-               gfs2_consist_inode(ip);
+       if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20))
                 return -EIO;
-       }
+
         sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
         sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
  
@@ -1589,6 +1586,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
                 error = gfs2_inplace_reserve(ip);
                 if (error)
                         goto out_alloc;
+               blocks += gfs2_rg_blocks(al);
         }
  
         error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c

index f7f89a94a5a4598a4b532016a0c16846a62cef2b..f2a02edcac8f43e9de1dd22fd4c72ac7347f39a5 100644 (file)
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -455,11 +455,13 @@ void gfs2_recover_func(struct work_struct *work)
         int ro = 0;
         unsigned int pass;
         int error;
+       int jlocked = 0;
  
-       if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
+       if (sdp->sd_args.ar_spectator ||
+           (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) {
                 fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
                         jd->jd_jid);
-
+               jlocked = 1;
                 /* Acquire the journal lock so we can do recovery */
  
                 error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
@@ -554,13 +556,12 @@ void gfs2_recover_func(struct work_struct *work)
                         jd->jd_jid, t);
         }
  
-       if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
-               gfs2_glock_dq_uninit(&ji_gh);
-
         gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
  
-       if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
+       if (jlocked) {
+               gfs2_glock_dq_uninit(&ji_gh);
                 gfs2_glock_dq_uninit(&j_gh);
+       }
  
         fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
         goto done;
@@ -568,7 +569,7 @@ void gfs2_recover_func(struct work_struct *work)
  fail_gunlock_tr:
         gfs2_glock_dq_uninit(&t_gh);
  fail_gunlock_ji:
-       if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
+       if (jlocked) {
                 gfs2_glock_dq_uninit(&ji_gh);
  fail_gunlock_j:
                 gfs2_glock_dq_uninit(&j_gh);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c

index 171a744f8e45d172f4e43eb793ae4e08dba8ba23..fb67f593f40856b213f03887642c44a3e4c7ccc5 100644 (file)
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
         for (rgrps = 0;; rgrps++) {
                 loff_t pos = rgrps * sizeof(struct gfs2_rindex);
  
-               if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize)
+               if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode))
                         break;
                 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
                                            sizeof(struct gfs2_rindex));
@@ -588,7 +588,9 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
         struct inode *inode = &ip->i_inode;
         struct file_ra_state ra_state;
-       u64 rgrp_count = ip->i_disksize;
+       u64 rgrp_count = i_size_read(inode);
+       struct gfs2_rgrpd *rgd;
+       unsigned int max_data = 0;
         int error;
  
         do_div(rgrp_count, sizeof(struct gfs2_rindex));
@@ -603,6 +605,10 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
                 }
         }
  
+       list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
+               if (rgd->rd_data > max_data)
+                       max_data = rgd->rd_data;
+       sdp->sd_max_rg_data = max_data;
         sdp->sd_rindex_uptodate = 1;
         return 0;
  }
@@ -622,13 +628,15 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
         struct inode *inode = &ip->i_inode;
         struct file_ra_state ra_state;
+       struct gfs2_rgrpd *rgd;
+       unsigned int max_data = 0;
         int error;
  
         file_ra_state_init(&ra_state, inode->i_mapping);
         for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
                 /* Ignore partials */
                 if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
-                   ip->i_disksize)
+                   i_size_read(inode))
                         break;
                 error = read_rindex_entry(ip, &ra_state);
                 if (error) {
@@ -636,6 +644,10 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
                         return error;
                 }
         }
+       list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
+               if (rgd->rd_data > max_data)
+                       max_data = rgd->rd_data;
+       sdp->sd_max_rg_data = max_data;
  
         sdp->sd_rindex_uptodate = 1;
         return 0;
@@ -1188,7 +1200,8 @@ out:
   * Returns: errno
   */
  
-int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
+int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
+                          char *file, unsigned int line)
  {
         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
         struct gfs2_alloc *al = ip->i_alloc;
@@ -1199,12 +1212,15 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
                 return -EINVAL;
  
  try_again:
-       /* We need to hold the rindex unless the inode we're using is
-          the rindex itself, in which case it's already held. */
-       if (ip != GFS2_I(sdp->sd_rindex))
-               error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
-       else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */
-               error = gfs2_ri_update_special(ip);
+       if (hold_rindex) {
+               /* We need to hold the rindex unless the inode we're using is
+                  the rindex itself, in which case it's already held. */
+               if (ip != GFS2_I(sdp->sd_rindex))
+                       error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
+               else if (!sdp->sd_rgrps) /* We may not have the rindex read
+                                           in, so: */
+                       error = gfs2_ri_update_special(ip);
+       }
  
         if (error)
                 return error;
@@ -1215,7 +1231,7 @@ try_again:
            try to free it, and try the allocation again. */
         error = get_local_rgrp(ip, &unlinked, &last_unlinked);
         if (error) {
-               if (ip != GFS2_I(sdp->sd_rindex))
+               if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
                         gfs2_glock_dq_uninit(&al->al_ri_gh);
                 if (error != -EAGAIN)
                         return error;
@@ -1257,7 +1273,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
         al->al_rgd = NULL;
         if (al->al_rgd_gh.gh_gl)
                 gfs2_glock_dq_uninit(&al->al_rgd_gh);
-       if (ip != GFS2_I(sdp->sd_rindex))
+       if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl)
                 gfs2_glock_dq_uninit(&al->al_ri_gh);
  }
  
@@ -1496,11 +1512,19 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
         struct buffer_head *dibh;
         struct gfs2_alloc *al = ip->i_alloc;
-       struct gfs2_rgrpd *rgd = al->al_rgd;
+       struct gfs2_rgrpd *rgd;
         u32 goal, blk;
         u64 block;
         int error;
  
+       /* Only happens if there is a bug in gfs2, return something distinctive
+        * to ensure that it is noticed.
+        */
+       if (al == NULL)
+               return -ECANCELED;
+
+       rgd = al->al_rgd;
+
         if (rgrp_contains_block(rgd, ip->i_goal))
                 goal = ip->i_goal - rgd->rd_data0;
         else
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h

index f07119d89557855fc9a8673b32e9119cad921570..0e35c0466f9a6c5979a3fe8c339def323bc37fad 100644 (file)
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -39,10 +39,12 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip)
         ip->i_alloc = NULL;
  }
  
-extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file,
-                                 unsigned int line);
+extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
+                                 char *file, unsigned int line);
  #define gfs2_inplace_reserve(ip) \
-gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
+       gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__)
+#define gfs2_inplace_reserve_ri(ip) \
+       gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__)
  
  extern void gfs2_inplace_release(struct gfs2_inode *ip);
  
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c

index 77cb9f830ee47eb51520bd8581ebc700b426455e..047d1176096c79d6f0ce50227b8c098755fb0150 100644 (file)
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -85,6 +85,7 @@ static const match_table_t tokens = {
         {Opt_locktable, "locktable=%s"},
         {Opt_hostdata, "hostdata=%s"},
         {Opt_spectator, "spectator"},
+       {Opt_spectator, "norecovery"},
         {Opt_ignore_local_fs, "ignore_local_fs"},
         {Opt_localflocks, "localflocks"},
         {Opt_localcaching, "localcaching"},
@@ -159,13 +160,13 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
                         args->ar_spectator = 1;
                         break;
                 case Opt_ignore_local_fs:
-                       args->ar_ignore_local_fs = 1;
+                       /* Retained for backwards compat only */
                         break;
                 case Opt_localflocks:
                         args->ar_localflocks = 1;
                         break;
                 case Opt_localcaching:
-                       args->ar_localcaching = 1;
+                       /* Retained for backwards compat only */
                         break;
                 case Opt_debug:
                         if (args->ar_errors == GFS2_ERRORS_PANIC) {
@@ -179,7 +180,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
                         args->ar_debug = 0;
                         break;
                 case Opt_upgrade:
-                       args->ar_upgrade = 1;
+                       /* Retained for backwards compat only */
                         break;
                 case Opt_acl:
                         args->ar_posix_acl = 1;
@@ -342,15 +343,14 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
  {
         struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
         struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+       u64 size = i_size_read(jd->jd_inode);
  
-       if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) ||
-           (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) {
-               gfs2_consist_inode(ip);
+       if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30))
                 return -EIO;
-       }
-       jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
  
-       if (gfs2_write_alloc_required(ip, 0, ip->i_disksize)) {
+       jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift;
+
+       if (gfs2_write_alloc_required(ip, 0, size)) {
                 gfs2_consist_inode(ip);
                 return -EIO;
         }
@@ -1129,9 +1129,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
  
         /* Some flags must not be changed */
         if (args_neq(&args, &sdp->sd_args, spectator) ||
-           args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
             args_neq(&args, &sdp->sd_args, localflocks) ||
-           args_neq(&args, &sdp->sd_args, localcaching) ||
             args_neq(&args, &sdp->sd_args, meta))
                 return -EINVAL;
  
@@ -1234,16 +1232,10 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
                 seq_printf(s, ",hostdata=%s", args->ar_hostdata);
         if (args->ar_spectator)
                 seq_printf(s, ",spectator");
-       if (args->ar_ignore_local_fs)
-               seq_printf(s, ",ignore_local_fs");
         if (args->ar_localflocks)
                 seq_printf(s, ",localflocks");
-       if (args->ar_localcaching)
-               seq_printf(s, ",localcaching");
         if (args->ar_debug)
                 seq_printf(s, ",debug");
-       if (args->ar_upgrade)
-               seq_printf(s, ",upgrade");
         if (args->ar_posix_acl)
                 seq_printf(s, ",acl");
         if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c

index ccacffd2faaa6d65d1f116b9b2788ee732786f3f..748ccb557c18fc504c28951f13d634fe9f05fa0a 100644 (file)
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -230,7 +230,10 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
  
         if (gltype > LM_TYPE_JOURNAL)
                 return -EINVAL;
-       glops = gfs2_glops_list[gltype];
+       if (gltype == LM_TYPE_NONDISK && glnum == GFS2_TRANS_LOCK)
+               glops = &gfs2_trans_glops;
+       else
+               glops = gfs2_glops_list[gltype];
         if (glops == NULL)
                 return -EINVAL;
         if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags))
@@ -399,31 +402,32 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
  
  static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
  {
-       return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid);
+       return sprintf(buf, "%d\n", sdp->sd_lockstruct.ls_jid);
  }
  
  static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
  {
-        unsigned jid;
+        int jid;
         int rv;
  
-       rv = sscanf(buf, "%u", &jid);
+       rv = sscanf(buf, "%d", &jid);
         if (rv != 1)
                 return -EINVAL;
  
         spin_lock(&sdp->sd_jindex_spin);
         rv = -EINVAL;
-       if (sdp->sd_args.ar_spectator)
-               goto out;
         if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
                 goto out;
         rv = -EBUSY;
-       if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
+       if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
                 goto out;
+       rv = 0;
+       if (sdp->sd_args.ar_spectator && jid > 0)
+               rv = jid = -EINVAL;
         sdp->sd_lockstruct.ls_jid = jid;
+       clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
         smp_mb__after_clear_bit();
         wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
-       rv = 0;
  out:
         spin_unlock(&sdp->sd_jindex_spin);
         return rv ? rv : len;
@@ -617,7 +621,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
         add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
         add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
         if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags))
-               add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid);
+               add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid);
         if (gfs2_uuid_valid(uuid))
                 add_uevent_var(env, "UUID=%pUB", uuid);
         return 0;
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h

index 148d55c14171dee39435c83d56c906c09dcd0cf8..cedb0bb96d968414d14e6eca3e5b934b68ee606e 100644 (file)
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -39,7 +39,8 @@
         {(1UL << GLF_INVALIDATE_IN_PROGRESS),   "i" },          \
         {(1UL << GLF_REPLY_PENDING),            "r" },          \
         {(1UL << GLF_INITIAL),                  "I" },          \
-       {(1UL << GLF_FROZEN),                   "F" })
+       {(1UL << GLF_FROZEN),                   "F" },          \
+       {(1UL << GLF_QUEUED),                   "q" })
  
  #ifndef NUMPTY
  #define NUMPTY
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h

index edf9d4bd908ee2726991f12ac869bfeb60971381..fb56b783e028c8ce61b0b663e67e2df14e408b07 100644 (file)
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -20,11 +20,20 @@ struct gfs2_glock;
  #define RES_JDATA      1
  #define RES_DATA       1
  #define RES_LEAF       1
+#define RES_RG_HDR     1
  #define RES_RG_BIT     2
  #define RES_EATTR      1
  #define RES_STATFS     1
  #define RES_QUOTA      2
  
+/* reserve either the number of blocks to be allocated plus the rg header
+ * block, or all of the blocks in the rg, whichever is smaller */
+static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al)
+{
+       return (al->al_requested < al->al_rgd->rd_length)?
+              al->al_requested + 1 : al->al_rgd->rd_length;
+}
+
  int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
                      unsigned int revokes);
  
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c

index 776af6eb4bcb1b193ecf5ef858ac09cb0535b95f..30b58f07c8a6b219fc964efe101ce5f861397885 100644 (file)
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -734,7 +734,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
                 goto out_gunlock_q;
  
         error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
-                                blks + al->al_rgd->rd_length +
+                                blks + gfs2_rg_blocks(al) +
                                  RES_DINODE + RES_STATFS + RES_QUOTA, 0);
         if (error)
                 goto out_ipres;
diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c

index 4129cdb3f0d8fbf80c4b161d877d9e922666f9ea..571abe97b42a2919caf381238eec6e58c57d0eaf 100644 (file)
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
         fd->search_key = ptr;
         fd->key = ptr + tree->max_key_len + 2;
         dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
-       down(&tree->tree_lock);
+       mutex_lock(&tree->tree_lock);
         return 0;
  }
  
@@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
         hfs_bnode_put(fd->bnode);
         kfree(fd->search_key);
         dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
-       up(&fd->tree->tree_lock);
+       mutex_unlock(&fd->tree->tree_lock);
         fd->tree = NULL;
  }
  
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c

index 38a0a9917d7f3a67b0eaad49a75246430eef7d76..3ebc437736febb4e0ef37a3612ea6501c7eac59a 100644 (file)
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -27,7 +27,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
         if (!tree)
                 return NULL;
  
-       init_MUTEX(&tree->tree_lock);
+       mutex_init(&tree->tree_lock);
         spin_lock_init(&tree->hash_lock);
         /* Set the correct compare function */
         tree->sb = sb;
diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h

index cc51905ac21de4f110929dedd9160152d76a0f26..2a1d712f85dccfb5bb2983615e84278a1c4923e7 100644 (file)
--- a/fs/hfs/btree.h
+++ b/fs/hfs/btree.h
@@ -33,7 +33,7 @@ struct hfs_btree {
         unsigned int depth;
  
         //unsigned int map1_size, map_size;
-       struct semaphore tree_lock;
+       struct mutex tree_lock;
  
         unsigned int pages_per_bnode;
         spinlock_t hash_lock;
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c

index 5007a41f1be9d345ff11dd7420285ee6a79c08e2..d182438c7ae4bea8b4ae108ad910ee795417410a 100644 (file)
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
         fd->search_key = ptr;
         fd->key = ptr + tree->max_key_len + 2;
         dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
-       down(&tree->tree_lock);
+       mutex_lock(&tree->tree_lock);
         return 0;
  }
  
@@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
         hfs_bnode_put(fd->bnode);
         kfree(fd->search_key);
         dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
-       up(&fd->tree->tree_lock);
+       mutex_unlock(&fd->tree->tree_lock);
         fd->tree = NULL;
  }
  
@@ -52,6 +52,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
                 rec = (e + b) / 2;
                 len = hfs_brec_lenoff(bnode, rec, &off);
                 keylen = hfs_brec_keylen(bnode, rec);
+               if (keylen == 0) {
+                       res = -EINVAL;
+                       goto fail;
+               }
                 hfs_bnode_read(bnode, fd->key, off, keylen);
                 cmpval = bnode->tree->keycmp(fd->key, fd->search_key);
                 if (!cmpval) {
@@ -67,6 +71,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
         if (rec != e && e >= 0) {
                 len = hfs_brec_lenoff(bnode, e, &off);
                 keylen = hfs_brec_keylen(bnode, e);
+               if (keylen == 0) {
+                       res = -EINVAL;
+                       goto fail;
+               }
                 hfs_bnode_read(bnode, fd->key, off, keylen);
         }
  done:
@@ -75,6 +83,7 @@ done:
         fd->keylength = keylen;
         fd->entryoffset = off + keylen;
         fd->entrylength = len - keylen;
+fail:
         return res;
  }
  
@@ -198,6 +207,10 @@ int hfs_brec_goto(struct hfs_find_data *fd, int cnt)
  
         len = hfs_brec_lenoff(bnode, fd->record, &off);
         keylen = hfs_brec_keylen(bnode, fd->record);
+       if (keylen == 0) {
+               res = -EINVAL;
+               goto out;
+       }
         fd->keyoffset = off;
         fd->keylength = keylen;
         fd->entryoffset = off + keylen;
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c

index ea30afc2a03c774221cc17341c446d30a0cb341b..ad57f5991eb1f14e3ac24dafa207e440d6d74be3 100644 (file)
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -17,6 +17,7 @@
  
  int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max)
  {
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
         struct page *page;
         struct address_space *mapping;
         __be32 *pptr, *curr, *end;
@@ -29,8 +30,8 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
                 return size;
  
         dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
-       mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
-       mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
+       mutex_lock(&sbi->alloc_mutex);
+       mapping = sbi->alloc_file->i_mapping;
         page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
         if (IS_ERR(page)) {
                 start = size;
@@ -150,16 +151,17 @@ done:
         set_page_dirty(page);
         kunmap(page);
         *max = offset + (curr - pptr) * 32 + i - start;
-       HFSPLUS_SB(sb).free_blocks -= *max;
+       sbi->free_blocks -= *max;
         sb->s_dirt = 1;
         dprint(DBG_BITMAP, "-> %u,%u\n", start, *max);
  out:
-       mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
+       mutex_unlock(&sbi->alloc_mutex);
         return start;
  }
  
  int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
  {
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
         struct page *page;
         struct address_space *mapping;
         __be32 *pptr, *curr, *end;
@@ -172,11 +174,11 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
  
         dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count);
         /* are all of the bits in range? */
-       if ((offset + count) > HFSPLUS_SB(sb).total_blocks)
+       if ((offset + count) > sbi->total_blocks)
                 return -2;
  
-       mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
-       mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
+       mutex_lock(&sbi->alloc_mutex);
+       mapping = sbi->alloc_file->i_mapping;
         pnr = offset / PAGE_CACHE_BITS;
         page = read_mapping_page(mapping, pnr, NULL);
         pptr = kmap(page);
@@ -224,9 +226,9 @@ done:
  out:
         set_page_dirty(page);
         kunmap(page);
-       HFSPLUS_SB(sb).free_blocks += len;
+       sbi->free_blocks += len;
         sb->s_dirt = 1;
-       mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
+       mutex_unlock(&sbi->alloc_mutex);
  
         return 0;
  }
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c

index c88e5d72a402ae2d29a8905cdccf7b59ccd4337d..2f39d05443e1a374b197359f70d20337562aa9e8 100644 (file)
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -42,10 +42,13 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
                 recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2);
                 if (!recoff)
                         return 0;
-               if (node->tree->attributes & HFS_TREE_BIGKEYS)
-                       retval = hfs_bnode_read_u16(node, recoff) + 2;
-               else
-                       retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1;
+
+               retval = hfs_bnode_read_u16(node, recoff) + 2;
+               if (retval > node->tree->max_key_len + 2) {
+                       printk(KERN_ERR "hfs: keylen %d too large\n",
+                               retval);
+                       retval = 0;
+               }
         }
         return retval;
  }
@@ -216,7 +219,7 @@ skip:
  static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
  {
         struct hfs_btree *tree;
-       struct hfs_bnode *node, *new_node;
+       struct hfs_bnode *node, *new_node, *next_node;
         struct hfs_bnode_desc node_desc;
         int num_recs, new_rec_off, new_off, old_rec_off;
         int data_start, data_end, size;
@@ -235,6 +238,17 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
         new_node->type = node->type;
         new_node->height = node->height;
  
+       if (node->next)
+               next_node = hfs_bnode_find(tree, node->next);
+       else
+               next_node = NULL;
+
+       if (IS_ERR(next_node)) {
+               hfs_bnode_put(node);
+               hfs_bnode_put(new_node);
+               return next_node;
+       }
+
         size = tree->node_size / 2 - node->num_recs * 2 - 14;
         old_rec_off = tree->node_size - 4;
         num_recs = 1;
@@ -248,6 +262,8 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
                 /* panic? */
                 hfs_bnode_put(node);
                 hfs_bnode_put(new_node);
+               if (next_node)
+                       hfs_bnode_put(next_node);
                 return ERR_PTR(-ENOSPC);
         }
  
@@ -302,8 +318,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
         hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc));
  
         /* update next bnode header */
-       if (new_node->next) {
-               struct hfs_bnode *next_node = hfs_bnode_find(tree, new_node->next);
+       if (next_node) {
                 next_node->prev = new_node->this;
                 hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc));
                 node_desc.prev = cpu_to_be32(next_node->prev);
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c

index e49fcee1e293f725786e84ea6126e408e5eda7c8..22e4d4e329999c3ba9848036a3639bc194598f74 100644 (file)
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -30,7 +30,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
         if (!tree)
                 return NULL;
  
-       init_MUTEX(&tree->tree_lock);
+       mutex_init(&tree->tree_lock);
         spin_lock_init(&tree->hash_lock);
         tree->sb = sb;
         tree->cnid = id;
@@ -39,10 +39,16 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
                 goto free_tree;
         tree->inode = inode;
  
+       if (!HFSPLUS_I(tree->inode)->first_blocks) {
+               printk(KERN_ERR
+                      "hfs: invalid btree extent records (0 size).\n");
+               goto free_inode;
+       }
+
         mapping = tree->inode->i_mapping;
         page = read_mapping_page(mapping, 0, NULL);
         if (IS_ERR(page))
-               goto free_tree;
+               goto free_inode;
  
         /* Load the header */
         head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
@@ -57,27 +63,56 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
         tree->max_key_len = be16_to_cpu(head->max_key_len);
         tree->depth = be16_to_cpu(head->depth);
  
-       /* Set the correct compare function */
-       if (id == HFSPLUS_EXT_CNID) {
+       /* Verify the tree and set the correct compare function */
+       switch (id) {
+       case HFSPLUS_EXT_CNID:
+               if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) {
+                       printk(KERN_ERR "hfs: invalid extent max_key_len %d\n",
+                               tree->max_key_len);
+                       goto fail_page;
+               }
+               if (tree->attributes & HFS_TREE_VARIDXKEYS) {
+                       printk(KERN_ERR "hfs: invalid extent btree flag\n");
+                       goto fail_page;
+               }
+
                 tree->keycmp = hfsplus_ext_cmp_key;
-       } else if (id == HFSPLUS_CAT_CNID) {
-               if ((HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX) &&
+               break;
+       case HFSPLUS_CAT_CNID:
+               if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) {
+                       printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n",
+                               tree->max_key_len);
+                       goto fail_page;
+               }
+               if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) {
+                       printk(KERN_ERR "hfs: invalid catalog btree flag\n");
+                       goto fail_page;
+               }
+
+               if (test_bit(HFSPLUS_SB_HFSX, &HFSPLUS_SB(sb)->flags) &&
                     (head->key_type == HFSPLUS_KEY_BINARY))
                         tree->keycmp = hfsplus_cat_bin_cmp_key;
                 else {
                         tree->keycmp = hfsplus_cat_case_cmp_key;
-                       HFSPLUS_SB(sb).flags |= HFSPLUS_SB_CASEFOLD;
+                       set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
                 }
-       } else {
+               break;
+       default:
                 printk(KERN_ERR "hfs: unknown B*Tree requested\n");
                 goto fail_page;
         }
  
+       if (!(tree->attributes & HFS_TREE_BIGKEYS)) {
+               printk(KERN_ERR "hfs: invalid btree flag\n");
+               goto fail_page;
+       }
+
         size = tree->node_size;
         if (!is_power_of_2(size))
                 goto fail_page;
         if (!tree->node_count)
                 goto fail_page;
+
         tree->node_size_shift = ffs(size) - 1;
  
         tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -87,10 +122,11 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
         return tree;
  
   fail_page:
-       tree->inode->i_mapping->a_ops = &hfsplus_aops;
         page_cache_release(page);
- free_tree:
+ free_inode:
+       tree->inode->i_mapping->a_ops = &hfsplus_aops;
         iput(tree->inode);
+ free_tree:
         kfree(tree);
         return NULL;
  }
@@ -192,17 +228,18 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
  
         while (!tree->free_nodes) {
                 struct inode *inode = tree->inode;
+               struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
                 u32 count;
                 int res;
  
                 res = hfsplus_file_extend(inode);
                 if (res)
                         return ERR_PTR(res);
-               HFSPLUS_I(inode).phys_size = inode->i_size =
-                               (loff_t)HFSPLUS_I(inode).alloc_blocks <<
-                               HFSPLUS_SB(tree->sb).alloc_blksz_shift;
-               HFSPLUS_I(inode).fs_blocks = HFSPLUS_I(inode).alloc_blocks <<
-                                            HFSPLUS_SB(tree->sb).fs_shift;
+               hip->phys_size = inode->i_size =
+                       (loff_t)hip->alloc_blocks <<
+                               HFSPLUS_SB(tree->sb)->alloc_blksz_shift;
+               hip->fs_blocks =
+                       hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift;
                 inode_set_bytes(inode, inode->i_size);
                 count = inode->i_size >> tree->node_size_shift;
                 tree->free_nodes = count - tree->node_count;
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c

index f6874acb2cf2a3a81a242f1983600eb912b9f607..8af45fc5b051abb3353501441fa9ea9030d9395a 100644 (file)
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -67,7 +67,7 @@ static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent,
         key->key_len = cpu_to_be16(6 + ustrlen);
  }
  
-static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
+void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms)
  {
         if (inode->i_flags & S_IMMUTABLE)
                 perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
@@ -77,15 +77,24 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
                 perms->rootflags |= HFSPLUS_FLG_APPEND;
         else
                 perms->rootflags &= ~HFSPLUS_FLG_APPEND;
-       HFSPLUS_I(inode).rootflags = perms->rootflags;
-       HFSPLUS_I(inode).userflags = perms->userflags;
+
+       perms->userflags = HFSPLUS_I(inode)->userflags;
         perms->mode = cpu_to_be16(inode->i_mode);
         perms->owner = cpu_to_be32(inode->i_uid);
         perms->group = cpu_to_be32(inode->i_gid);
+
+       if (S_ISREG(inode->i_mode))
+               perms->dev = cpu_to_be32(inode->i_nlink);
+       else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
+               perms->dev = cpu_to_be32(inode->i_rdev);
+       else
+               perms->dev = 0;
  }
  
  static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode)
  {
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
+
         if (S_ISDIR(inode->i_mode)) {
                 struct hfsplus_cat_folder *folder;
  
@@ -93,13 +102,13 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
                 memset(folder, 0, sizeof(*folder));
                 folder->type = cpu_to_be16(HFSPLUS_FOLDER);
                 folder->id = cpu_to_be32(inode->i_ino);
-               HFSPLUS_I(inode).create_date =
+               HFSPLUS_I(inode)->create_date =
                         folder->create_date =
                         folder->content_mod_date =
                         folder->attribute_mod_date =
                         folder->access_date = hfsp_now2mt();
-               hfsplus_set_perms(inode, &folder->permissions);
-               if (inode == HFSPLUS_SB(inode->i_sb).hidden_dir)
+               hfsplus_cat_set_perms(inode, &folder->permissions);
+               if (inode == sbi->hidden_dir)
                         /* invisible and namelocked */
                         folder->user_info.frFlags = cpu_to_be16(0x5000);
                 return sizeof(*folder);
@@ -111,19 +120,19 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
                 file->type = cpu_to_be16(HFSPLUS_FILE);
                 file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS);
                 file->id = cpu_to_be32(cnid);
-               HFSPLUS_I(inode).create_date =
+               HFSPLUS_I(inode)->create_date =
                         file->create_date =
                         file->content_mod_date =
                         file->attribute_mod_date =
                         file->access_date = hfsp_now2mt();
                 if (cnid == inode->i_ino) {
-                       hfsplus_set_perms(inode, &file->permissions);
+                       hfsplus_cat_set_perms(inode, &file->permissions);
                         if (S_ISLNK(inode->i_mode)) {
                                 file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE);
                                 file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR);
                         } else {
-                               file->user_info.fdType = cpu_to_be32(HFSPLUS_SB(inode->i_sb).type);
-                               file->user_info.fdCreator = cpu_to_be32(HFSPLUS_SB(inode->i_sb).creator);
+                               file->user_info.fdType = cpu_to_be32(sbi->type);
+                               file->user_info.fdCreator = cpu_to_be32(sbi->creator);
                         }
                         if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
                                 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
@@ -131,8 +140,8 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
                         file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE);
                         file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR);
                         file->user_info.fdFlags = cpu_to_be16(0x100);
-                       file->create_date = HFSPLUS_I(HFSPLUS_SB(inode->i_sb).hidden_dir).create_date;
-                       file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode).dev);
+                       file->create_date = HFSPLUS_I(sbi->hidden_dir)->create_date;
+                       file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode)->linkid);
                 }
                 return sizeof(*file);
         }
@@ -180,15 +189,14 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
  
  int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode)
  {
+       struct super_block *sb = dir->i_sb;
         struct hfs_find_data fd;
-       struct super_block *sb;
         hfsplus_cat_entry entry;
         int entry_size;
         int err;
  
         dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink);
-       sb = dir->i_sb;
-       hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+       hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
  
         hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
         entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ?
@@ -234,7 +242,7 @@ err2:
  
  int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
  {
-       struct super_block *sb;
+       struct super_block *sb = dir->i_sb;
         struct hfs_find_data fd;
         struct hfsplus_fork_raw fork;
         struct list_head *pos;
@@ -242,8 +250,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
         u16 type;
  
         dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
-       sb = dir->i_sb;
-       hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+       hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
  
         if (!str) {
                 int len;
@@ -279,7 +286,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
                 hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC);
         }
  
-       list_for_each(pos, &HFSPLUS_I(dir).open_dir_list) {
+       list_for_each(pos, &HFSPLUS_I(dir)->open_dir_list) {
                 struct hfsplus_readdir_data *rd =
                         list_entry(pos, struct hfsplus_readdir_data, list);
                 if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0)
@@ -312,7 +319,7 @@ int hfsplus_rename_cat(u32 cnid,
                        struct inode *src_dir, struct qstr *src_name,
                        struct inode *dst_dir, struct qstr *dst_name)
  {
-       struct super_block *sb;
+       struct super_block *sb = src_dir->i_sb;
         struct hfs_find_data src_fd, dst_fd;
         hfsplus_cat_entry entry;
         int entry_size, type;
@@ -320,8 +327,7 @@ int hfsplus_rename_cat(u32 cnid,
  
         dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name,
                 dst_dir->i_ino, dst_name->name);
-       sb = src_dir->i_sb;
-       hfs_find_init(HFSPLUS_SB(sb).cat_tree, &src_fd);
+       hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd);
         dst_fd = src_fd;
  
         /* find the old dir entry and read the data */
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c

index 764fd1bdca882da08028e34b133930d9d155a364..d236d85ec9d73f703384ecaa9d6522fe7433c775 100644 (file)
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -39,7 +39,7 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
  
         dentry->d_op = &hfsplus_dentry_operations;
         dentry->d_fsdata = NULL;
-       hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+       hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
         hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
  again:
         err = hfs_brec_read(&fd, &entry, sizeof(entry));
@@ -68,9 +68,9 @@ again:
                 cnid = be32_to_cpu(entry.file.id);
                 if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) &&
                     entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
-                   (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb).hidden_dir).create_date ||
-                    entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode).create_date) &&
-                   HFSPLUS_SB(sb).hidden_dir) {
+                   (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->create_date ||
+                    entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode)->create_date) &&
+                   HFSPLUS_SB(sb)->hidden_dir) {
                         struct qstr str;
                         char name[32];
  
@@ -86,7 +86,8 @@ again:
                                 linkid = be32_to_cpu(entry.file.permissions.dev);
                                 str.len = sprintf(name, "iNode%d", linkid);
                                 str.name = name;
-                               hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_SB(sb).hidden_dir->i_ino, &str);
+                               hfsplus_cat_build_key(sb, fd.search_key,
+                                       HFSPLUS_SB(sb)->hidden_dir->i_ino, &str);
                                 goto again;
                         }
                 } else if (!dentry->d_fsdata)
@@ -101,7 +102,7 @@ again:
         if (IS_ERR(inode))
                 return ERR_CAST(inode);
         if (S_ISREG(inode->i_mode))
-               HFSPLUS_I(inode).dev = linkid;
+               HFSPLUS_I(inode)->linkid = linkid;
  out:
         d_add(dentry, inode);
         return NULL;
@@ -124,7 +125,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
         if (filp->f_pos >= inode->i_size)
                 return 0;
  
-       hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+       hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
         hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
         err = hfs_brec_find(&fd);
         if (err)
@@ -180,8 +181,9 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
                                 err = -EIO;
                                 goto out;
                         }
-                       if (HFSPLUS_SB(sb).hidden_dir &&
-                           HFSPLUS_SB(sb).hidden_dir->i_ino == be32_to_cpu(entry.folder.id))
+                       if (HFSPLUS_SB(sb)->hidden_dir &&
+                           HFSPLUS_SB(sb)->hidden_dir->i_ino ==
+                                       be32_to_cpu(entry.folder.id))
                                 goto next;
                         if (filldir(dirent, strbuf, len, filp->f_pos,
                                     be32_to_cpu(entry.folder.id), DT_DIR))
@@ -217,7 +219,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
                 }
                 filp->private_data = rd;
                 rd->file = filp;
-               list_add(&rd->list, &HFSPLUS_I(inode).open_dir_list);
+               list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list);
         }
         memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key));
  out:
@@ -229,38 +231,18 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file)
  {
         struct hfsplus_readdir_data *rd = file->private_data;
         if (rd) {
+               mutex_lock(&inode->i_mutex);
                 list_del(&rd->list);
+               mutex_unlock(&inode->i_mutex);
                 kfree(rd);
         }
         return 0;
  }
  
-static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
-                         struct nameidata *nd)
-{
-       struct inode *inode;
-       int res;
-
-       inode = hfsplus_new_inode(dir->i_sb, mode);
-       if (!inode)
-               return -ENOSPC;
-
-       res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
-       if (res) {
-               inode->i_nlink = 0;
-               hfsplus_delete_inode(inode);
-               iput(inode);
-               return res;
-       }
-       hfsplus_instantiate(dentry, inode, inode->i_ino);
-       mark_inode_dirty(inode);
-       return 0;
-}
-
  static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
                         struct dentry *dst_dentry)
  {
-       struct super_block *sb = dst_dir->i_sb;
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(dst_dir->i_sb);
         struct inode *inode = src_dentry->d_inode;
         struct inode *src_dir = src_dentry->d_parent->d_inode;
         struct qstr str;
@@ -270,7 +252,10 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
  
         if (HFSPLUS_IS_RSRC(inode))
                 return -EPERM;
+       if (!S_ISREG(inode->i_mode))
+               return -EPERM;
  
+       mutex_lock(&sbi->vh_mutex);
         if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) {
                 for (;;) {
                         get_random_bytes(&id, sizeof(cnid));
@@ -279,40 +264,41 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
                         str.len = sprintf(name, "iNode%d", id);
                         res = hfsplus_rename_cat(inode->i_ino,
                                                  src_dir, &src_dentry->d_name,
-                                                HFSPLUS_SB(sb).hidden_dir, &str);
+                                                sbi->hidden_dir, &str);
                         if (!res)
                                 break;
                         if (res != -EEXIST)
-                               return res;
+                               goto out;
                 }
-               HFSPLUS_I(inode).dev = id;
-               cnid = HFSPLUS_SB(sb).next_cnid++;
+               HFSPLUS_I(inode)->linkid = id;
+               cnid = sbi->next_cnid++;
                 src_dentry->d_fsdata = (void *)(unsigned long)cnid;
                 res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode);
                 if (res)
                         /* panic? */
-                       return res;
-               HFSPLUS_SB(sb).file_count++;
+                       goto out;
+               sbi->file_count++;
         }
-       cnid = HFSPLUS_SB(sb).next_cnid++;
+       cnid = sbi->next_cnid++;
         res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode);
         if (res)
-               return res;
+               goto out;
  
         inc_nlink(inode);
         hfsplus_instantiate(dst_dentry, inode, cnid);
         atomic_inc(&inode->i_count);
         inode->i_ctime = CURRENT_TIME_SEC;
         mark_inode_dirty(inode);
-       HFSPLUS_SB(sb).file_count++;
-       sb->s_dirt = 1;
-
-       return 0;
+       sbi->file_count++;
+       dst_dir->i_sb->s_dirt = 1;
+out:
+       mutex_unlock(&sbi->vh_mutex);
+       return res;
  }
  
  static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
  {
-       struct super_block *sb = dir->i_sb;
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
         struct inode *inode = dentry->d_inode;
         struct qstr str;
         char name[32];
@@ -322,21 +308,22 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
         if (HFSPLUS_IS_RSRC(inode))
                 return -EPERM;
  
+       mutex_lock(&sbi->vh_mutex);
         cnid = (u32)(unsigned long)dentry->d_fsdata;
         if (inode->i_ino == cnid &&
-           atomic_read(&HFSPLUS_I(inode).opencnt)) {
+           atomic_read(&HFSPLUS_I(inode)->opencnt)) {
                 str.name = name;
                 str.len = sprintf(name, "temp%lu", inode->i_ino);
                 res = hfsplus_rename_cat(inode->i_ino,
                                          dir, &dentry->d_name,
-                                        HFSPLUS_SB(sb).hidden_dir, &str);
+                                        sbi->hidden_dir, &str);
                 if (!res)
                         inode->i_flags |= S_DEAD;
-               return res;
+               goto out;
         }
         res = hfsplus_delete_cat(cnid, dir, &dentry->d_name);
         if (res)
-               return res;
+               goto out;
  
         if (inode->i_nlink > 0)
                 drop_nlink(inode);
@@ -344,10 +331,10 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
                 clear_nlink(inode);
         if (!inode->i_nlink) {
                 if (inode->i_ino != cnid) {
-                       HFSPLUS_SB(sb).file_count--;
-                       if (!atomic_read(&HFSPLUS_I(inode).opencnt)) {
+                       sbi->file_count--;
+                       if (!atomic_read(&HFSPLUS_I(inode)->opencnt)) {
                                 res = hfsplus_delete_cat(inode->i_ino,
-                                                        HFSPLUS_SB(sb).hidden_dir,
+                                                        sbi->hidden_dir,
                                                          NULL);
                                 if (!res)
                                         hfsplus_delete_inode(inode);
@@ -356,107 +343,108 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
                 } else
                         hfsplus_delete_inode(inode);
         } else
-               HFSPLUS_SB(sb).file_count--;
+               sbi->file_count--;
         inode->i_ctime = CURRENT_TIME_SEC;
         mark_inode_dirty(inode);
-
+out:
+       mutex_unlock(&sbi->vh_mutex);
         return res;
  }
  
-static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-       struct inode *inode;
-       int res;
-
-       inode = hfsplus_new_inode(dir->i_sb, S_IFDIR | mode);
-       if (!inode)
-               return -ENOSPC;
-
-       res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
-       if (res) {
-               inode->i_nlink = 0;
-               hfsplus_delete_inode(inode);
-               iput(inode);
-               return res;
-       }
-       hfsplus_instantiate(dentry, inode, inode->i_ino);
-       mark_inode_dirty(inode);
-       return 0;
-}
-
  static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
  {
-       struct inode *inode;
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
+       struct inode *inode = dentry->d_inode;
         int res;
  
-       inode = dentry->d_inode;
         if (inode->i_size != 2)
                 return -ENOTEMPTY;
+
+       mutex_lock(&sbi->vh_mutex);
         res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name);
         if (res)
-               return res;
+               goto out;
         clear_nlink(inode);
         inode->i_ctime = CURRENT_TIME_SEC;
         hfsplus_delete_inode(inode);
         mark_inode_dirty(inode);
-       return 0;
+out:
+       mutex_unlock(&sbi->vh_mutex);
+       return res;
  }
  
  static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
                            const char *symname)
  {
-       struct super_block *sb;
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
         struct inode *inode;
-       int res;
+       int res = -ENOSPC;
  
-       sb = dir->i_sb;
-       inode = hfsplus_new_inode(sb, S_IFLNK | S_IRWXUGO);
+       mutex_lock(&sbi->vh_mutex);
+       inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO);
         if (!inode)
-               return -ENOSPC;
+               goto out;
  
         res = page_symlink(inode, symname, strlen(symname) + 1);
-       if (res) {
-               inode->i_nlink = 0;
-               hfsplus_delete_inode(inode);
-               iput(inode);
-               return res;
-       }
+       if (res)
+               goto out_err;
  
-       mark_inode_dirty(inode);
         res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
+       if (res)
+               goto out_err;
  
-       if (!res) {
-               hfsplus_instantiate(dentry, inode, inode->i_ino);
-               mark_inode_dirty(inode);
-       }
+       hfsplus_instantiate(dentry, inode, inode->i_ino);
+       mark_inode_dirty(inode);
+       goto out;
  
+out_err:
+       inode->i_nlink = 0;
+       hfsplus_delete_inode(inode);
+       iput(inode);
+out:
+       mutex_unlock(&sbi->vh_mutex);
         return res;
  }
  
  static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
                          int mode, dev_t rdev)
  {
-       struct super_block *sb;
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
         struct inode *inode;
-       int res;
+       int res = -ENOSPC;
  
-       sb = dir->i_sb;
-       inode = hfsplus_new_inode(sb, mode);
+       mutex_lock(&sbi->vh_mutex);
+       inode = hfsplus_new_inode(dir->i_sb, mode);
         if (!inode)
-               return -ENOSPC;
+               goto out;
+
+       if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode))
+               init_special_inode(inode, mode, rdev);
  
         res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
         if (res) {
                 inode->i_nlink = 0;
                 hfsplus_delete_inode(inode);
                 iput(inode);
-               return res;
+               goto out;
         }
-       init_special_inode(inode, mode, rdev);
+
         hfsplus_instantiate(dentry, inode, inode->i_ino);
         mark_inode_dirty(inode);
+out:
+       mutex_unlock(&sbi->vh_mutex);
+       return res;
+}
  
-       return 0;
+static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
+                         struct nameidata *nd)
+{
+       return hfsplus_mknod(dir, dentry, mode, 0);
+}
+
+static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+       return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0);
  }
  
  static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -466,7 +454,10 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
  
         /* Unlink destination if it already exists */
         if (new_dentry->d_inode) {
-               res = hfsplus_unlink(new_dir, new_dentry);
+               if (S_ISDIR(new_dentry->d_inode->i_mode))
+                       res = hfsplus_rmdir(new_dir, new_dentry);
+               else
+                       res = hfsplus_unlink(new_dir, new_dentry);
                 if (res)
                         return res;
         }
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c

index 0022eec63cdacd97c2a438b8d9f623ff6be88dd4..0c9cb1820a523fae02c6bf2f37e6fbdc5c5dceeb 100644 (file)
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -85,35 +85,49 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext)
  
  static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd)
  {
+       struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
         int res;
  
-       hfsplus_ext_build_key(fd->search_key, inode->i_ino, HFSPLUS_I(inode).cached_start,
-                             HFSPLUS_IS_RSRC(inode) ?  HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
+       WARN_ON(!mutex_is_locked(&hip->extents_lock));
+
+       hfsplus_ext_build_key(fd->search_key, inode->i_ino, hip->cached_start,
+                             HFSPLUS_IS_RSRC(inode) ?
+                               HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
+
         res = hfs_brec_find(fd);
-       if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_NEW) {
+       if (hip->flags & HFSPLUS_FLG_EXT_NEW) {
                 if (res != -ENOENT)
                         return;
-               hfs_brec_insert(fd, HFSPLUS_I(inode).cached_extents, sizeof(hfsplus_extent_rec));
-               HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+               hfs_brec_insert(fd, hip->cached_extents,
+                               sizeof(hfsplus_extent_rec));
+               hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
         } else {
                 if (res)
                         return;
-               hfs_bnode_write(fd->bnode, HFSPLUS_I(inode).cached_extents, fd->entryoffset, fd->entrylength);
-               HFSPLUS_I(inode).flags &= ~HFSPLUS_FLG_EXT_DIRTY;
+               hfs_bnode_write(fd->bnode, hip->cached_extents,
+                               fd->entryoffset, fd->entrylength);
+               hip->flags &= ~HFSPLUS_FLG_EXT_DIRTY;
         }
  }
  
-void hfsplus_ext_write_extent(struct inode *inode)
+static void hfsplus_ext_write_extent_locked(struct inode *inode)
  {
-       if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY) {
+       if (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_EXT_DIRTY) {
                 struct hfs_find_data fd;
  
-               hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd);
+               hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
                 __hfsplus_ext_write_extent(inode, &fd);
                 hfs_find_exit(&fd);
         }
  }
  
+void hfsplus_ext_write_extent(struct inode *inode)
+{
+       mutex_lock(&HFSPLUS_I(inode)->extents_lock);
+       hfsplus_ext_write_extent_locked(inode);
+       mutex_unlock(&HFSPLUS_I(inode)->extents_lock);
+}
+
  static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
                                             struct hfsplus_extent *extent,
                                             u32 cnid, u32 block, u8 type)
@@ -136,33 +150,39 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
  
  static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block)
  {
+       struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
         int res;
  
-       if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY)
+       WARN_ON(!mutex_is_locked(&hip->extents_lock));
+
+       if (hip->flags & HFSPLUS_FLG_EXT_DIRTY)
                 __hfsplus_ext_write_extent(inode, fd);
  
-       res = __hfsplus_ext_read_extent(fd, HFSPLUS_I(inode).cached_extents, inode->i_ino,
-                                       block, HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
+       res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino,
+                                       block, HFSPLUS_IS_RSRC(inode) ?
+                                               HFSPLUS_TYPE_RSRC :
+                                               HFSPLUS_TYPE_DATA);
         if (!res) {
-               HFSPLUS_I(inode).cached_start = be32_to_cpu(fd->key->ext.start_block);
-               HFSPLUS_I(inode).cached_blocks = hfsplus_ext_block_count(HFSPLUS_I(inode).cached_extents);
+               hip->cached_start = be32_to_cpu(fd->key->ext.start_block);
+               hip->cached_blocks = hfsplus_ext_block_count(hip->cached_extents);
         } else {
-               HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0;
-               HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+               hip->cached_start = hip->cached_blocks = 0;
+               hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
         }
         return res;
  }
  
  static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
  {
+       struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
         struct hfs_find_data fd;
         int res;
  
-       if (block >= HFSPLUS_I(inode).cached_start &&
-           block < HFSPLUS_I(inode).cached_start + HFSPLUS_I(inode).cached_blocks)
+       if (block >= hip->cached_start &&
+           block < hip->cached_start + hip->cached_blocks)
                 return 0;
  
-       hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd);
+       hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
         res = __hfsplus_ext_cache_extent(&fd, inode, block);
         hfs_find_exit(&fd);
         return res;
@@ -172,21 +192,21 @@ static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
  int hfsplus_get_block(struct inode *inode, sector_t iblock,
                       struct buffer_head *bh_result, int create)
  {
-       struct super_block *sb;
+       struct super_block *sb = inode->i_sb;
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+       struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
         int res = -EIO;
         u32 ablock, dblock, mask;
         int shift;
  
-       sb = inode->i_sb;
-
         /* Convert inode block to disk allocation block */
-       shift = HFSPLUS_SB(sb).alloc_blksz_shift - sb->s_blocksize_bits;
-       ablock = iblock >> HFSPLUS_SB(sb).fs_shift;
+       shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
+       ablock = iblock >> sbi->fs_shift;
  
-       if (iblock >= HFSPLUS_I(inode).fs_blocks) {
-               if (iblock > HFSPLUS_I(inode).fs_blocks || !create)
+       if (iblock >= hip->fs_blocks) {
+               if (iblock > hip->fs_blocks || !create)
                         return -EIO;
-               if (ablock >= HFSPLUS_I(inode).alloc_blocks) {
+               if (ablock >= hip->alloc_blocks) {
                         res = hfsplus_file_extend(inode);
                         if (res)
                                 return res;
@@ -194,33 +214,33 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
         } else
                 create = 0;
  
-       if (ablock < HFSPLUS_I(inode).first_blocks) {
-               dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).first_extents, ablock);
+       if (ablock < hip->first_blocks) {
+               dblock = hfsplus_ext_find_block(hip->first_extents, ablock);
                 goto done;
         }
  
         if (inode->i_ino == HFSPLUS_EXT_CNID)
                 return -EIO;
  
-       mutex_lock(&HFSPLUS_I(inode).extents_lock);
+       mutex_lock(&hip->extents_lock);
         res = hfsplus_ext_read_extent(inode, ablock);
         if (!res) {
-               dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).cached_extents, ablock -
-                                            HFSPLUS_I(inode).cached_start);
+               dblock = hfsplus_ext_find_block(hip->cached_extents,
+                                               ablock - hip->cached_start);
         } else {
-               mutex_unlock(&HFSPLUS_I(inode).extents_lock);
+               mutex_unlock(&hip->extents_lock);
                 return -EIO;
         }
-       mutex_unlock(&HFSPLUS_I(inode).extents_lock);
+       mutex_unlock(&hip->extents_lock);
  
  done:
         dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock);
-       mask = (1 << HFSPLUS_SB(sb).fs_shift) - 1;
-       map_bh(bh_result, sb, (dblock << HFSPLUS_SB(sb).fs_shift) + HFSPLUS_SB(sb).blockoffset + (iblock & mask));
+       mask = (1 << sbi->fs_shift) - 1;
+       map_bh(bh_result, sb, (dblock << sbi->fs_shift) + sbi->blockoffset + (iblock & mask));
         if (create) {
                 set_buffer_new(bh_result);
-               HFSPLUS_I(inode).phys_size += sb->s_blocksize;
-               HFSPLUS_I(inode).fs_blocks++;
+               hip->phys_size += sb->s_blocksize;
+               hip->fs_blocks++;
                 inode_add_bytes(inode, sb->s_blocksize);
                 mark_inode_dirty(inode);
         }
@@ -327,7 +347,7 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
         if (total_blocks == blocks)
                 return 0;
  
-       hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd);
+       hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
         do {
                 res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid,
                                                 total_blocks, type);
@@ -348,29 +368,33 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
  int hfsplus_file_extend(struct inode *inode)
  {
         struct super_block *sb = inode->i_sb;
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+       struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
         u32 start, len, goal;
         int res;
  
-       if (HFSPLUS_SB(sb).alloc_file->i_size * 8 < HFSPLUS_SB(sb).total_blocks - HFSPLUS_SB(sb).free_blocks + 8) {
+       if (sbi->alloc_file->i_size * 8 <
+           sbi->total_blocks - sbi->free_blocks + 8) {
                 // extend alloc file
-               printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n", HFSPLUS_SB(sb).alloc_file->i_size * 8,
-                       HFSPLUS_SB(sb).total_blocks, HFSPLUS_SB(sb).free_blocks);
+               printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n",
+                               sbi->alloc_file->i_size * 8,
+                               sbi->total_blocks, sbi->free_blocks);
                 return -ENOSPC;
         }
  
-       mutex_lock(&HFSPLUS_I(inode).extents_lock);
-       if (HFSPLUS_I(inode).alloc_blocks == HFSPLUS_I(inode).first_blocks)
-               goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).first_extents);
+       mutex_lock(&hip->extents_lock);
+       if (hip->alloc_blocks == hip->first_blocks)
+               goal = hfsplus_ext_lastblock(hip->first_extents);
         else {
-               res = hfsplus_ext_read_extent(inode, HFSPLUS_I(inode).alloc_blocks);
+               res = hfsplus_ext_read_extent(inode, hip->alloc_blocks);
                 if (res)
                         goto out;
-               goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).cached_extents);
+               goal = hfsplus_ext_lastblock(hip->cached_extents);
         }
  
-       len = HFSPLUS_I(inode).clump_blocks;
-       start = hfsplus_block_allocate(sb, HFSPLUS_SB(sb).total_blocks, goal, &len);
-       if (start >= HFSPLUS_SB(sb).total_blocks) {
+       len = hip->clump_blocks;
+       start = hfsplus_block_allocate(sb, sbi->total_blocks, goal, &len);
+       if (start >= sbi->total_blocks) {
                 start = hfsplus_block_allocate(sb, goal, 0, &len);
                 if (start >= goal) {
                         res = -ENOSPC;
@@ -379,56 +403,56 @@ int hfsplus_file_extend(struct inode *inode)
         }
  
         dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
-       if (HFSPLUS_I(inode).alloc_blocks <= HFSPLUS_I(inode).first_blocks) {
-               if (!HFSPLUS_I(inode).first_blocks) {
+
+       if (hip->alloc_blocks <= hip->first_blocks) {
+               if (!hip->first_blocks) {
                         dprint(DBG_EXTENT, "first extents\n");
                         /* no extents yet */
-                       HFSPLUS_I(inode).first_extents[0].start_block = cpu_to_be32(start);
-                       HFSPLUS_I(inode).first_extents[0].block_count = cpu_to_be32(len);
+                       hip->first_extents[0].start_block = cpu_to_be32(start);
+                       hip->first_extents[0].block_count = cpu_to_be32(len);
                         res = 0;
                 } else {
                         /* try to append to extents in inode */
-                       res = hfsplus_add_extent(HFSPLUS_I(inode).first_extents,
-                                                HFSPLUS_I(inode).alloc_blocks,
+                       res = hfsplus_add_extent(hip->first_extents,
+                                                hip->alloc_blocks,
                                                  start, len);
                         if (res == -ENOSPC)
                                 goto insert_extent;
                 }
                 if (!res) {
-                       hfsplus_dump_extent(HFSPLUS_I(inode).first_extents);
-                       HFSPLUS_I(inode).first_blocks += len;
+                       hfsplus_dump_extent(hip->first_extents);
+                       hip->first_blocks += len;
                 }
         } else {
-               res = hfsplus_add_extent(HFSPLUS_I(inode).cached_extents,
-                                        HFSPLUS_I(inode).alloc_blocks -
-                                        HFSPLUS_I(inode).cached_start,
+               res = hfsplus_add_extent(hip->cached_extents,
+                                        hip->alloc_blocks - hip->cached_start,
                                          start, len);
                 if (!res) {
-                       hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents);
-                       HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY;
-                       HFSPLUS_I(inode).cached_blocks += len;
+                       hfsplus_dump_extent(hip->cached_extents);
+                       hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
+                       hip->cached_blocks += len;
                 } else if (res == -ENOSPC)
                         goto insert_extent;
         }
  out:
-       mutex_unlock(&HFSPLUS_I(inode).extents_lock);
+       mutex_unlock(&hip->extents_lock);
         if (!res) {
-               HFSPLUS_I(inode).alloc_blocks += len;
+               hip->alloc_blocks += len;
                 mark_inode_dirty(inode);
         }
         return res;
  
  insert_extent:
         dprint(DBG_EXTENT, "insert new extent\n");
-       hfsplus_ext_write_extent(inode);
+       hfsplus_ext_write_extent_locked(inode);
  
-       memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec));
-       HFSPLUS_I(inode).cached_extents[0].start_block = cpu_to_be32(start);
-       HFSPLUS_I(inode).cached_extents[0].block_count = cpu_to_be32(len);
-       hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents);
-       HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW;
-       HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).alloc_blocks;
-       HFSPLUS_I(inode).cached_blocks = len;
+       memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
+       hip->cached_extents[0].start_block = cpu_to_be32(start);
+       hip->cached_extents[0].block_count = cpu_to_be32(len);
+       hfsplus_dump_extent(hip->cached_extents);
+       hip->flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW;
+       hip->cached_start = hip->alloc_blocks;
+       hip->cached_blocks = len;
  
         res = 0;
         goto out;
@@ -437,13 +461,15 @@ insert_extent:
  void hfsplus_file_truncate(struct inode *inode)
  {
         struct super_block *sb = inode->i_sb;
+       struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
         struct hfs_find_data fd;
         u32 alloc_cnt, blk_cnt, start;
         int res;
  
-       dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", inode->i_ino,
-              (long long)HFSPLUS_I(inode).phys_size, inode->i_size);
-       if (inode->i_size > HFSPLUS_I(inode).phys_size) {
+       dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n",
+               inode->i_ino, (long long)hip->phys_size, inode->i_size);
+
+       if (inode->i_size > hip->phys_size) {
                 struct address_space *mapping = inode->i_mapping;
                 struct page *page;
                 void *fsdata;
@@ -460,47 +486,48 @@ void hfsplus_file_truncate(struct inode *inode)
                         return;
                 mark_inode_dirty(inode);
                 return;
-       } else if (inode->i_size == HFSPLUS_I(inode).phys_size)
+       } else if (inode->i_size == hip->phys_size)
                 return;
  
-       blk_cnt = (inode->i_size + HFSPLUS_SB(sb).alloc_blksz - 1) >> HFSPLUS_SB(sb).alloc_blksz_shift;
-       alloc_cnt = HFSPLUS_I(inode).alloc_blocks;
+       blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >>
+                       HFSPLUS_SB(sb)->alloc_blksz_shift;
+       alloc_cnt = hip->alloc_blocks;
         if (blk_cnt == alloc_cnt)
                 goto out;
  
-       mutex_lock(&HFSPLUS_I(inode).extents_lock);
-       hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd);
+       mutex_lock(&hip->extents_lock);
+       hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
         while (1) {
-               if (alloc_cnt == HFSPLUS_I(inode).first_blocks) {
-                       hfsplus_free_extents(sb, HFSPLUS_I(inode).first_extents,
+               if (alloc_cnt == hip->first_blocks) {
+                       hfsplus_free_extents(sb, hip->first_extents,
                                              alloc_cnt, alloc_cnt - blk_cnt);
-                       hfsplus_dump_extent(HFSPLUS_I(inode).first_extents);
-                       HFSPLUS_I(inode).first_blocks = blk_cnt;
+                       hfsplus_dump_extent(hip->first_extents);
+                       hip->first_blocks = blk_cnt;
                         break;
                 }
                 res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt);
                 if (res)
                         break;
-               start = HFSPLUS_I(inode).cached_start;
-               hfsplus_free_extents(sb, HFSPLUS_I(inode).cached_extents,
+               start = hip->cached_start;
+               hfsplus_free_extents(sb, hip->cached_extents,
                                      alloc_cnt - start, alloc_cnt - blk_cnt);
-               hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents);
+               hfsplus_dump_extent(hip->cached_extents);
                 if (blk_cnt > start) {
-                       HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY;
+                       hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
                         break;
                 }
                 alloc_cnt = start;
-               HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0;
-               HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+               hip->cached_start = hip->cached_blocks = 0;
+               hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
                 hfs_brec_remove(&fd);
         }
         hfs_find_exit(&fd);
-       mutex_unlock(&HFSPLUS_I(inode).extents_lock);
+       mutex_unlock(&hip->extents_lock);
  
-       HFSPLUS_I(inode).alloc_blocks = blk_cnt;
+       hip->alloc_blocks = blk_cnt;
  out:
-       HFSPLUS_I(inode).phys_size = inode->i_size;
-       HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
-       inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits);
+       hip->phys_size = inode->i_size;
+       hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+       inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
         mark_inode_dirty(inode);
  }
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h

index dc856be3c2b010854c78da1049b86d2f7ce48147..cb3653efb57a2dcf2285a19fcb7262cb7a1ba509 100644 (file)
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -62,7 +62,7 @@ struct hfs_btree {
         unsigned int depth;
  
         //unsigned int map1_size, map_size;
-       struct semaphore tree_lock;
+       struct mutex tree_lock;
  
         unsigned int pages_per_bnode;
         spinlock_t hash_lock;
@@ -121,16 +121,21 @@ struct hfsplus_sb_info {
         u32 sect_count;
         int fs_shift;
  
-       /* Stuff in host order from Vol Header */
+       /* immutable data from the volume header */
         u32 alloc_blksz;
         int alloc_blksz_shift;
         u32 total_blocks;
+       u32 data_clump_blocks, rsrc_clump_blocks;
+
+       /* mutable data from the volume header, protected by alloc_mutex */
         u32 free_blocks;
-       u32 next_alloc;
+       struct mutex alloc_mutex;
+
+       /* mutable data from the volume header, protected by vh_mutex */
         u32 next_cnid;
         u32 file_count;
         u32 folder_count;
-       u32 data_clump_blocks, rsrc_clump_blocks;
+       struct mutex vh_mutex;
  
         /* Config options */
         u32 creator;
@@ -143,40 +148,50 @@ struct hfsplus_sb_info {
         int part, session;
  
         unsigned long flags;
-
-       struct hlist_head rsrc_inodes;
  };
  
-#define HFSPLUS_SB_WRITEBACKUP 0x0001
-#define HFSPLUS_SB_NODECOMPOSE 0x0002
-#define HFSPLUS_SB_FORCE       0x0004
-#define HFSPLUS_SB_HFSX                0x0008
-#define HFSPLUS_SB_CASEFOLD    0x0010
+#define HFSPLUS_SB_WRITEBACKUP 0
+#define HFSPLUS_SB_NODECOMPOSE 1
+#define HFSPLUS_SB_FORCE       2
+#define HFSPLUS_SB_HFSX                3
+#define HFSPLUS_SB_CASEFOLD    4
  
  
  struct hfsplus_inode_info {
-       struct mutex extents_lock;
-       u32 clump_blocks, alloc_blocks;
-       sector_t fs_blocks;
-       /* Allocation extents from catalog record or volume header */
-       hfsplus_extent_rec first_extents;
-       u32 first_blocks;
-       hfsplus_extent_rec cached_extents;
-       u32 cached_start, cached_blocks;
         atomic_t opencnt;
  
-       struct inode *rsrc_inode;
+       /*
+        * Extent allocation information, protected by extents_lock.
+        */
+       u32 first_blocks;
+       u32 clump_blocks;
+       u32 alloc_blocks;
+       u32 cached_start;
+       u32 cached_blocks;
+       hfsplus_extent_rec first_extents;
+       hfsplus_extent_rec cached_extents;
         unsigned long flags;
+       struct mutex extents_lock;
  
+       /*
+        * Immutable data.
+        */
+       struct inode *rsrc_inode;
         __be32 create_date;
-       /* Device number in hfsplus_permissions in catalog */
-       u32 dev;
-       /* BSD system and user file flags */
-       u8 rootflags;
-       u8 userflags;
  
+       /*
+        * Protected by sbi->vh_mutex.
+        */
+       u32 linkid;
+
+       /*
+        * Protected by i_mutex.
+        */
+       sector_t fs_blocks;
+       u8 userflags;           /* BSD user file flags */
         struct list_head open_dir_list;
         loff_t phys_size;
+
         struct inode vfs_inode;
  };
  
@@ -184,8 +199,8 @@ struct hfsplus_inode_info {
  #define HFSPLUS_FLG_EXT_DIRTY  0x0002
  #define HFSPLUS_FLG_EXT_NEW    0x0004
  
-#define HFSPLUS_IS_DATA(inode)   (!(HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC))
-#define HFSPLUS_IS_RSRC(inode)   (HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC)
+#define HFSPLUS_IS_DATA(inode)   (!(HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC))
+#define HFSPLUS_IS_RSRC(inode)   (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC)
  
  struct hfs_find_data {
         /* filled by caller */
@@ -311,6 +326,7 @@ int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *);
  int hfsplus_delete_cat(u32, struct inode *, struct qstr *);
  int hfsplus_rename_cat(u32, struct inode *, struct qstr *,
                        struct inode *, struct qstr *);
+void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms);
  
  /* dir.c */
  extern const struct inode_operations hfsplus_dir_inode_operations;
@@ -372,26 +388,15 @@ int hfsplus_read_wrapper(struct super_block *);
  int hfs_part_find(struct super_block *, sector_t *, sector_t *);
  
  /* access macros */
-/*
  static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
  {
         return sb->s_fs_info;
  }
+
  static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
  {
         return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
  }
-*/
-#define HFSPLUS_SB(super)      (*(struct hfsplus_sb_info *)(super)->s_fs_info)
-#define HFSPLUS_I(inode)       (*list_entry(inode, struct hfsplus_inode_info, vfs_inode))
-
-#if 1
-#define hfsplus_kmap(p)                ({ struct page *__p = (p); kmap(__p); })
-#define hfsplus_kunmap(p)      ({ struct page *__p = (p); kunmap(__p); __p; })
-#else
-#define hfsplus_kmap(p)                kmap(p)
-#define hfsplus_kunmap(p)      kunmap(p)
-#endif
  
  #define sb_bread512(sb, sec, data) ({                  \
         struct buffer_head *__bh;                       \
@@ -419,6 +424,4 @@ static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
  #define hfsp_ut2mt(t)          __hfsp_ut2mt((t).tv_sec)
  #define hfsp_now2mt()          __hfsp_ut2mt(get_seconds())
  
-#define kdev_t_to_nr(x)                (x)
-
  #endif
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h

index fe99fe8db61a3cb279885cee2c73cf5743704604..6892899fd6fbbabce55d3fe1f2c8915fe3f33d89 100644 (file)
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -200,6 +200,7 @@ struct hfsplus_cat_key {
         struct hfsplus_unistr name;
  } __packed;
  
+#define HFSPLUS_CAT_KEYLEN     (sizeof(struct hfsplus_cat_key))
  
  /* Structs from hfs.h */
  struct hfsp_point {
@@ -323,7 +324,7 @@ struct hfsplus_ext_key {
         __be32 start_block;
  } __packed;
  
-#define HFSPLUS_EXT_KEYLEN 12
+#define HFSPLUS_EXT_KEYLEN     sizeof(struct hfsplus_ext_key)
  
  /* HFS+ generic BTree key */
  typedef union {
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c

index c5a979d62c657a866685dac4743fa01515b1653a..78449280dae08471958a328afca7f225fe9d744f 100644 (file)
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -36,7 +36,7 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
         *pagep = NULL;
         ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
                                 hfsplus_get_block,
-                               &HFSPLUS_I(mapping->host).phys_size);
+                               &HFSPLUS_I(mapping->host)->phys_size);
         if (unlikely(ret)) {
                 loff_t isize = mapping->host->i_size;
                 if (pos + len > isize)
@@ -62,13 +62,13 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
  
         switch (inode->i_ino) {
         case HFSPLUS_EXT_CNID:
-               tree = HFSPLUS_SB(sb).ext_tree;
+               tree = HFSPLUS_SB(sb)->ext_tree;
                 break;
         case HFSPLUS_CAT_CNID:
-               tree = HFSPLUS_SB(sb).cat_tree;
+               tree = HFSPLUS_SB(sb)->cat_tree;
                 break;
         case HFSPLUS_ATTR_CNID:
-               tree = HFSPLUS_SB(sb).attr_tree;
+               tree = HFSPLUS_SB(sb)->attr_tree;
                 break;
         default:
                 BUG();
@@ -172,12 +172,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
         struct hfs_find_data fd;
         struct super_block *sb = dir->i_sb;
         struct inode *inode = NULL;
+       struct hfsplus_inode_info *hip;
         int err;
  
         if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc"))
                 goto out;
  
-       inode = HFSPLUS_I(dir).rsrc_inode;
+       inode = HFSPLUS_I(dir)->rsrc_inode;
         if (inode)
                 goto out;
  
@@ -185,12 +186,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
         if (!inode)
                 return ERR_PTR(-ENOMEM);
  
+       hip = HFSPLUS_I(inode);
         inode->i_ino = dir->i_ino;
-       INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
-       mutex_init(&HFSPLUS_I(inode).extents_lock);
-       HFSPLUS_I(inode).flags = HFSPLUS_FLG_RSRC;
+       INIT_LIST_HEAD(&hip->open_dir_list);
+       mutex_init(&hip->extents_lock);
+       hip->flags = HFSPLUS_FLG_RSRC;
  
-       hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+       hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
         err = hfsplus_find_cat(sb, dir->i_ino, &fd);
         if (!err)
                 err = hfsplus_cat_read_inode(inode, &fd);
@@ -199,10 +201,18 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
                 iput(inode);
                 return ERR_PTR(err);
         }
-       HFSPLUS_I(inode).rsrc_inode = dir;
-       HFSPLUS_I(dir).rsrc_inode = inode;
+       hip->rsrc_inode = dir;
+       HFSPLUS_I(dir)->rsrc_inode = inode;
         igrab(dir);
-       hlist_add_head(&inode->i_hash, &HFSPLUS_SB(sb).rsrc_inodes);
+
+       /*
+        * __mark_inode_dirty expects inodes to be hashed.  Since we don't
+        * want resource fork inodes in the regular inode space, we make them
+        * appear hashed, but do not put on any lists.  hlist_del()
+        * will work fine and require no locking.
+        */
+       inode->i_hash.pprev = &inode->i_hash.next;
+
         mark_inode_dirty(inode);
  out:
         d_add(dentry, inode);
@@ -211,30 +221,27 @@ out:
  
  static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir)
  {
-       struct super_block *sb = inode->i_sb;
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
         u16 mode;
  
         mode = be16_to_cpu(perms->mode);
  
         inode->i_uid = be32_to_cpu(perms->owner);
         if (!inode->i_uid && !mode)
-               inode->i_uid = HFSPLUS_SB(sb).uid;
+               inode->i_uid = sbi->uid;
  
         inode->i_gid = be32_to_cpu(perms->group);
         if (!inode->i_gid && !mode)
-               inode->i_gid = HFSPLUS_SB(sb).gid;
+               inode->i_gid = sbi->gid;
  
         if (dir) {
-               mode = mode ? (mode & S_IALLUGO) :
-                       (S_IRWXUGO & ~(HFSPLUS_SB(sb).umask));
+               mode = mode ? (mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask));
                 mode |= S_IFDIR;
         } else if (!mode)
-               mode = S_IFREG | ((S_IRUGO|S_IWUGO) &
-                       ~(HFSPLUS_SB(sb).umask));
+               mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask));
         inode->i_mode = mode;
  
-       HFSPLUS_I(inode).rootflags = perms->rootflags;
-       HFSPLUS_I(inode).userflags = perms->userflags;
+       HFSPLUS_I(inode)->userflags = perms->userflags;
         if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE)
                 inode->i_flags |= S_IMMUTABLE;
         else
@@ -245,30 +252,13 @@ static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, i
                 inode->i_flags &= ~S_APPEND;
  }
  
-static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
-{
-       if (inode->i_flags & S_IMMUTABLE)
-               perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
-       else
-               perms->rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
-       if (inode->i_flags & S_APPEND)
-               perms->rootflags |= HFSPLUS_FLG_APPEND;
-       else
-               perms->rootflags &= ~HFSPLUS_FLG_APPEND;
-       perms->userflags = HFSPLUS_I(inode).userflags;
-       perms->mode = cpu_to_be16(inode->i_mode);
-       perms->owner = cpu_to_be32(inode->i_uid);
-       perms->group = cpu_to_be32(inode->i_gid);
-       perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev);
-}
-
  static int hfsplus_file_open(struct inode *inode, struct file *file)
  {
         if (HFSPLUS_IS_RSRC(inode))
-               inode = HFSPLUS_I(inode).rsrc_inode;
+               inode = HFSPLUS_I(inode)->rsrc_inode;
         if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
                 return -EOVERFLOW;
-       atomic_inc(&HFSPLUS_I(inode).opencnt);
+       atomic_inc(&HFSPLUS_I(inode)->opencnt);
         return 0;
  }
  
@@ -277,12 +267,13 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
         struct super_block *sb = inode->i_sb;
  
         if (HFSPLUS_IS_RSRC(inode))
-               inode = HFSPLUS_I(inode).rsrc_inode;
-       if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) {
+               inode = HFSPLUS_I(inode)->rsrc_inode;
+       if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) {
                 mutex_lock(&inode->i_mutex);
                 hfsplus_file_truncate(inode);
                 if (inode->i_flags & S_DEAD) {
-                       hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL);
+                       hfsplus_delete_cat(inode->i_ino,
+                                          HFSPLUS_SB(sb)->hidden_dir, NULL);
                         hfsplus_delete_inode(inode);
                 }
                 mutex_unlock(&inode->i_mutex);
@@ -361,47 +352,52 @@ static const struct file_operations hfsplus_file_operations = {
  
  struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
  {
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
         struct inode *inode = new_inode(sb);
+       struct hfsplus_inode_info *hip;
+
         if (!inode)
                 return NULL;
  
-       inode->i_ino = HFSPLUS_SB(sb).next_cnid++;
+       inode->i_ino = sbi->next_cnid++;
         inode->i_mode = mode;
         inode->i_uid = current_fsuid();
         inode->i_gid = current_fsgid();
         inode->i_nlink = 1;
         inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
-       INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
-       mutex_init(&HFSPLUS_I(inode).extents_lock);
-       atomic_set(&HFSPLUS_I(inode).opencnt, 0);
-       HFSPLUS_I(inode).flags = 0;
-       memset(HFSPLUS_I(inode).first_extents, 0, sizeof(hfsplus_extent_rec));
-       memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec));
-       HFSPLUS_I(inode).alloc_blocks = 0;
-       HFSPLUS_I(inode).first_blocks = 0;
-       HFSPLUS_I(inode).cached_start = 0;
-       HFSPLUS_I(inode).cached_blocks = 0;
-       HFSPLUS_I(inode).phys_size = 0;
-       HFSPLUS_I(inode).fs_blocks = 0;
-       HFSPLUS_I(inode).rsrc_inode = NULL;
+
+       hip = HFSPLUS_I(inode);
+       INIT_LIST_HEAD(&hip->open_dir_list);
+       mutex_init(&hip->extents_lock);
+       atomic_set(&hip->opencnt, 0);
+       hip->flags = 0;
+       memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec));
+       memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
+       hip->alloc_blocks = 0;
+       hip->first_blocks = 0;
+       hip->cached_start = 0;
+       hip->cached_blocks = 0;
+       hip->phys_size = 0;
+       hip->fs_blocks = 0;
+       hip->rsrc_inode = NULL;
         if (S_ISDIR(inode->i_mode)) {
                 inode->i_size = 2;
-               HFSPLUS_SB(sb).folder_count++;
+               sbi->folder_count++;
                 inode->i_op = &hfsplus_dir_inode_operations;
                 inode->i_fop = &hfsplus_dir_operations;
         } else if (S_ISREG(inode->i_mode)) {
-               HFSPLUS_SB(sb).file_count++;
+               sbi->file_count++;
                 inode->i_op = &hfsplus_file_inode_operations;
                 inode->i_fop = &hfsplus_file_operations;
                 inode->i_mapping->a_ops = &hfsplus_aops;
-               HFSPLUS_I(inode).clump_blocks = HFSPLUS_SB(sb).data_clump_blocks;
+               hip->clump_blocks = sbi->data_clump_blocks;
         } else if (S_ISLNK(inode->i_mode)) {
-               HFSPLUS_SB(sb).file_count++;
+               sbi->file_count++;
                 inode->i_op = &page_symlink_inode_operations;
                 inode->i_mapping->a_ops = &hfsplus_aops;
-               HFSPLUS_I(inode).clump_blocks = 1;
+               hip->clump_blocks = 1;
         } else
-               HFSPLUS_SB(sb).file_count++;
+               sbi->file_count++;
         insert_inode_hash(inode);
         mark_inode_dirty(inode);
         sb->s_dirt = 1;
@@ -414,11 +410,11 @@ void hfsplus_delete_inode(struct inode *inode)
         struct super_block *sb = inode->i_sb;
  
         if (S_ISDIR(inode->i_mode)) {
-               HFSPLUS_SB(sb).folder_count--;
+               HFSPLUS_SB(sb)->folder_count--;
                 sb->s_dirt = 1;
                 return;
         }
-       HFSPLUS_SB(sb).file_count--;
+       HFSPLUS_SB(sb)->file_count--;
         if (S_ISREG(inode->i_mode)) {
                 if (!inode->i_nlink) {
                         inode->i_size = 0;
@@ -434,34 +430,39 @@ void hfsplus_delete_inode(struct inode *inode)
  void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
  {
         struct super_block *sb = inode->i_sb;
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+       struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
         u32 count;
         int i;
  
-       memcpy(&HFSPLUS_I(inode).first_extents, &fork->extents,
-              sizeof(hfsplus_extent_rec));
+       memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec));
         for (count = 0, i = 0; i < 8; i++)
                 count += be32_to_cpu(fork->extents[i].block_count);
-       HFSPLUS_I(inode).first_blocks = count;
-       memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec));
-       HFSPLUS_I(inode).cached_start = 0;
-       HFSPLUS_I(inode).cached_blocks = 0;
-
-       HFSPLUS_I(inode).alloc_blocks = be32_to_cpu(fork->total_blocks);
-       inode->i_size = HFSPLUS_I(inode).phys_size = be64_to_cpu(fork->total_size);
-       HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
-       inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits);
-       HFSPLUS_I(inode).clump_blocks = be32_to_cpu(fork->clump_size) >> HFSPLUS_SB(sb).alloc_blksz_shift;
-       if (!HFSPLUS_I(inode).clump_blocks)
-               HFSPLUS_I(inode).clump_blocks = HFSPLUS_IS_RSRC(inode) ? HFSPLUS_SB(sb).rsrc_clump_blocks :
-                               HFSPLUS_SB(sb).data_clump_blocks;
+       hip->first_blocks = count;
+       memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
+       hip->cached_start = 0;
+       hip->cached_blocks = 0;
+
+       hip->alloc_blocks = be32_to_cpu(fork->total_blocks);
+       hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size);
+       hip->fs_blocks =
+               (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+       inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
+       hip->clump_blocks =
+               be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift;
+       if (!hip->clump_blocks) {
+               hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ?
+                       sbi->rsrc_clump_blocks :
+                       sbi->data_clump_blocks;
+       }
  }
  
  void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
  {
-       memcpy(&fork->extents, &HFSPLUS_I(inode).first_extents,
+       memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents,
                sizeof(hfsplus_extent_rec));
         fork->total_size = cpu_to_be64(inode->i_size);
-       fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode).alloc_blocks);
+       fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks);
  }
  
  int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
@@ -472,7 +473,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
  
         type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset);
  
-       HFSPLUS_I(inode).dev = 0;
+       HFSPLUS_I(inode)->linkid = 0;
         if (type == HFSPLUS_FOLDER) {
                 struct hfsplus_cat_folder *folder = &entry.folder;
  
@@ -486,8 +487,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
                 inode->i_atime = hfsp_mt2ut(folder->access_date);
                 inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
                 inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date);
-               HFSPLUS_I(inode).create_date = folder->create_date;
-               HFSPLUS_I(inode).fs_blocks = 0;
+               HFSPLUS_I(inode)->create_date = folder->create_date;
+               HFSPLUS_I(inode)->fs_blocks = 0;
                 inode->i_op = &hfsplus_dir_inode_operations;
                 inode->i_fop = &hfsplus_dir_operations;
         } else if (type == HFSPLUS_FILE) {
@@ -518,7 +519,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
                 inode->i_atime = hfsp_mt2ut(file->access_date);
                 inode->i_mtime = hfsp_mt2ut(file->content_mod_date);
                 inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date);
-               HFSPLUS_I(inode).create_date = file->create_date;
+               HFSPLUS_I(inode)->create_date = file->create_date;
         } else {
                 printk(KERN_ERR "hfs: bad catalog entry used to create inode\n");
                 res = -EIO;
@@ -533,12 +534,12 @@ int hfsplus_cat_write_inode(struct inode *inode)
         hfsplus_cat_entry entry;
  
         if (HFSPLUS_IS_RSRC(inode))
-               main_inode = HFSPLUS_I(inode).rsrc_inode;
+               main_inode = HFSPLUS_I(inode)->rsrc_inode;
  
         if (!main_inode->i_nlink)
                 return 0;
  
-       if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb).cat_tree, &fd))
+       if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb)->cat_tree, &fd))
                 /* panic? */
                 return -EIO;
  
@@ -554,7 +555,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
                 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
                                         sizeof(struct hfsplus_cat_folder));
                 /* simple node checks? */
-               hfsplus_set_perms(inode, &folder->permissions);
+               hfsplus_cat_set_perms(inode, &folder->permissions);
                 folder->access_date = hfsp_ut2mt(inode->i_atime);
                 folder->content_mod_date = hfsp_ut2mt(inode->i_mtime);
                 folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
@@ -576,11 +577,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
                 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
                                         sizeof(struct hfsplus_cat_file));
                 hfsplus_inode_write_fork(inode, &file->data_fork);
-               if (S_ISREG(inode->i_mode))
-                       HFSPLUS_I(inode).dev = inode->i_nlink;
-               if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
-                       HFSPLUS_I(inode).dev = kdev_t_to_nr(inode->i_rdev);
-               hfsplus_set_perms(inode, &file->permissions);
+               hfsplus_cat_set_perms(inode, &file->permissions);
                 if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
                         file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
                 else
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c

index ac405f09902651838979e322931ba7a1f1441633..5b4667e08ef7789e49c274758a28d11ef86d5fde 100644 (file)
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -17,83 +17,98 @@
  #include <linux/mount.h>
  #include <linux/sched.h>
  #include <linux/xattr.h>
-#include <linux/smp_lock.h>
  #include <asm/uaccess.h>
  #include "hfsplus_fs.h"
  
-long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
  {
-       struct inode *inode = filp->f_path.dentry->d_inode;
+       struct inode *inode = file->f_path.dentry->d_inode;
+       struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+       unsigned int flags = 0;
+
+       if (inode->i_flags & S_IMMUTABLE)
+               flags |= FS_IMMUTABLE_FL;
+       if (inode->i_flags |= S_APPEND)
+               flags |= FS_APPEND_FL;
+       if (hip->userflags & HFSPLUS_FLG_NODUMP)
+               flags |= FS_NODUMP_FL;
+
+       return put_user(flags, user_flags);
+}
+
+static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
+{
+       struct inode *inode = file->f_path.dentry->d_inode;
+       struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
         unsigned int flags;
+       int err = 0;
  
-       lock_kernel();
-       switch (cmd) {
-       case HFSPLUS_IOC_EXT2_GETFLAGS:
-               flags = 0;
-               if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_IMMUTABLE)
-                       flags |= FS_IMMUTABLE_FL; /* EXT2_IMMUTABLE_FL */
-               if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_APPEND)
-                       flags |= FS_APPEND_FL; /* EXT2_APPEND_FL */
-               if (HFSPLUS_I(inode).userflags & HFSPLUS_FLG_NODUMP)
-                       flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */
-               return put_user(flags, (int __user *)arg);
-       case HFSPLUS_IOC_EXT2_SETFLAGS: {
-               int err = 0;
-               err = mnt_want_write(filp->f_path.mnt);
-               if (err) {
-                       unlock_kernel();
-                       return err;
-               }
+       err = mnt_want_write(file->f_path.mnt);
+       if (err)
+               goto out;
  
-               if (!is_owner_or_cap(inode)) {
-                       err = -EACCES;
-                       goto setflags_out;
-               }
-               if (get_user(flags, (int __user *)arg)) {
-                       err = -EFAULT;
-                       goto setflags_out;
-               }
-               if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) ||
-                   HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) {
-                       if (!capable(CAP_LINUX_IMMUTABLE)) {
-                               err = -EPERM;
-                               goto setflags_out;
-                       }
-               }
+       if (!is_owner_or_cap(inode)) {
+               err = -EACCES;
+               goto out_drop_write;
+       }
  
-               /* don't silently ignore unsupported ext2 flags */
-               if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
-                       err = -EOPNOTSUPP;
-                       goto setflags_out;
-               }
-               if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */
-                       inode->i_flags |= S_IMMUTABLE;
-                       HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE;
-               } else {
-                       inode->i_flags &= ~S_IMMUTABLE;
-                       HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
-               }
-               if (flags & FS_APPEND_FL) { /* EXT2_APPEND_FL */
-                       inode->i_flags |= S_APPEND;
-                       HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_APPEND;
-               } else {
-                       inode->i_flags &= ~S_APPEND;
-                       HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_APPEND;
+       if (get_user(flags, user_flags)) {
+               err = -EFAULT;
+               goto out_drop_write;
+       }
+
+       mutex_lock(&inode->i_mutex);
+
+       if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) ||
+           inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
+               if (!capable(CAP_LINUX_IMMUTABLE)) {
+                       err = -EPERM;
+                       goto out_unlock_inode;
                 }
-               if (flags & FS_NODUMP_FL) /* EXT2_NODUMP_FL */
-                       HFSPLUS_I(inode).userflags |= HFSPLUS_FLG_NODUMP;
-               else
-                       HFSPLUS_I(inode).userflags &= ~HFSPLUS_FLG_NODUMP;
-
-               inode->i_ctime = CURRENT_TIME_SEC;
-               mark_inode_dirty(inode);
-setflags_out:
-               mnt_drop_write(filp->f_path.mnt);
-               unlock_kernel();
-               return err;
         }
+
+       /* don't silently ignore unsupported ext2 flags */
+       if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
+               err = -EOPNOTSUPP;
+               goto out_unlock_inode;
+       }
+
+       if (flags & FS_IMMUTABLE_FL)
+               inode->i_flags |= S_IMMUTABLE;
+       else
+               inode->i_flags &= ~S_IMMUTABLE;
+
+       if (flags & FS_APPEND_FL)
+               inode->i_flags |= S_APPEND;
+       else
+               inode->i_flags &= ~S_APPEND;
+
+       if (flags & FS_NODUMP_FL)
+               hip->userflags |= HFSPLUS_FLG_NODUMP;
+       else
+               hip->userflags &= ~HFSPLUS_FLG_NODUMP;
+
+       inode->i_ctime = CURRENT_TIME_SEC;
+       mark_inode_dirty(inode);
+
+out_unlock_inode:
+       mutex_lock(&inode->i_mutex);
+out_drop_write:
+       mnt_drop_write(file->f_path.mnt);
+out:
+       return err;
+}
+
+long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+       void __user *argp = (void __user *)arg;
+
+       switch (cmd) {
+       case HFSPLUS_IOC_EXT2_GETFLAGS:
+               return hfsplus_ioctl_getflags(file, argp);
+       case HFSPLUS_IOC_EXT2_SETFLAGS:
+               return hfsplus_ioctl_setflags(file, argp);
         default:
-               unlock_kernel();
                 return -ENOTTY;
         }
  }
@@ -110,7 +125,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
         if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode))
                 return -EOPNOTSUPP;
  
-       res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd);
+       res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
         if (res)
                 return res;
         res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -153,7 +168,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
                 return -EOPNOTSUPP;
  
         if (size) {
-               res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd);
+               res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
                 if (res)
                         return res;
                 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -177,7 +192,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
                 } else
                         res = size ? -ERANGE : 4;
         } else
-               res = -ENODATA;
+               res = -EOPNOTSUPP;
  out:
         if (size)
                 hfs_find_exit(&fd);
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c

index 572628b4b07d23af08f98ca7b757c85acfbcbc0c..f9ab276a4d8de9e15d2acf49a358da1a9ff10fe6 100644 (file)
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -143,13 +143,13 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
                         kfree(p);
                         break;
                 case opt_decompose:
-                       sbi->flags &= ~HFSPLUS_SB_NODECOMPOSE;
+                       clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
                         break;
                 case opt_nodecompose:
-                       sbi->flags |= HFSPLUS_SB_NODECOMPOSE;
+                       set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
                         break;
                 case opt_force:
-                       sbi->flags |= HFSPLUS_SB_FORCE;
+                       set_bit(HFSPLUS_SB_FORCE, &sbi->flags);
                         break;
                 default:
                         return 0;
@@ -171,7 +171,7 @@ done:
  
  int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
  {
-       struct hfsplus_sb_info *sbi = &HFSPLUS_SB(mnt->mnt_sb);
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(mnt->mnt_sb);
  
         if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
                 seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
@@ -184,7 +184,7 @@ int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
                 seq_printf(seq, ",session=%u", sbi->session);
         if (sbi->nls)
                 seq_printf(seq, ",nls=%s", sbi->nls->charset);
-       if (sbi->flags & HFSPLUS_SB_NODECOMPOSE)
+       if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags))
                 seq_printf(seq, ",nodecompose");
         return 0;
  }
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c

index 1528a6fd02992f1858ee254fe01039520da77bd4..208b16c645cc234c6f5ce1b15fbd5b49ba802ee8 100644 (file)
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -74,6 +74,7 @@ struct old_pmap {
  int hfs_part_find(struct super_block *sb,
                   sector_t *part_start, sector_t *part_size)
  {
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
         struct buffer_head *bh;
         __be16 *data;
         int i, size, res;
@@ -95,7 +96,7 @@ int hfs_part_find(struct super_block *sb,
                 for (i = 0; i < size; p++, i++) {
                         if (p->pdStart && p->pdSize &&
                             p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ &&
-                           (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) {
+                           (sbi->part < 0 || sbi->part == i)) {
                                 *part_start += be32_to_cpu(p->pdStart);
                                 *part_size = be32_to_cpu(p->pdSize);
                                 res = 0;
@@ -111,7 +112,7 @@ int hfs_part_find(struct super_block *sb,
                 size = be32_to_cpu(pm->pmMapBlkCnt);
                 for (i = 0; i < size;) {
                         if (!memcmp(pm->pmPartType,"Apple_HFS", 9) &&
-                           (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) {
+                           (sbi->part < 0 || sbi->part == i)) {
                                 *part_start += be32_to_cpu(pm->pmPyPartStart);
                                 *part_size = be32_to_cpu(pm->pmPartBlkCnt);
                                 res = 0;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c

index 3b55c050c74274710fa95cad827edf6abd6b8316..9a88d7536103e2c1c70f3824b42d367d3eac848f 100644 (file)
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -12,7 +12,6 @@
  #include <linux/pagemap.h>
  #include <linux/fs.h>
  #include <linux/slab.h>
-#include <linux/smp_lock.h>
  #include <linux/vfs.h>
  #include <linux/nls.h>
  
@@ -21,40 +20,11 @@ static void hfsplus_destroy_inode(struct inode *inode);
  
  #include "hfsplus_fs.h"
  
-struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
+static int hfsplus_system_read_inode(struct inode *inode)
  {
-       struct hfs_find_data fd;
-       struct hfsplus_vh *vhdr;
-       struct inode *inode;
-       long err = -EIO;
-
-       inode = iget_locked(sb, ino);
-       if (!inode)
-               return ERR_PTR(-ENOMEM);
-       if (!(inode->i_state & I_NEW))
-               return inode;
+       struct hfsplus_vh *vhdr = HFSPLUS_SB(inode->i_sb)->s_vhdr;
  
-       INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
-       mutex_init(&HFSPLUS_I(inode).extents_lock);
-       HFSPLUS_I(inode).flags = 0;
-       HFSPLUS_I(inode).rsrc_inode = NULL;
-       atomic_set(&HFSPLUS_I(inode).opencnt, 0);
-
-       if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
-       read_inode:
-               hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd);
-               err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
-               if (!err)
-                       err = hfsplus_cat_read_inode(inode, &fd);
-               hfs_find_exit(&fd);
-               if (err)
-                       goto bad_inode;
-               goto done;
-       }
-       vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
-       switch(inode->i_ino) {
-       case HFSPLUS_ROOT_CNID:
-               goto read_inode;
+       switch (inode->i_ino) {
         case HFSPLUS_EXT_CNID:
                 hfsplus_inode_read_fork(inode, &vhdr->ext_file);
                 inode->i_mapping->a_ops = &hfsplus_btree_aops;
@@ -75,74 +45,101 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
                 inode->i_mapping->a_ops = &hfsplus_btree_aops;
                 break;
         default:
-               goto bad_inode;
+               return -EIO;
+       }
+
+       return 0;
+}
+
+struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
+{
+       struct hfs_find_data fd;
+       struct inode *inode;
+       int err;
+
+       inode = iget_locked(sb, ino);
+       if (!inode)
+               return ERR_PTR(-ENOMEM);
+       if (!(inode->i_state & I_NEW))
+               return inode;
+
+       INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list);
+       mutex_init(&HFSPLUS_I(inode)->extents_lock);
+       HFSPLUS_I(inode)->flags = 0;
+       HFSPLUS_I(inode)->rsrc_inode = NULL;
+       atomic_set(&HFSPLUS_I(inode)->opencnt, 0);
+
+       if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
+           inode->i_ino == HFSPLUS_ROOT_CNID) {
+               hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
+               err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
+               if (!err)
+                       err = hfsplus_cat_read_inode(inode, &fd);
+               hfs_find_exit(&fd);
+       } else {
+               err = hfsplus_system_read_inode(inode);
+       }
+
+       if (err) {
+               iget_failed(inode);
+               return ERR_PTR(err);
         }
  
-done:
         unlock_new_inode(inode);
         return inode;
-
-bad_inode:
-       iget_failed(inode);
-       return ERR_PTR(err);
  }
  
-static int hfsplus_write_inode(struct inode *inode,
-               struct writeback_control *wbc)
+static int hfsplus_system_write_inode(struct inode *inode)
  {
-       struct hfsplus_vh *vhdr;
-       int ret = 0;
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
+       struct hfsplus_vh *vhdr = sbi->s_vhdr;
+       struct hfsplus_fork_raw *fork;
+       struct hfs_btree *tree = NULL;
  
-       dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
-       hfsplus_ext_write_extent(inode);
-       if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
-               return hfsplus_cat_write_inode(inode);
-       }
-       vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
         switch (inode->i_ino) {
-       case HFSPLUS_ROOT_CNID:
-               ret = hfsplus_cat_write_inode(inode);
-               break;
         case HFSPLUS_EXT_CNID:
-               if (vhdr->ext_file.total_size != cpu_to_be64(inode->i_size)) {
-                       HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
-                       inode->i_sb->s_dirt = 1;
-               }
-               hfsplus_inode_write_fork(inode, &vhdr->ext_file);
-               hfs_btree_write(HFSPLUS_SB(inode->i_sb).ext_tree);
+               fork = &vhdr->ext_file;
+               tree = sbi->ext_tree;
                 break;
         case HFSPLUS_CAT_CNID:
-               if (vhdr->cat_file.total_size != cpu_to_be64(inode->i_size)) {
-                       HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
-                       inode->i_sb->s_dirt = 1;
-               }
-               hfsplus_inode_write_fork(inode, &vhdr->cat_file);
-               hfs_btree_write(HFSPLUS_SB(inode->i_sb).cat_tree);
+               fork = &vhdr->cat_file;
+               tree = sbi->cat_tree;
                 break;
         case HFSPLUS_ALLOC_CNID:
-               if (vhdr->alloc_file.total_size != cpu_to_be64(inode->i_size)) {
-                       HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
-                       inode->i_sb->s_dirt = 1;
-               }
-               hfsplus_inode_write_fork(inode, &vhdr->alloc_file);
+               fork = &vhdr->alloc_file;
                 break;
         case HFSPLUS_START_CNID:
-               if (vhdr->start_file.total_size != cpu_to_be64(inode->i_size)) {
-                       HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
-                       inode->i_sb->s_dirt = 1;
-               }
-               hfsplus_inode_write_fork(inode, &vhdr->start_file);
+               fork = &vhdr->start_file;
                 break;
         case HFSPLUS_ATTR_CNID:
-               if (vhdr->attr_file.total_size != cpu_to_be64(inode->i_size)) {
-                       HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
-                       inode->i_sb->s_dirt = 1;
-               }
-               hfsplus_inode_write_fork(inode, &vhdr->attr_file);
-               hfs_btree_write(HFSPLUS_SB(inode->i_sb).attr_tree);
-               break;
+               fork = &vhdr->attr_file;
+               tree = sbi->attr_tree;
+       default:
+               return -EIO;
+       }
+
+       if (fork->total_size != cpu_to_be64(inode->i_size)) {
+               set_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags);
+               inode->i_sb->s_dirt = 1;
         }
-       return ret;
+       hfsplus_inode_write_fork(inode, fork);
+       if (tree)
+               hfs_btree_write(tree);
+       return 0;
+}
+
+static int hfsplus_write_inode(struct inode *inode,
+               struct writeback_control *wbc)
+{
+       dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
+
+       hfsplus_ext_write_extent(inode);
+
+       if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
+           inode->i_ino == HFSPLUS_ROOT_CNID)
+               return hfsplus_cat_write_inode(inode);
+       else
+               return hfsplus_system_write_inode(inode);
  }
  
  static void hfsplus_evict_inode(struct inode *inode)
@@ -151,51 +148,53 @@ static void hfsplus_evict_inode(struct inode *inode)
         truncate_inode_pages(&inode->i_data, 0);
         end_writeback(inode);
         if (HFSPLUS_IS_RSRC(inode)) {
-               HFSPLUS_I(HFSPLUS_I(inode).rsrc_inode).rsrc_inode = NULL;
-               iput(HFSPLUS_I(inode).rsrc_inode);
+               HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
+               iput(HFSPLUS_I(inode)->rsrc_inode);
         }
  }
  
  int hfsplus_sync_fs(struct super_block *sb, int wait)
  {
-       struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+       struct hfsplus_vh *vhdr = sbi->s_vhdr;
  
         dprint(DBG_SUPER, "hfsplus_write_super\n");
  
-       lock_super(sb);
+       mutex_lock(&sbi->vh_mutex);
+       mutex_lock(&sbi->alloc_mutex);
         sb->s_dirt = 0;
  
-       vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks);
-       vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc);
-       vhdr->next_cnid = cpu_to_be32(HFSPLUS_SB(sb).next_cnid);
-       vhdr->folder_count = cpu_to_be32(HFSPLUS_SB(sb).folder_count);
-       vhdr->file_count = cpu_to_be32(HFSPLUS_SB(sb).file_count);
+       vhdr->free_blocks = cpu_to_be32(sbi->free_blocks);
+       vhdr->next_cnid = cpu_to_be32(sbi->next_cnid);
+       vhdr->folder_count = cpu_to_be32(sbi->folder_count);
+       vhdr->file_count = cpu_to_be32(sbi->file_count);
  
-       mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
-       if (HFSPLUS_SB(sb).flags & HFSPLUS_SB_WRITEBACKUP) {
-               if (HFSPLUS_SB(sb).sect_count) {
+       mark_buffer_dirty(sbi->s_vhbh);
+       if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) {
+               if (sbi->sect_count) {
                         struct buffer_head *bh;
                         u32 block, offset;
  
-                       block = HFSPLUS_SB(sb).blockoffset;
-                       block += (HFSPLUS_SB(sb).sect_count - 2) >> (sb->s_blocksize_bits - 9);
-                       offset = ((HFSPLUS_SB(sb).sect_count - 2) << 9) & (sb->s_blocksize - 1);
-                       printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n", HFSPLUS_SB(sb).blockoffset,
-                               HFSPLUS_SB(sb).sect_count, block, offset);
+                       block = sbi->blockoffset;
+                       block += (sbi->sect_count - 2) >> (sb->s_blocksize_bits - 9);
+                       offset = ((sbi->sect_count - 2) << 9) & (sb->s_blocksize - 1);
+                       printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n",
+                                         sbi->blockoffset, sbi->sect_count,
+                                         block, offset);
                         bh = sb_bread(sb, block);
                         if (bh) {
                                 vhdr = (struct hfsplus_vh *)(bh->b_data + offset);
                                 if (be16_to_cpu(vhdr->signature) == HFSPLUS_VOLHEAD_SIG) {
-                                       memcpy(vhdr, HFSPLUS_SB(sb).s_vhdr, sizeof(*vhdr));
+                                       memcpy(vhdr, sbi->s_vhdr, sizeof(*vhdr));
                                         mark_buffer_dirty(bh);
                                         brelse(bh);
                                 } else
                                         printk(KERN_WARNING "hfs: backup not found!\n");
                         }
                 }
-               HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP;
         }
-       unlock_super(sb);
+       mutex_unlock(&sbi->alloc_mutex);
+       mutex_unlock(&sbi->vh_mutex);
         return 0;
  }
  
@@ -209,48 +208,48 @@ static void hfsplus_write_super(struct super_block *sb)
  
  static void hfsplus_put_super(struct super_block *sb)
  {
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+
         dprint(DBG_SUPER, "hfsplus_put_super\n");
+
         if (!sb->s_fs_info)
                 return;
  
-       lock_kernel();
-
         if (sb->s_dirt)
                 hfsplus_write_super(sb);
-       if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) {
-               struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
+       if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) {
+               struct hfsplus_vh *vhdr = sbi->s_vhdr;
  
                 vhdr->modify_date = hfsp_now2mt();
                 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT);
                 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT);
-               mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
-               sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh);
+               mark_buffer_dirty(sbi->s_vhbh);
+               sync_dirty_buffer(sbi->s_vhbh);
         }
  
-       hfs_btree_close(HFSPLUS_SB(sb).cat_tree);
-       hfs_btree_close(HFSPLUS_SB(sb).ext_tree);
-       iput(HFSPLUS_SB(sb).alloc_file);
-       iput(HFSPLUS_SB(sb).hidden_dir);
-       brelse(HFSPLUS_SB(sb).s_vhbh);
-       unload_nls(HFSPLUS_SB(sb).nls);
+       hfs_btree_close(sbi->cat_tree);
+       hfs_btree_close(sbi->ext_tree);
+       iput(sbi->alloc_file);
+       iput(sbi->hidden_dir);
+       brelse(sbi->s_vhbh);
+       unload_nls(sbi->nls);
         kfree(sb->s_fs_info);
         sb->s_fs_info = NULL;
-
-       unlock_kernel();
  }
  
  static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
  {
         struct super_block *sb = dentry->d_sb;
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
         u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
  
         buf->f_type = HFSPLUS_SUPER_MAGIC;
         buf->f_bsize = sb->s_blocksize;
-       buf->f_blocks = HFSPLUS_SB(sb).total_blocks << HFSPLUS_SB(sb).fs_shift;
-       buf->f_bfree = HFSPLUS_SB(sb).free_blocks << HFSPLUS_SB(sb).fs_shift;
+       buf->f_blocks = sbi->total_blocks << sbi->fs_shift;
+       buf->f_bfree = sbi->free_blocks << sbi->fs_shift;
         buf->f_bavail = buf->f_bfree;
         buf->f_files = 0xFFFFFFFF;
-       buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid;
+       buf->f_ffree = 0xFFFFFFFF - sbi->next_cnid;
         buf->f_fsid.val[0] = (u32)id;
         buf->f_fsid.val[1] = (u32)(id >> 32);
         buf->f_namelen = HFSPLUS_MAX_STRLEN;
@@ -263,11 +262,11 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
         if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
                 return 0;
         if (!(*flags & MS_RDONLY)) {
-               struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
+               struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr;
                 struct hfsplus_sb_info sbi;
  
                 memset(&sbi, 0, sizeof(struct hfsplus_sb_info));
-               sbi.nls = HFSPLUS_SB(sb).nls;
+               sbi.nls = HFSPLUS_SB(sb)->nls;
                 if (!hfsplus_parse_options(data, &sbi))
                         return -EINVAL;
  
@@ -276,7 +275,7 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
                                "running fsck.hfsplus is recommended.  leaving read-only.\n");
                         sb->s_flags |= MS_RDONLY;
                         *flags |= MS_RDONLY;
-               } else if (sbi.flags & HFSPLUS_SB_FORCE) {
+               } else if (test_bit(HFSPLUS_SB_FORCE, &sbi.flags)) {
                         /* nothing */
                 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
                         printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n");
@@ -320,7 +319,8 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
                 return -ENOMEM;
  
         sb->s_fs_info = sbi;
-       INIT_HLIST_HEAD(&sbi->rsrc_inodes);
+       mutex_init(&sbi->alloc_mutex);
+       mutex_init(&sbi->vh_mutex);
         hfsplus_fill_defaults(sbi);
         if (!hfsplus_parse_options(data, sbi)) {
                 printk(KERN_ERR "hfs: unable to parse mount options\n");
@@ -344,7 +344,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
                 err = -EINVAL;
                 goto cleanup;
         }
-       vhdr = HFSPLUS_SB(sb).s_vhdr;
+       vhdr = sbi->s_vhdr;
  
         /* Copy parts of the volume header into the superblock */
         sb->s_magic = HFSPLUS_VOLHEAD_SIG;
@@ -353,18 +353,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
                 printk(KERN_ERR "hfs: wrong filesystem version\n");
                 goto cleanup;
         }
-       HFSPLUS_SB(sb).total_blocks = be32_to_cpu(vhdr->total_blocks);
-       HFSPLUS_SB(sb).free_blocks = be32_to_cpu(vhdr->free_blocks);
-       HFSPLUS_SB(sb).next_alloc = be32_to_cpu(vhdr->next_alloc);
-       HFSPLUS_SB(sb).next_cnid = be32_to_cpu(vhdr->next_cnid);
-       HFSPLUS_SB(sb).file_count = be32_to_cpu(vhdr->file_count);
-       HFSPLUS_SB(sb).folder_count = be32_to_cpu(vhdr->folder_count);
-       HFSPLUS_SB(sb).data_clump_blocks = be32_to_cpu(vhdr->data_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift;
-       if (!HFSPLUS_SB(sb).data_clump_blocks)
-               HFSPLUS_SB(sb).data_clump_blocks = 1;
-       HFSPLUS_SB(sb).rsrc_clump_blocks = be32_to_cpu(vhdr->rsrc_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift;
-       if (!HFSPLUS_SB(sb).rsrc_clump_blocks)
-               HFSPLUS_SB(sb).rsrc_clump_blocks = 1;
+       sbi->total_blocks = be32_to_cpu(vhdr->total_blocks);
+       sbi->free_blocks = be32_to_cpu(vhdr->free_blocks);
+       sbi->next_cnid = be32_to_cpu(vhdr->next_cnid);
+       sbi->file_count = be32_to_cpu(vhdr->file_count);
+       sbi->folder_count = be32_to_cpu(vhdr->folder_count);
+       sbi->data_clump_blocks =
+               be32_to_cpu(vhdr->data_clump_sz) >> sbi->alloc_blksz_shift;
+       if (!sbi->data_clump_blocks)
+               sbi->data_clump_blocks = 1;
+       sbi->rsrc_clump_blocks =
+               be32_to_cpu(vhdr->rsrc_clump_sz) >> sbi->alloc_blksz_shift;
+       if (!sbi->rsrc_clump_blocks)
+               sbi->rsrc_clump_blocks = 1;
  
         /* Set up operations so we can load metadata */
         sb->s_op = &hfsplus_sops;
@@ -374,7 +375,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
                 printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, "
                        "running fsck.hfsplus is recommended.  mounting read-only.\n");
                 sb->s_flags |= MS_RDONLY;
-       } else if (sbi->flags & HFSPLUS_SB_FORCE) {
+       } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) {
                 /* nothing */
         } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
                 printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n");
@@ -384,16 +385,15 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
                        "use the force option at your own risk, mounting read-only.\n");
                 sb->s_flags |= MS_RDONLY;
         }
-       sbi->flags &= ~HFSPLUS_SB_FORCE;
  
         /* Load metadata objects (B*Trees) */
-       HFSPLUS_SB(sb).ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
-       if (!HFSPLUS_SB(sb).ext_tree) {
+       sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
+       if (!sbi->ext_tree) {
                 printk(KERN_ERR "hfs: failed to load extents file\n");
                 goto cleanup;
         }
-       HFSPLUS_SB(sb).cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
-       if (!HFSPLUS_SB(sb).cat_tree) {
+       sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
+       if (!sbi->cat_tree) {
                 printk(KERN_ERR "hfs: failed to load catalog file\n");
                 goto cleanup;
         }
@@ -404,7 +404,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
                 err = PTR_ERR(inode);
                 goto cleanup;
         }
-       HFSPLUS_SB(sb).alloc_file = inode;
+       sbi->alloc_file = inode;
  
         /* Load the root directory */
         root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID);
@@ -423,7 +423,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
  
         str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
         str.name = HFSP_HIDDENDIR_NAME;
-       hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+       hfs_find_init(sbi->cat_tree, &fd);
         hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str);
         if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
                 hfs_find_exit(&fd);
@@ -434,7 +434,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
                         err = PTR_ERR(inode);
                         goto cleanup;
                 }
-               HFSPLUS_SB(sb).hidden_dir = inode;
+               sbi->hidden_dir = inode;
         } else
                 hfs_find_exit(&fd);
  
@@ -449,15 +449,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
         be32_add_cpu(&vhdr->write_count, 1);
         vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
         vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
-       mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
-       sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh);
+       mark_buffer_dirty(sbi->s_vhbh);
+       sync_dirty_buffer(sbi->s_vhbh);
  
-       if (!HFSPLUS_SB(sb).hidden_dir) {
+       if (!sbi->hidden_dir) {
                 printk(KERN_DEBUG "hfs: create hidden dir...\n");
-               HFSPLUS_SB(sb).hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
-               hfsplus_create_cat(HFSPLUS_SB(sb).hidden_dir->i_ino, sb->s_root->d_inode,
-                                  &str, HFSPLUS_SB(sb).hidden_dir);
-               mark_inode_dirty(HFSPLUS_SB(sb).hidden_dir);
+
+               mutex_lock(&sbi->vh_mutex);
+               sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
+               hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode,
+                                  &str, sbi->hidden_dir);
+               mutex_unlock(&sbi->vh_mutex);
+
+               mark_inode_dirty(sbi->hidden_dir);
         }
  out:
         unload_nls(sbi->nls);
@@ -486,7 +490,7 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb)
  
  static void hfsplus_destroy_inode(struct inode *inode)
  {
-       kmem_cache_free(hfsplus_inode_cachep, &HFSPLUS_I(inode));
+       kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode));
  }
  
  #define HFSPLUS_INODE_SIZE     sizeof(struct hfsplus_inode_info)
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c

index 628ccf6fa402500aa15d7d53969b0f62b6ea5188..b66d67de882c3d098d661f54cbc2b19983bab32b 100644 (file)
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -121,7 +121,7 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc)
  int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p)
  {
         const hfsplus_unichr *ip;
-       struct nls_table *nls = HFSPLUS_SB(sb).nls;
+       struct nls_table *nls = HFSPLUS_SB(sb)->nls;
         u8 *op;
         u16 cc, c0, c1;
         u16 *ce1, *ce2;
@@ -132,7 +132,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
         ustrlen = be16_to_cpu(ustr->length);
         len = *len_p;
         ce1 = NULL;
-       compose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+       compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
  
         while (ustrlen > 0) {
                 c0 = be16_to_cpu(*ip++);
@@ -246,7 +246,7 @@ out:
  static inline int asc2unichar(struct super_block *sb, const char *astr, int len,
                               wchar_t *uc)
  {
-       int size = HFSPLUS_SB(sb).nls->char2uni(astr, len, uc);
+       int size = HFSPLUS_SB(sb)->nls->char2uni(astr, len, uc);
         if (size <= 0) {
                 *uc = '?';
                 size = 1;
@@ -293,7 +293,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
         u16 *dstr, outlen = 0;
         wchar_t c;
  
-       decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+       decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
         while (outlen < HFSPLUS_MAX_STRLEN && len > 0) {
                 size = asc2unichar(sb, astr, len, &c);
  
@@ -330,8 +330,8 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
         wchar_t c;
         u16 c2;
  
-       casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD);
-       decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+       casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
+       decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
         hash = init_name_hash();
         astr = str->name;
         len = str->len;
@@ -373,8 +373,8 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
         u16 c1, c2;
         wchar_t c;
  
-       casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD);
-       decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+       casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
+       decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
         astr1 = s1->name;
         len1 = s1->len;
         astr2 = s2->name;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c

index bed78ac8f6d1f5e530a93c0afac22e1cfdf2178a..8972c20b3216941a88eb3deffa591d89ea3c114a 100644 (file)
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -65,8 +65,8 @@ static int hfsplus_get_last_session(struct super_block *sb,
         *start = 0;
         *size = sb->s_bdev->bd_inode->i_size >> 9;
  
-       if (HFSPLUS_SB(sb).session >= 0) {
-               te.cdte_track = HFSPLUS_SB(sb).session;
+       if (HFSPLUS_SB(sb)->session >= 0) {
+               te.cdte_track = HFSPLUS_SB(sb)->session;
                 te.cdte_format = CDROM_LBA;
                 res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te);
                 if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) {
@@ -87,6 +87,7 @@ static int hfsplus_get_last_session(struct super_block *sb,
  /* Takes in super block, returns true if good data read */
  int hfsplus_read_wrapper(struct super_block *sb)
  {
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
         struct buffer_head *bh;
         struct hfsplus_vh *vhdr;
         struct hfsplus_wd wd;
@@ -122,7 +123,7 @@ int hfsplus_read_wrapper(struct super_block *sb)
                 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
                         break;
                 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) {
-                       HFSPLUS_SB(sb).flags |= HFSPLUS_SB_HFSX;
+                       set_bit(HFSPLUS_SB_HFSX, &sbi->flags);
                         break;
                 }
                 brelse(bh);
@@ -143,11 +144,11 @@ int hfsplus_read_wrapper(struct super_block *sb)
         if (blocksize < HFSPLUS_SECTOR_SIZE ||
             ((blocksize - 1) & blocksize))
                 return -EINVAL;
-       HFSPLUS_SB(sb).alloc_blksz = blocksize;
-       HFSPLUS_SB(sb).alloc_blksz_shift = 0;
+       sbi->alloc_blksz = blocksize;
+       sbi->alloc_blksz_shift = 0;
         while ((blocksize >>= 1) != 0)
-               HFSPLUS_SB(sb).alloc_blksz_shift++;
-       blocksize = min(HFSPLUS_SB(sb).alloc_blksz, (u32)PAGE_SIZE);
+               sbi->alloc_blksz_shift++;
+       blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE);
  
         /* align block size to block offset */
         while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1))
@@ -158,23 +159,26 @@ int hfsplus_read_wrapper(struct super_block *sb)
                 return -EINVAL;
         }
  
-       HFSPLUS_SB(sb).blockoffset = part_start >>
-                       (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
-       HFSPLUS_SB(sb).sect_count = part_size;
-       HFSPLUS_SB(sb).fs_shift = HFSPLUS_SB(sb).alloc_blksz_shift -
-                       sb->s_blocksize_bits;
+       sbi->blockoffset =
+               part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
+       sbi->sect_count = part_size;
+       sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
  
         bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
         if (!bh)
                 return -EIO;
  
         /* should still be the same... */
-       if (vhdr->signature != (HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX ?
-                               cpu_to_be16(HFSPLUS_VOLHEAD_SIGX) :
-                               cpu_to_be16(HFSPLUS_VOLHEAD_SIG)))
-               goto error;
-       HFSPLUS_SB(sb).s_vhbh = bh;
-       HFSPLUS_SB(sb).s_vhdr = vhdr;
+       if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) {
+               if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX))
+                       goto error;
+       } else {
+               if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
+                       goto error;
+       }
+
+       sbi->s_vhbh = bh;
+       sbi->s_vhdr = vhdr;
  
         return 0;
   error:
diff --git a/include/asm-generic/hardirq.h b/include/asm-generic/hardirq.h

index 62f59080e5cc215edb843f928763d9898055071f..04d0a977cd431fc5eb2c2f34cbd5390bfeb0db05 100644 (file)
--- a/include/asm-generic/hardirq.h
+++ b/include/asm-generic/hardirq.h
@@ -3,13 +3,13 @@
  
  #include <linux/cache.h>
  #include <linux/threads.h>
-#include <linux/irq.h>
  
  typedef struct {
         unsigned int __softirq_pending;
  } ____cacheline_aligned irq_cpustat_t;
  
  #include <linux/irq_cpustat.h> /* Standard mappings for irq_cpustat_t above */
+#include <linux/irq.h>
  
  #ifndef ack_bad_irq
  static inline void ack_bad_irq(unsigned int irq)
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h

index 8a92a170fb7dfd87710bb26b3c7f384dd6801363..ef2af9948eacc9b9f27a706b2febeeb0272b0910 100644 (file)
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -220,6 +220,8 @@
                                                                         \
         BUG_TABLE                                                       \
                                                                         \
+       JUMP_TABLE                                                      \
+                                                                       \
         /* PCI quirks */                                                \
         .pci_fixup        : AT(ADDR(.pci_fixup) - LOAD_OFFSET) {        \
                 VMLINUX_SYMBOL(__start_pci_fixups_early) = .;           \
@@ -563,6 +565,14 @@
  #define BUG_TABLE
  #endif
  
+#define JUMP_TABLE                                                     \
+       . = ALIGN(8);                                                   \
+       __jump_table : AT(ADDR(__jump_table) - LOAD_OFFSET) {           \
+               VMLINUX_SYMBOL(__start___jump_table) = .;               \
+               *(__jump_table)                                         \
+               VMLINUX_SYMBOL(__stop___jump_table) = .;                \
+       }
+
  #ifdef CONFIG_PM_TRACE
  #define TRACEDATA                                                      \
         . = ALIGN(4);                                                   \
diff --git a/fs/ceph/auth.h b/include/linux/ceph/auth.h

similarity index 97%

rename from fs/ceph/auth.h

rename to include/linux/ceph/auth.h

index d38a2fb4a13762fc6e37578f840b166ee85efd16..7fff521d7eb5e00741fcf2fc97f7ce8c40fdbfc8 100644 (file)
--- a/fs/ceph/auth.h
+++ b/include/linux/ceph/auth.h
@@ -1,8 +1,8 @@
  #ifndef _FS_CEPH_AUTH_H
  #define _FS_CEPH_AUTH_H
  
-#include "types.h"
-#include "buffer.h"
+#include <linux/ceph/types.h>
+#include <linux/ceph/buffer.h>
  
  /*
   * Abstract interface for communicating with the authenticate module.
diff --git a/fs/ceph/buffer.h b/include/linux/ceph/buffer.h

similarity index 100%

rename from fs/ceph/buffer.h

rename to include/linux/ceph/buffer.h
diff --git a/fs/ceph/ceph_debug.h b/include/linux/ceph/ceph_debug.h

similarity index 86%

rename from fs/ceph/ceph_debug.h

rename to include/linux/ceph/ceph_debug.h

index 1818c2305610e28f6dbb7d3e126e7137d28c1365..aa2e19182d990eedda70de69bf638db5856811d1 100644 (file)
--- a/fs/ceph/ceph_debug.h
+++ b/include/linux/ceph/ceph_debug.h
@@ -3,7 +3,7 @@
  
  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  
-#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
+#ifdef CONFIG_CEPH_LIB_PRETTYDEBUG
  
  /*
   * wrap pr_debug to include a filename:lineno prefix on each line.
@@ -14,7 +14,8 @@
  # if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
  extern const char *ceph_file_part(const char *s, int len);
  #  define dout(fmt, ...)                                               \
-       pr_debug(" %12.12s:%-4d : " fmt,                                \
+       pr_debug("%.*s %12.12s:%-4d : " fmt,                            \
+                8 - (int)sizeof(KBUILD_MODNAME), "    ",               \
                  ceph_file_part(__FILE__, sizeof(__FILE__)),            \
                  __LINE__, ##__VA_ARGS__)
  # else
diff --git a/fs/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h

similarity index 100%

rename from fs/ceph/ceph_frag.h

rename to include/linux/ceph/ceph_frag.h
diff --git a/fs/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h

similarity index 99%

rename from fs/ceph/ceph_fs.h

rename to include/linux/ceph/ceph_fs.h

index d5619ac86711a6b41c56a00da7660099352d5327..c3c74aef289d0d34afb2758392ef144d82b59cc6 100644 (file)
--- a/fs/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -299,6 +299,7 @@ enum {
         CEPH_MDS_OP_SETATTR    = 0x01108,
         CEPH_MDS_OP_SETFILELOCK= 0x01109,
         CEPH_MDS_OP_GETFILELOCK= 0x00110,
+       CEPH_MDS_OP_SETDIRLAYOUT=0x0110a,
  
         CEPH_MDS_OP_MKNOD      = 0x01201,
         CEPH_MDS_OP_LINK       = 0x01202,
diff --git a/fs/ceph/ceph_hash.h b/include/linux/ceph/ceph_hash.h

similarity index 100%

rename from fs/ceph/ceph_hash.h

rename to include/linux/ceph/ceph_hash.h
diff --git a/include/linux/ceph/debugfs.h b/include/linux/ceph/debugfs.h

new file mode 100644 (file)

index 0000000..2a79702
--- /dev/null
+++ b/include/linux/ceph/debugfs.h
@@ -0,0 +1,33 @@
+#ifndef _FS_CEPH_DEBUGFS_H
+#define _FS_CEPH_DEBUGFS_H
+
+#include "ceph_debug.h"
+#include "types.h"
+
+#define CEPH_DEFINE_SHOW_FUNC(name)                                    \
+static int name##_open(struct inode *inode, struct file *file)         \
+{                                                                      \
+       struct seq_file *sf;                                            \
+       int ret;                                                        \
+                                                                       \
+       ret = single_open(file, name, NULL);                            \
+       sf = file->private_data;                                        \
+       sf->private = inode->i_private;                                 \
+       return ret;                                                     \
+}                                                                      \
+                                                                       \
+static const struct file_operations name##_fops = {                    \
+       .open           = name##_open,                                  \
+       .read           = seq_read,                                     \
+       .llseek         = seq_lseek,                                    \
+       .release        = single_release,                               \
+};
+
+/* debugfs.c */
+extern int ceph_debugfs_init(void);
+extern void ceph_debugfs_cleanup(void);
+extern int ceph_debugfs_client_init(struct ceph_client *client);
+extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
+
+#endif
+
diff --git a/fs/ceph/decode.h b/include/linux/ceph/decode.h

similarity index 96%

rename from fs/ceph/decode.h

rename to include/linux/ceph/decode.h

index 3d25415afe6368debff12c405463e52c7683edc6..c5b6939fb32af578501a280f73f2aa7b4276ac37 100644 (file)
--- a/fs/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -191,6 +191,11 @@ static inline void ceph_encode_string(void **p, void *end,
                 ceph_encode_need(p, end, n, bad);               \
                 ceph_encode_copy(p, pv, n);                     \
         } while (0)
+#define ceph_encode_string_safe(p, end, s, n, bad)             \
+       do {                                                    \
+               ceph_encode_need(p, end, n, bad);               \
+               ceph_encode_string(p, end, s, n);               \
+       } while (0)
  
  
  #endif
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h

new file mode 100644 (file)

index 0000000..f22b2e9
--- /dev/null
+++ b/include/linux/ceph/libceph.h
@@ -0,0 +1,249 @@
+#ifndef _FS_CEPH_LIBCEPH_H
+#define _FS_CEPH_LIBCEPH_H
+
+#include "ceph_debug.h"
+
+#include <asm/unaligned.h>
+#include <linux/backing-dev.h>
+#include <linux/completion.h>
+#include <linux/exportfs.h>
+#include <linux/fs.h>
+#include <linux/mempool.h>
+#include <linux/pagemap.h>
+#include <linux/wait.h>
+#include <linux/writeback.h>
+#include <linux/slab.h>
+
+#include "types.h"
+#include "messenger.h"
+#include "msgpool.h"
+#include "mon_client.h"
+#include "osd_client.h"
+#include "ceph_fs.h"
+
+/*
+ * Supported features
+ */
+#define CEPH_FEATURE_SUPPORTED_DEFAULT CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_REQUIRED_DEFAULT  CEPH_FEATURE_NOSRCADDR
+
+/*
+ * mount options
+ */
+#define CEPH_OPT_FSID             (1<<0)
+#define CEPH_OPT_NOSHARE          (1<<1) /* don't share client with other sbs */
+#define CEPH_OPT_MYIP             (1<<2) /* specified my ip */
+#define CEPH_OPT_NOCRC            (1<<3) /* no data crc on writes */
+
+#define CEPH_OPT_DEFAULT   (0);
+
+#define ceph_set_opt(client, opt) \
+       (client)->options->flags |= CEPH_OPT_##opt;
+#define ceph_test_opt(client, opt) \
+       (!!((client)->options->flags & CEPH_OPT_##opt))
+
+struct ceph_options {
+       int flags;
+       struct ceph_fsid fsid;
+       struct ceph_entity_addr my_addr;
+       int mount_timeout;
+       int osd_idle_ttl;
+       int osd_timeout;
+       int osd_keepalive_timeout;
+
+       /*
+        * any type that can't be simply compared or doesn't need need
+        * to be compared should go beyond this point,
+        * ceph_compare_options() should be updated accordingly
+        */
+
+       struct ceph_entity_addr *mon_addr; /* should be the first
+                                             pointer type of args */
+       int num_mon;
+       char *name;
+       char *secret;
+};
+
+/*
+ * defaults
+ */
+#define CEPH_MOUNT_TIMEOUT_DEFAULT  60
+#define CEPH_OSD_TIMEOUT_DEFAULT    60  /* seconds */
+#define CEPH_OSD_KEEPALIVE_DEFAULT  5
+#define CEPH_OSD_IDLE_TTL_DEFAULT    60
+#define CEPH_MOUNT_RSIZE_DEFAULT    (512*1024) /* readahead */
+
+#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
+#define CEPH_MSG_MAX_DATA_LEN  (16*1024*1024)
+
+#define CEPH_AUTH_NAME_DEFAULT   "guest"
+
+/*
+ * Delay telling the MDS we no longer want caps, in case we reopen
+ * the file.  Delay a minimum amount of time, even if we send a cap
+ * message for some other reason.  Otherwise, take the oppotunity to
+ * update the mds to avoid sending another message later.
+ */
+#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT      5  /* cap release delay */
+#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT     60  /* cap release delay */
+
+#define CEPH_CAP_RELEASE_SAFETY_DEFAULT        (CEPH_CAPS_PER_RELEASE * 4)
+
+/* mount state */
+enum {
+       CEPH_MOUNT_MOUNTING,
+       CEPH_MOUNT_MOUNTED,
+       CEPH_MOUNT_UNMOUNTING,
+       CEPH_MOUNT_UNMOUNTED,
+       CEPH_MOUNT_SHUTDOWN,
+};
+
+/*
+ * subtract jiffies
+ */
+static inline unsigned long time_sub(unsigned long a, unsigned long b)
+{
+       BUG_ON(time_after(b, a));
+       return (long)a - (long)b;
+}
+
+struct ceph_mds_client;
+
+/*
+ * per client state
+ *
+ * possibly shared by multiple mount points, if they are
+ * mounting the same ceph filesystem/cluster.
+ */
+struct ceph_client {
+       struct ceph_fsid fsid;
+       bool have_fsid;
+
+       void *private;
+
+       struct ceph_options *options;
+
+       struct mutex mount_mutex;      /* serialize mount attempts */
+       wait_queue_head_t auth_wq;
+       int auth_err;
+
+       int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *);
+
+       u32 supported_features;
+       u32 required_features;
+
+       struct ceph_messenger *msgr;   /* messenger instance */
+       struct ceph_mon_client monc;
+       struct ceph_osd_client osdc;
+
+#ifdef CONFIG_DEBUG_FS
+       struct dentry *debugfs_dir;
+       struct dentry *debugfs_monmap;
+       struct dentry *debugfs_osdmap;
+#endif
+};
+
+
+
+/*
+ * snapshots
+ */
+
+/*
+ * A "snap context" is the set of existing snapshots when we
+ * write data.  It is used by the OSD to guide its COW behavior.
+ *
+ * The ceph_snap_context is refcounted, and attached to each dirty
+ * page, indicating which context the dirty data belonged when it was
+ * dirtied.
+ */
+struct ceph_snap_context {
+       atomic_t nref;
+       u64 seq;
+       int num_snaps;
+       u64 snaps[];
+};
+
+static inline struct ceph_snap_context *
+ceph_get_snap_context(struct ceph_snap_context *sc)
+{
+       /*
+       printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
+              atomic_read(&sc->nref)+1);
+       */
+       if (sc)
+               atomic_inc(&sc->nref);
+       return sc;
+}
+
+static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
+{
+       if (!sc)
+               return;
+       /*
+       printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
+              atomic_read(&sc->nref)-1);
+       */
+       if (atomic_dec_and_test(&sc->nref)) {
+               /*printk(" deleting snap_context %p\n", sc);*/
+               kfree(sc);
+       }
+}
+
+/*
+ * calculate the number of pages a given length and offset map onto,
+ * if we align the data.
+ */
+static inline int calc_pages_for(u64 off, u64 len)
+{
+       return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
+               (off >> PAGE_CACHE_SHIFT);
+}
+
+/* ceph_common.c */
+extern const char *ceph_msg_type_name(int type);
+extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
+extern struct kmem_cache *ceph_inode_cachep;
+extern struct kmem_cache *ceph_cap_cachep;
+extern struct kmem_cache *ceph_dentry_cachep;
+extern struct kmem_cache *ceph_file_cachep;
+
+extern int ceph_parse_options(struct ceph_options **popt, char *options,
+                             const char *dev_name, const char *dev_name_end,
+                             int (*parse_extra_token)(char *c, void *private),
+                             void *private);
+extern void ceph_destroy_options(struct ceph_options *opt);
+extern int ceph_compare_options(struct ceph_options *new_opt,
+                               struct ceph_client *client);
+extern struct ceph_client *ceph_create_client(struct ceph_options *opt,
+                                             void *private);
+extern u64 ceph_client_id(struct ceph_client *client);
+extern void ceph_destroy_client(struct ceph_client *client);
+extern int __ceph_open_session(struct ceph_client *client,
+                              unsigned long started);
+extern int ceph_open_session(struct ceph_client *client);
+
+/* pagevec.c */
+extern void ceph_release_page_vector(struct page **pages, int num_pages);
+
+extern struct page **ceph_get_direct_page_vector(const char __user *data,
+                                           int num_pages,
+                                           loff_t off, size_t len);
+extern void ceph_put_page_vector(struct page **pages, int num_pages);
+extern void ceph_release_page_vector(struct page **pages, int num_pages);
+extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
+extern int ceph_copy_user_to_page_vector(struct page **pages,
+                                        const char __user *data,
+                                        loff_t off, size_t len);
+extern int ceph_copy_to_page_vector(struct page **pages,
+                                   const char *data,
+                                   loff_t off, size_t len);
+extern int ceph_copy_from_page_vector(struct page **pages,
+                                   char *data,
+                                   loff_t off, size_t len);
+extern int ceph_copy_page_vector_to_user(struct page **pages, char __user *data,
+                                   loff_t off, size_t len);
+extern void ceph_zero_page_vector_range(int off, int len, struct page **pages);
+
+
+#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h

similarity index 100%

rename from fs/ceph/mdsmap.h

rename to include/linux/ceph/mdsmap.h
diff --git a/fs/ceph/messenger.h b/include/linux/ceph/messenger.h

similarity index 95%

rename from fs/ceph/messenger.h

rename to include/linux/ceph/messenger.h

index 76fbc957bc137dab063cdd69abcc1b432800aa51..5956d62c3057139ac6634df8e13556659d07725e 100644 (file)
--- a/fs/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -65,6 +65,9 @@ struct ceph_messenger {
          */
         u32 global_seq;
         spinlock_t global_seq_lock;
+
+       u32 supported_features;
+       u32 required_features;
  };
  
  /*
@@ -82,6 +85,10 @@ struct ceph_msg {
         struct ceph_pagelist *pagelist; /* instead of pages */
         struct list_head list_head;
         struct kref kref;
+       struct bio  *bio;               /* instead of pages/pagelist */
+       struct bio  *bio_iter;          /* bio iterator */
+       int bio_seg;                    /* current bio segment */
+       struct ceph_pagelist *trail;    /* the trailing part of the data */
         bool front_is_vmalloc;
         bool more_to_follow;
         bool needs_out_seq;
@@ -205,7 +212,7 @@ struct ceph_connection {
  };
  
  
-extern const char *pr_addr(const struct sockaddr_storage *ss);
+extern const char *ceph_pr_addr(const struct sockaddr_storage *ss);
  extern int ceph_parse_ips(const char *c, const char *end,
                           struct ceph_entity_addr *addr,
                           int max_count, int *count);
@@ -216,7 +223,8 @@ extern void ceph_msgr_exit(void);
  extern void ceph_msgr_flush(void);
  
  extern struct ceph_messenger *ceph_messenger_create(
-       struct ceph_entity_addr *myaddr);
+       struct ceph_entity_addr *myaddr,
+       u32 features, u32 required);
  extern void ceph_messenger_destroy(struct ceph_messenger *);
  
  extern void ceph_con_init(struct ceph_messenger *msgr,
diff --git a/fs/ceph/mon_client.h b/include/linux/ceph/mon_client.h

similarity index 99%

rename from fs/ceph/mon_client.h

rename to include/linux/ceph/mon_client.h

index 8e396f2c09637f29d0a82311244d036441a51c1c..545f85917780ab3cd65a314553f7011b655dbb50 100644 (file)
--- a/fs/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -79,6 +79,7 @@ struct ceph_mon_client {
         u64 last_tid;
  
         /* mds/osd map */
+       int want_mdsmap;
         int want_next_osdmap; /* 1 = want, 2 = want+asked */
         u32 have_osdmap, have_mdsmap;
  
diff --git a/fs/ceph/msgpool.h b/include/linux/ceph/msgpool.h

similarity index 100%

rename from fs/ceph/msgpool.h

rename to include/linux/ceph/msgpool.h
diff --git a/fs/ceph/msgr.h b/include/linux/ceph/msgr.h

similarity index 100%

rename from fs/ceph/msgr.h

rename to include/linux/ceph/msgr.h
diff --git a/fs/ceph/osd_client.h b/include/linux/ceph/osd_client.h

similarity index 76%

rename from fs/ceph/osd_client.h

rename to include/linux/ceph/osd_client.h

index ce776989ef6a76d73d1475ce2335ce558b653e2d..6c91fb032c39cd907a3ac509db75a3a860e564c1 100644 (file)
--- a/fs/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -15,6 +15,7 @@ struct ceph_snap_context;
  struct ceph_osd_request;
  struct ceph_osd_client;
  struct ceph_authorizer;
+struct ceph_pagelist;
  
  /*
   * completion callback for async writepages
@@ -68,6 +69,7 @@ struct ceph_osd_request {
         struct list_head  r_unsafe_item;
  
         struct inode *r_inode;                /* for use by callbacks */
+       void *r_priv;                         /* ditto */
  
         char              r_oid[40];          /* object name */
         int               r_oid_len;
@@ -80,6 +82,11 @@ struct ceph_osd_request {
         struct page     **r_pages;            /* pages for data payload */
         int               r_pages_from_pool;
         int               r_own_pages;        /* if true, i own page list */
+#ifdef CONFIG_BLOCK
+       struct bio       *r_bio;              /* instead of pages */
+#endif
+
+       struct ceph_pagelist *r_trail;        /* trailing part of the data */
  };
  
  struct ceph_osd_client {
@@ -110,6 +117,42 @@ struct ceph_osd_client {
         struct ceph_msgpool     msgpool_op_reply;
  };
  
+struct ceph_osd_req_op {
+       u16 op;           /* CEPH_OSD_OP_* */
+       u32 flags;        /* CEPH_OSD_FLAG_* */
+       union {
+               struct {
+                       u64 offset, length;
+                       u64 truncate_size;
+                       u32 truncate_seq;
+               } extent;
+               struct {
+                       const char *name;
+                       u32 name_len;
+                       const char  *val;
+                       u32 value_len;
+                       __u8 cmp_op;       /* CEPH_OSD_CMPXATTR_OP_* */
+                       __u8 cmp_mode;     /* CEPH_OSD_CMPXATTR_MODE_* */
+               } xattr;
+               struct {
+                       const char *class_name;
+                       __u8 class_len;
+                       const char *method_name;
+                       __u8 method_len;
+                       __u8 argc;
+                       const char *indata;
+                       u32 indata_len;
+               } cls;
+               struct {
+                       u64 cookie, count;
+               } pgls;
+               struct {
+                       u64 snapid;
+               } snap;
+       };
+       u32 payload_len;
+};
+
  extern int ceph_osdc_init(struct ceph_osd_client *osdc,
                           struct ceph_client *client);
  extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
@@ -119,6 +162,30 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
  extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
                                  struct ceph_msg *msg);
  
+extern void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
+                       struct ceph_file_layout *layout,
+                       u64 snapid,
+                       u64 off, u64 *plen, u64 *bno,
+                       struct ceph_osd_request *req,
+                       struct ceph_osd_req_op *op);
+
+extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
+                                              int flags,
+                                              struct ceph_snap_context *snapc,
+                                              struct ceph_osd_req_op *ops,
+                                              bool use_mempool,
+                                              gfp_t gfp_flags,
+                                              struct page **pages,
+                                              struct bio *bio);
+
+extern void ceph_osdc_build_request(struct ceph_osd_request *req,
+                                   u64 off, u64 *plen,
+                                   struct ceph_osd_req_op *src_ops,
+                                   struct ceph_snap_context *snapc,
+                                   struct timespec *mtime,
+                                   const char *oid,
+                                   int oid_len);
+
  extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
                                       struct ceph_file_layout *layout,
                                       struct ceph_vino vino,
diff --git a/fs/ceph/osdmap.h b/include/linux/ceph/osdmap.h

similarity index 97%

rename from fs/ceph/osdmap.h

rename to include/linux/ceph/osdmap.h

index 970b547e510dbd0edb8ba8c2ec95c45932be6106..ba4c205cbb016a141495e2e872ee6a3cfd57253a 100644 (file)
--- a/fs/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -4,7 +4,7 @@
  #include <linux/rbtree.h>
  #include "types.h"
  #include "ceph_fs.h"
-#include "crush/crush.h"
+#include <linux/crush/crush.h>
  
  /*
   * The osd map describes the current membership of the osd cluster and
@@ -125,4 +125,6 @@ extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
  extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
                                 struct ceph_pg pgid);
  
+extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
+
  #endif
diff --git a/fs/ceph/pagelist.h b/include/linux/ceph/pagelist.h

similarity index 62%

rename from fs/ceph/pagelist.h

rename to include/linux/ceph/pagelist.h

index e8a4187e1087aa7fbc86a2620764f87f234dfe0a..9660d6b0a35d780dbe0806edf6193d1f7d5ef8f5 100644 (file)
--- a/fs/ceph/pagelist.h
+++ b/include/linux/ceph/pagelist.h
@@ -8,6 +8,14 @@ struct ceph_pagelist {
         void *mapped_tail;
         size_t length;
         size_t room;
+       struct list_head free_list;
+       size_t num_pages_free;
+};
+
+struct ceph_pagelist_cursor {
+       struct ceph_pagelist *pl;   /* pagelist, for error checking */
+       struct list_head *page_lru; /* page in list */
+       size_t room;                /* room remaining to reset to */
  };
  
  static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
@@ -16,10 +24,23 @@ static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
         pl->mapped_tail = NULL;
         pl->length = 0;
         pl->room = 0;
+       INIT_LIST_HEAD(&pl->free_list);
+       pl->num_pages_free = 0;
  }
+
  extern int ceph_pagelist_release(struct ceph_pagelist *pl);
  
-extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
+extern int ceph_pagelist_append(struct ceph_pagelist *pl, const void *d, size_t l);
+
+extern int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space);
+
+extern int ceph_pagelist_free_reserve(struct ceph_pagelist *pl);
+
+extern void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
+                                    struct ceph_pagelist_cursor *c);
+
+extern int ceph_pagelist_truncate(struct ceph_pagelist *pl,
+                                 struct ceph_pagelist_cursor *c);
  
  static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
  {
diff --git a/fs/ceph/rados.h b/include/linux/ceph/rados.h

similarity index 100%

rename from fs/ceph/rados.h

rename to include/linux/ceph/rados.h
diff --git a/fs/ceph/types.h b/include/linux/ceph/types.h

similarity index 100%

rename from fs/ceph/types.h

rename to include/linux/ceph/types.h
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h

index 0c991023ee475fad85136a04b00cb8d2699a382b..709dfb901d1124c75656fb0f7591826683bd2c5d 100644 (file)
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -75,7 +75,7 @@ struct cgroup_subsys_state {
  
         unsigned long flags;
         /* ID for this css, if possible */
-       struct css_id *id;
+       struct css_id __rcu *id;
  };
  
  /* bits in struct cgroup_subsys_state flags field */
@@ -205,7 +205,7 @@ struct cgroup {
         struct list_head children;      /* my children */
  
         struct cgroup *parent;          /* my parent */
-       struct dentry *dentry;          /* cgroup fs entry, RCU protected */
+       struct dentry __rcu *dentry;    /* cgroup fs entry, RCU protected */
  
         /* Private pointers for each registered subsystem */
         struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
diff --git a/include/linux/compiler.h b/include/linux/compiler.h

index c1a62c56a660226b1592bc6bc269087adac58649..320d6c94ff848d5db94fb1fd76576501a88e9a3a 100644 (file)
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -16,7 +16,11 @@
  # define __release(x)  __context__(x,-1)
  # define __cond_lock(x,c)      ((c) ? ({ __acquire(x); 1; }) : 0)
  # define __percpu      __attribute__((noderef, address_space(3)))
+#ifdef CONFIG_SPARSE_RCU_POINTER
+# define __rcu         __attribute__((noderef, address_space(4)))
+#else
  # define __rcu
+#endif
  extern void __chk_user_ptr(const volatile void __user *);
  extern void __chk_io_ptr(const volatile void __iomem *);
  #else
diff --git a/include/linux/coredump.h b/include/linux/coredump.h

index 8ba66a9d9022c7d0ddbbcb984b8839dabb2b91f8..ba4b85a6d9b8bc71853de37df2a36b865707b745 100644 (file)
--- a/include/linux/coredump.h
+++ b/include/linux/coredump.h
@@ -9,37 +9,7 @@
   * These are the only things you should do on a core-file: use only these
   * functions to write out all the necessary info.
   */
-static inline int dump_write(struct file *file, const void *addr, int nr)
-{
-       return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
-}
-
-static inline int dump_seek(struct file *file, loff_t off)
-{
-       int ret = 1;
-
-       if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
-               if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
-                       return 0;
-       } else {
-               char *buf = (char *)get_zeroed_page(GFP_KERNEL);
-
-               if (!buf)
-                       return 0;
-               while (off > 0) {
-                       unsigned long n = off;
-
-                       if (n > PAGE_SIZE)
-                               n = PAGE_SIZE;
-                       if (!dump_write(file, buf, n)) {
-                               ret = 0;
-                               break;
-                       }
-                       off -= n;
-               }
-               free_page((unsigned long)buf);
-       }
-       return ret;
-}
+extern int dump_write(struct file *file, const void *addr, int nr);
+extern int dump_seek(struct file *file, loff_t off);
  
  #endif /* _LINUX_COREDUMP_H */
diff --git a/include/linux/cred.h b/include/linux/cred.h

index 4d2c39573f3694cdeef07d83c6f141b4804a5f36..4aaeab3764469961f1106d988e57e58a91e1a16e 100644 (file)
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -84,7 +84,7 @@ struct thread_group_cred {
         atomic_t        usage;
         pid_t           tgid;                   /* thread group process ID */
         spinlock_t      lock;
-       struct key      *session_keyring;       /* keyring inherited over fork */
+       struct key __rcu *session_keyring;      /* keyring inherited over fork */
         struct key      *process_keyring;       /* keyring private to this process */
         struct rcu_head rcu;                    /* RCU deletion hook */
  };
diff --git a/fs/ceph/crush/crush.h b/include/linux/crush/crush.h

similarity index 100%

rename from fs/ceph/crush/crush.h

rename to include/linux/crush/crush.h
diff --git a/fs/ceph/crush/hash.h b/include/linux/crush/hash.h

similarity index 100%

rename from fs/ceph/crush/hash.h

rename to include/linux/crush/hash.h
diff --git a/fs/ceph/crush/mapper.h b/include/linux/crush/mapper.h

similarity index 100%

rename from fs/ceph/crush/mapper.h

rename to include/linux/crush/mapper.h
diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h

index 29b3ce3f2a1d0cd948523d8890c0bb8420834e09..2833452ea01c8ecec5207a7a8631da3f2b439fa3 100644 (file)
--- a/include/linux/debug_locks.h
+++ b/include/linux/debug_locks.h
@@ -49,7 +49,6 @@ struct task_struct;
  
  #ifdef CONFIG_LOCKDEP
  extern void debug_show_all_locks(void);
-extern void __debug_show_held_locks(struct task_struct *task);
  extern void debug_show_held_locks(struct task_struct *task);
  extern void debug_check_no_locks_freed(const void *from, unsigned long len);
  extern void debug_check_no_locks_held(struct task_struct *task);
@@ -58,10 +57,6 @@ static inline void debug_show_all_locks(void)
  {
  }
  
-static inline void __debug_show_held_locks(struct task_struct *task)
-{
-}
-
  static inline void debug_show_held_locks(struct task_struct *task)
  {
  }
diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h

index 52c0da4bdd18fe5969fbf9c583652c0f7f00ba07..bef3cda44c4c4bb54570600a99fb0003db7038e7 100644 (file)
--- a/include/linux/dynamic_debug.h
+++ b/include/linux/dynamic_debug.h
@@ -1,6 +1,8 @@
  #ifndef _DYNAMIC_DEBUG_H
  #define _DYNAMIC_DEBUG_H
  
+#include <linux/jump_label.h>
+
  /* dynamic_printk_enabled, and dynamic_printk_enabled2 are bitmasks in which
   * bit n is set to 1 if any modname hashes into the bucket n, 0 otherwise. They
   * use independent hash functions, to reduce the chance of false positives.
@@ -22,8 +24,6 @@ struct _ddebug {
         const char *function;
         const char *filename;
         const char *format;
-       char primary_hash;
-       char secondary_hash;
         unsigned int lineno:24;
         /*
          * The flags field controls the behaviour at the callsite.
@@ -33,6 +33,7 @@ struct _ddebug {
  #define _DPRINTK_FLAGS_PRINT   (1<<0)  /* printk() a message using the format */
  #define _DPRINTK_FLAGS_DEFAULT 0
         unsigned int flags:8;
+       char enabled;
  } __attribute__((aligned(8)));
  
  
@@ -42,33 +43,35 @@ int ddebug_add_module(struct _ddebug *tab, unsigned int n,
  #if defined(CONFIG_DYNAMIC_DEBUG)
  extern int ddebug_remove_module(const char *mod_name);
  
-#define __dynamic_dbg_enabled(dd)  ({       \
-       int __ret = 0;                                                       \
-       if (unlikely((dynamic_debug_enabled & (1LL << DEBUG_HASH)) &&        \
-                       (dynamic_debug_enabled2 & (1LL << DEBUG_HASH2))))   \
-                               if (unlikely(dd.flags))                      \
-                                       __ret = 1;                           \
-       __ret; })
-
  #define dynamic_pr_debug(fmt, ...) do {                                        \
+       __label__ do_printk;                                            \
+       __label__ out;                                                  \
         static struct _ddebug descriptor                                \
         __used                                                          \
         __attribute__((section("__verbose"), aligned(8))) =             \
-       { KBUILD_MODNAME, __func__, __FILE__, fmt, DEBUG_HASH,  \
-               DEBUG_HASH2, __LINE__, _DPRINTK_FLAGS_DEFAULT };        \
-       if (__dynamic_dbg_enabled(descriptor))                          \
-               printk(KERN_DEBUG pr_fmt(fmt),  ##__VA_ARGS__);         \
+       { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__,            \
+               _DPRINTK_FLAGS_DEFAULT };                               \
+       JUMP_LABEL(&descriptor.enabled, do_printk);                     \
+       goto out;                                                       \
+do_printk:                                                             \
+       printk(KERN_DEBUG pr_fmt(fmt),  ##__VA_ARGS__);                 \
+out:   ;                                                               \
         } while (0)
  
  
  #define dynamic_dev_dbg(dev, fmt, ...) do {                            \
+       __label__ do_printk;                                            \
+       __label__ out;                                                  \
         static struct _ddebug descriptor                                \
         __used                                                          \
         __attribute__((section("__verbose"), aligned(8))) =             \
-       { KBUILD_MODNAME, __func__, __FILE__, fmt, DEBUG_HASH,  \
-               DEBUG_HASH2, __LINE__, _DPRINTK_FLAGS_DEFAULT };        \
-       if (__dynamic_dbg_enabled(descriptor))                          \
-               dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__);        \
+       { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__,            \
+               _DPRINTK_FLAGS_DEFAULT };                               \
+       JUMP_LABEL(&descriptor.enabled, do_printk);                     \
+       goto out;                                                       \
+do_printk:                                                             \
+       dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__);                \
+out:   ;                                                               \
         } while (0)
  
  #else
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h

index f59ed297b661fceb086a6ca188b54c3ff0006f85..133c0ba25e306a68199d399cf26fc15c242d347a 100644 (file)
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -31,7 +31,7 @@ struct embedded_fd_set {
  
  struct fdtable {
         unsigned int max_fds;
-       struct file ** fd;      /* current fd array */
+       struct file __rcu **fd;      /* current fd array */
         fd_set *close_on_exec;
         fd_set *open_fds;
         struct rcu_head rcu;
@@ -46,7 +46,7 @@ struct files_struct {
     * read mostly part
     */
         atomic_t count;
-       struct fdtable *fdt;
+       struct fdtable __rcu *fdt;
         struct fdtable fdtab;
    /*
     * written part on a separate cache line in SMP
@@ -55,7 +55,7 @@ struct files_struct {
         int next_fd;
         struct embedded_fd_set close_on_exec_init;
         struct embedded_fd_set open_fds_init;
-       struct file * fd_array[NR_OPEN_DEFAULT];
+       struct file __rcu * fd_array[NR_OPEN_DEFAULT];
  };
  
  #define rcu_dereference_check_fdtable(files, fdtfd) \
diff --git a/include/linux/fs.h b/include/linux/fs.h

index 63d069bd80b702c392d4052bf2be937f90763692..3168dcfb94f2bedfa02fe42b7345055d3bb154e2 100644 (file)
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1384,7 +1384,7 @@ struct super_block {
          * Saved mount options for lazy filesystems using
          * generic_show_options()
          */
-       char *s_options;
+       char __rcu *s_options;
  };
  
  extern struct timespec current_fs_time(struct super_block *sb);
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h

index 02b8b24f8f51f0e37156731ba94da19ba2d19131..8beabb958f61d5147c8893f1e780415a91fcb2e6 100644 (file)
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -191,8 +191,8 @@ struct ftrace_event_call {
         unsigned int            flags;
  
  #ifdef CONFIG_PERF_EVENTS
-       int                     perf_refcount;
-       struct hlist_head       *perf_events;
+       int                             perf_refcount;
+       struct hlist_head __percpu      *perf_events;
  #endif
  };
  
@@ -252,8 +252,8 @@ DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
  
  extern int  perf_trace_init(struct perf_event *event);
  extern void perf_trace_destroy(struct perf_event *event);
-extern int  perf_trace_enable(struct perf_event *event);
-extern void perf_trace_disable(struct perf_event *event);
+extern int  perf_trace_add(struct perf_event *event, int flags);
+extern void perf_trace_del(struct perf_event *event, int flags);
  extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
                                      char *filter_str);
  extern void ftrace_profile_free_filter(struct perf_event *event);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h

index 5f2f4c4d8fb0594720bfc1643cec75fbf12aaa36..af3f06b41dc1e520a7729eb10e73f22b26ed8dff 100644 (file)
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -129,8 +129,8 @@ struct blk_scsi_cmd_filter {
  struct disk_part_tbl {
         struct rcu_head rcu_head;
         int len;
-       struct hd_struct *last_lookup;
-       struct hd_struct *part[];
+       struct hd_struct __rcu *last_lookup;
+       struct hd_struct __rcu *part[];
  };
  
  struct gendisk {
@@ -149,7 +149,7 @@ struct gendisk {
          * non-critical accesses use RCU.  Always access through
          * helpers.
          */
-       struct disk_part_tbl *part_tbl;
+       struct disk_part_tbl __rcu *part_tbl;
         struct hd_struct part0;
  
         const struct block_device_operations *fops;
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h

index ff43e9268449c20c333e3ba5724f62dab2a6429a..96c323ac44dfea066a48641cf10b3093a1cfb63c 100644 (file)
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -146,7 +146,7 @@ extern void account_system_vtime(struct task_struct *tsk);
  #endif
  
  #if defined(CONFIG_NO_HZ)
-#if defined(CONFIG_TINY_RCU)
+#if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU)
  extern void rcu_enter_nohz(void);
  extern void rcu_exit_nohz(void);
  
diff --git a/include/linux/idr.h b/include/linux/idr.h

index e968db71e33a94548160cb38c216289ab1577c30..cdb715e58e3e6de8b2e60485e31ce79ea99da8f4 100644 (file)
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -50,14 +50,14 @@
  
  struct idr_layer {
         unsigned long            bitmap; /* A zero bit means "space here" */
-       struct idr_layer        *ary[1<<IDR_BITS];
+       struct idr_layer __rcu  *ary[1<<IDR_BITS];
         int                      count;  /* When zero, we can release it */
         int                      layer;  /* distance from leaf */
         struct rcu_head          rcu_head;
  };
  
  struct idr {
-       struct idr_layer *top;
+       struct idr_layer __rcu *top;
         struct idr_layer *id_free;
         int               layers; /* only valid without concurrent changes */
         int               id_free_cnt;
diff --git a/include/linux/init_task.h b/include/linux/init_task.h

index 1f43fa56f6001f821736e4b66e9fac5e6e0375ed..2fea6c8ef6babea0564ccf3b061698cacba1d14e 100644 (file)
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -82,11 +82,17 @@ extern struct group_info init_groups;
  # define CAP_INIT_BSET  CAP_FULL_SET
  
  #ifdef CONFIG_TREE_PREEMPT_RCU
+#define INIT_TASK_RCU_TREE_PREEMPT()                                   \
+       .rcu_blocked_node = NULL,
+#else
+#define INIT_TASK_RCU_TREE_PREEMPT(tsk)
+#endif
+#ifdef CONFIG_PREEMPT_RCU
  #define INIT_TASK_RCU_PREEMPT(tsk)                                     \
         .rcu_read_lock_nesting = 0,                                     \
         .rcu_read_unlock_special = 0,                                   \
-       .rcu_blocked_node = NULL,                                       \
-       .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry),
+       .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry),           \
+       INIT_TASK_RCU_TREE_PREEMPT()
  #else
  #define INIT_TASK_RCU_PREEMPT(tsk)
  #endif
@@ -137,8 +143,8 @@ extern struct cred init_cred;
         .children       = LIST_HEAD_INIT(tsk.children),                 \
         .sibling        = LIST_HEAD_INIT(tsk.sibling),                  \
         .group_leader   = &tsk,                                         \
-       .real_cred      = &init_cred,                                   \
-       .cred           = &init_cred,                                   \
+       RCU_INIT_POINTER(.real_cred, &init_cred),                       \
+       RCU_INIT_POINTER(.cred, &init_cred),                            \
         .cred_guard_mutex =                                             \
                  __MUTEX_INITIALIZER(tsk.cred_guard_mutex),             \
         .comm           = "swapper",                                    \
diff --git a/include/linux/input.h b/include/linux/input.h

index 896a92227bc429a9abdd27da5e3b0d77be122d95..d6ae1761be97fae4cbbc66602d19d9bdd60f3bdd 100644 (file)
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -1196,7 +1196,7 @@ struct input_dev {
         int (*flush)(struct input_dev *dev, struct file *file);
         int (*event)(struct input_dev *dev, unsigned int type, unsigned int code, int value);
  
-       struct input_handle *grab;
+       struct input_handle __rcu *grab;
  
         spinlock_t event_lock;
         struct mutex mutex;
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h

index a0384a4d1e6f4da4d39a02c0f8ba6634842dd236..531495db17081efabcb649a3740673f03a531e67 100644 (file)
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -18,6 +18,7 @@
  #include <asm/atomic.h>
  #include <asm/ptrace.h>
  #include <asm/system.h>
+#include <trace/events/irq.h>
  
  /*
   * These correspond to the IORESOURCE_IRQ_* defines in
@@ -407,7 +408,12 @@ asmlinkage void do_softirq(void);
  asmlinkage void __do_softirq(void);
  extern void open_softirq(int nr, void (*action)(struct softirq_action *));
  extern void softirq_init(void);
-#define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0)
+static inline void __raise_softirq_irqoff(unsigned int nr)
+{
+       trace_softirq_raise((struct softirq_action *)(unsigned long)nr, NULL);
+       or_softirq_pending(1UL << nr);
+}
+
  extern void raise_softirq_irqoff(unsigned int nr);
  extern void raise_softirq(unsigned int nr);
  extern void wakeup_softirqd(void);
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h

index 64d5291330312ac718ba7f649e3d428063127f39..3e70b21884a948880f90e28684e5d5778e7a3d35 100644 (file)
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -53,7 +53,7 @@ struct io_context {
  
         struct radix_tree_root radix_root;
         struct hlist_head cic_list;
-       void *ioc_data;
+       void __rcu *ioc_data;
  };
  
  static inline struct io_context *ioc_task_link(struct io_context *ioc)
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h

new file mode 100644 (file)

index 0000000..4fa09d4
--- /dev/null
+++ b/include/linux/irq_work.h
@@ -0,0 +1,20 @@
+#ifndef _LINUX_IRQ_WORK_H
+#define _LINUX_IRQ_WORK_H
+
+struct irq_work {
+       struct irq_work *next;
+       void (*func)(struct irq_work *);
+};
+
+static inline
+void init_irq_work(struct irq_work *entry, void (*func)(struct irq_work *))
+{
+       entry->next = NULL;
+       entry->func = func;
+}
+
+bool irq_work_queue(struct irq_work *entry);
+void irq_work_run(void);
+void irq_work_sync(struct irq_work *entry);
+
+#endif /* _LINUX_IRQ_WORK_H */
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h

new file mode 100644 (file)

index 0000000..b67cb18
--- /dev/null
+++ b/include/linux/jump_label.h
@@ -0,0 +1,74 @@
+#ifndef _LINUX_JUMP_LABEL_H
+#define _LINUX_JUMP_LABEL_H
+
+#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_HAVE_ARCH_JUMP_LABEL)
+# include <asm/jump_label.h>
+# define HAVE_JUMP_LABEL
+#endif
+
+enum jump_label_type {
+       JUMP_LABEL_ENABLE,
+       JUMP_LABEL_DISABLE
+};
+
+struct module;
+
+#ifdef HAVE_JUMP_LABEL
+
+extern struct jump_entry __start___jump_table[];
+extern struct jump_entry __stop___jump_table[];
+
+extern void arch_jump_label_transform(struct jump_entry *entry,
+                                enum jump_label_type type);
+extern void arch_jump_label_text_poke_early(jump_label_t addr);
+extern void jump_label_update(unsigned long key, enum jump_label_type type);
+extern void jump_label_apply_nops(struct module *mod);
+extern int jump_label_text_reserved(void *start, void *end);
+
+#define jump_label_enable(key) \
+       jump_label_update((unsigned long)key, JUMP_LABEL_ENABLE);
+
+#define jump_label_disable(key) \
+       jump_label_update((unsigned long)key, JUMP_LABEL_DISABLE);
+
+#else
+
+#define JUMP_LABEL(key, label)                 \
+do {                                           \
+       if (unlikely(*key))                     \
+               goto label;                     \
+} while (0)
+
+#define jump_label_enable(cond_var)    \
+do {                                   \
+       *(cond_var) = 1;                        \
+} while (0)
+
+#define jump_label_disable(cond_var)   \
+do {                                   \
+       *(cond_var) = 0;                        \
+} while (0)
+
+static inline int jump_label_apply_nops(struct module *mod)
+{
+       return 0;
+}
+
+static inline int jump_label_text_reserved(void *start, void *end)
+{
+       return 0;
+}
+
+#endif
+
+#define COND_STMT(key, stmt)                                   \
+do {                                                           \
+       __label__ jl_enabled;                                   \
+       JUMP_LABEL(key, jl_enabled);                            \
+       if (0) {                                                \
+jl_enabled:                                                    \
+               stmt;                                           \
+       }                                                       \
+} while (0)
+
+#endif
diff --git a/include/linux/jump_label_ref.h b/include/linux/jump_label_ref.h

new file mode 100644 (file)

index 0000000..e5d012a
--- /dev/null
+++ b/include/linux/jump_label_ref.h
@@ -0,0 +1,44 @@
+#ifndef _LINUX_JUMP_LABEL_REF_H
+#define _LINUX_JUMP_LABEL_REF_H
+
+#include <linux/jump_label.h>
+#include <asm/atomic.h>
+
+#ifdef HAVE_JUMP_LABEL
+
+static inline void jump_label_inc(atomic_t *key)
+{
+       if (atomic_add_return(1, key) == 1)
+               jump_label_enable(key);
+}
+
+static inline void jump_label_dec(atomic_t *key)
+{
+       if (atomic_dec_and_test(key))
+               jump_label_disable(key);
+}
+
+#else /* !HAVE_JUMP_LABEL */
+
+static inline void jump_label_inc(atomic_t *key)
+{
+       atomic_inc(key);
+}
+
+static inline void jump_label_dec(atomic_t *key)
+{
+       atomic_dec(key);
+}
+
+#undef JUMP_LABEL
+#define JUMP_LABEL(key, label)                                         \
+do {                                                                   \
+       if (unlikely(__builtin_choose_expr(                             \
+             __builtin_types_compatible_p(typeof(key), atomic_t *),    \
+             atomic_read((atomic_t *)(key)), *(key))))                 \
+               goto label;                                             \
+} while (0)
+
+#endif /* HAVE_JUMP_LABEL */
+
+#endif /* _LINUX_JUMP_LABEL_REF_H */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h

index 2b0a35e6bc691896609944c114328b414be37a12..1759ba5adce845fa08d4b1fd899920ac65436938 100644 (file)
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -58,7 +58,18 @@ extern const char linux_proc_banner[];
  
  #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f))
  #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
-#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
+#define roundup(x, y) (                                        \
+{                                                      \
+       typeof(y) __y = y;                              \
+       (((x) + (__y - 1)) / __y) * __y;                \
+}                                                      \
+)
+#define rounddown(x, y) (                              \
+{                                                      \
+       typeof(x) __x = (x);                            \
+       __x - (__x % (y));                              \
+}                                                      \
+)
  #define DIV_ROUND_CLOSEST(x, divisor)(                 \
  {                                                      \
         typeof(divisor) __divisor = divisor;            \
diff --git a/include/linux/key.h b/include/linux/key.h

index cd50dfa1d4c224de2a26e2d8ef3926d7fc74435e..3db0adce1fdabd00d034ad2e111b9d3411146dfd 100644 (file)
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -178,8 +178,9 @@ struct key {
          */
         union {
                 unsigned long           value;
+               void __rcu              *rcudata;
                 void                    *data;
-               struct keyring_list     *subscriptions;
+               struct keyring_list __rcu *subscriptions;
         } payload;
  };
  
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h

index c13cc48697aa73d2f8daa55dcbaf94204ff3db38..ac740b26eb1071950a26ce64ddca68f8212e8742 100644 (file)
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -205,7 +205,7 @@ struct kvm {
  
         struct mutex irq_lock;
  #ifdef CONFIG_HAVE_KVM_IRQCHIP
-       struct kvm_irq_routing_table *irq_routing;
+       struct kvm_irq_routing_table __rcu *irq_routing;
         struct hlist_head mask_notifier_list;
         struct hlist_head irq_ack_notifier_list;
  #endif
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h

index 06aed8305bf3bc38061dc42c168c3b9d83f29492..2186a64ee4b568f09649347b5bf04e5ed7a66cdb 100644 (file)
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -31,6 +31,17 @@ extern int lock_stat;
  
  #define MAX_LOCKDEP_SUBCLASSES         8UL
  
+/*
+ * NR_LOCKDEP_CACHING_CLASSES ... Number of classes
+ * cached in the instance of lockdep_map
+ *
+ * Currently main class (subclass == 0) and signle depth subclass
+ * are cached in lockdep_map. This optimization is mainly targeting
+ * on rq->lock. double_rq_lock() acquires this highly competitive with
+ * single depth.
+ */
+#define NR_LOCKDEP_CACHING_CLASSES     2
+
  /*
   * Lock-classes are keyed via unique addresses, by embedding the
   * lockclass-key into the kernel (or module) .data section. (For
@@ -138,7 +149,7 @@ void clear_lock_stats(struct lock_class *class);
   */
  struct lockdep_map {
         struct lock_class_key           *key;
-       struct lock_class               *class_cache;
+       struct lock_class               *class_cache[NR_LOCKDEP_CACHING_CLASSES];
         const char                      *name;
  #ifdef CONFIG_LOCK_STAT
         int                             cpu;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h

index ee7e258627f9f5b9999e4c3033f7fa78c5927b48..cb57d657ce4d2643c58e21eb23b2b1434f0f7a2f 100644 (file)
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -299,7 +299,7 @@ struct mm_struct {
          * new_owner->mm == mm
          * new_owner->alloc_lock is held
          */
-       struct task_struct *owner;
+       struct task_struct __rcu *owner;
  #endif
  
  #ifdef CONFIG_PROC_FS
diff --git a/include/linux/module.h b/include/linux/module.h

index aace066bad8f067758cda4d341b9ce8941d2ae1c..b29e7458b96642b2f36162aa755b10ff76b15abe 100644 (file)
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -350,7 +350,10 @@ struct module
         struct tracepoint *tracepoints;
         unsigned int num_tracepoints;
  #endif
-
+#ifdef HAVE_JUMP_LABEL
+       struct jump_entry *jump_entries;
+       unsigned int num_jump_entries;
+#endif
  #ifdef CONFIG_TRACING
         const char **trace_bprintk_fmt_start;
         unsigned int num_trace_bprintk_fmt;
diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h

index 9ed534c991b9312d84876c9162ed63e5ef2b5c66..70cd0603911c97b865bc576a3f4490f523e2fc08 100644 (file)
--- a/include/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/linux/netfilter/nfnetlink_conntrack.h
@@ -39,8 +39,9 @@ enum ctattr_type {
         CTA_TUPLE_MASTER,
         CTA_NAT_SEQ_ADJ_ORIG,
         CTA_NAT_SEQ_ADJ_REPLY,
-       CTA_SECMARK,
+       CTA_SECMARK,            /* obsolete */
         CTA_ZONE,
+       CTA_SECCTX,
         __CTA_MAX
  };
  #define CTA_MAX (__CTA_MAX - 1)
@@ -172,4 +173,11 @@ enum ctattr_help {
  };
  #define CTA_HELP_MAX (__CTA_HELP_MAX - 1)
  
+enum ctattr_secctx {
+       CTA_SECCTX_UNSPEC,
+       CTA_SECCTX_NAME,
+       __CTA_SECCTX_MAX
+};
+#define CTA_SECCTX_MAX (__CTA_SECCTX_MAX - 1)
+
  #endif /* _IPCONNTRACK_NETLINK_H */
diff --git a/include/linux/netfilter/xt_SECMARK.h b/include/linux/netfilter/xt_SECMARK.h

index 6fcd3448b18631f04e081cde470f85218dd7f9b8..989092bd6274b44585ccc0faf4f005a0d7b909b7 100644 (file)
--- a/include/linux/netfilter/xt_SECMARK.h
+++ b/include/linux/netfilter/xt_SECMARK.h
@@ -11,18 +11,12 @@
   * packets are being marked for.
   */
  #define SECMARK_MODE_SEL       0x01            /* SELinux */
-#define SECMARK_SELCTX_MAX     256
-
-struct xt_secmark_target_selinux_info {
-       __u32 selsid;
-       char selctx[SECMARK_SELCTX_MAX];
-};
+#define SECMARK_SECCTX_MAX     256
  
  struct xt_secmark_target_info {
         __u8 mode;
-       union {
-               struct xt_secmark_target_selinux_info sel;
-       } u;
+       __u32 secid;
+       char secctx[SECMARK_SECCTX_MAX];
  };
  
  #endif /*_XT_SECMARK_H_target */
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h

index 508f8cf6da379bc7179b5c8f22d534a63a539ad0..d0edf7d823ae3ef60ec2ba6c6daa3ff0b34643cd 100644 (file)
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -185,7 +185,7 @@ struct nfs_inode {
         struct nfs4_cached_acl  *nfs4_acl;
          /* NFSv4 state */
         struct list_head        open_states;
-       struct nfs_delegation   *delegation;
+       struct nfs_delegation __rcu *delegation;
         fmode_t                  delegation_state;
         struct rw_semaphore     rwsem;
  #endif /* CONFIG_NFS_V4*/
diff --git a/include/linux/notifier.h b/include/linux/notifier.h

index b2f1a4d835506b7d0d8fdce8831d608fa28759bd..2026f9e1ceb8e5cb5d6a9f3a2a1b4c4ff02d5953 100644 (file)
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -49,28 +49,28 @@
  
  struct notifier_block {
         int (*notifier_call)(struct notifier_block *, unsigned long, void *);
-       struct notifier_block *next;
+       struct notifier_block __rcu *next;
         int priority;
  };
  
  struct atomic_notifier_head {
         spinlock_t lock;
-       struct notifier_block *head;
+       struct notifier_block __rcu *head;
  };
  
  struct blocking_notifier_head {
         struct rw_semaphore rwsem;
-       struct notifier_block *head;
+       struct notifier_block __rcu *head;
  };
  
  struct raw_notifier_head {
-       struct notifier_block *head;
+       struct notifier_block __rcu *head;
  };
  
  struct srcu_notifier_head {
         struct mutex mutex;
         struct srcu_struct srcu;
-       struct notifier_block *head;
+       struct notifier_block __rcu *head;
  };
  
  #define ATOMIC_INIT_NOTIFIER_HEAD(name) do {   \
diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h

index 5171639ecf0ff070e74d884deb8c969e3b88132d..32fb81212fd153c0a71ba347ead6429d0a8dcd06 100644 (file)
--- a/include/linux/oprofile.h
+++ b/include/linux/oprofile.h
@@ -15,6 +15,7 @@
  
  #include <linux/types.h>
  #include <linux/spinlock.h>
+#include <linux/init.h>
  #include <asm/atomic.h>
   
  /* Each escaped entry is prefixed by ESCAPE_CODE
@@ -185,4 +186,10 @@ int oprofile_add_data(struct op_entry *entry, unsigned long val);
  int oprofile_add_data64(struct op_entry *entry, u64 val);
  int oprofile_write_commit(struct op_entry *entry);
  
+#ifdef CONFIG_PERF_EVENTS
+int __init oprofile_perf_init(struct oprofile_operations *ops);
+void oprofile_perf_exit(void);
+char *op_name_from_perf_id(void);
+#endif /* CONFIG_PERF_EVENTS */
+
  #endif /* OPROFILE_H */
diff --git a/include/linux/percpu.h b/include/linux/percpu.h

index 49466b13c5c6b310f36b31c052573864df6f02a0..0eb50832aa00fd1bbc31cc23837abfc698412402 100644 (file)
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -39,6 +39,15 @@
         preempt_enable();                               \
  } while (0)
  
+#define get_cpu_ptr(var) ({                            \
+       preempt_disable();                              \
+       this_cpu_ptr(var); })
+
+#define put_cpu_ptr(var) do {                          \
+       (void)(var);                                    \
+       preempt_enable();                               \
+} while (0)
+
  #ifdef CONFIG_SMP
  
  /* minimum unit size, also is the maximum supported allocation size */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h

index 716f99b682c1a57fb3b6f1f72e90aec3982ca5fd..057bf22a8323463a6bd9279d6928d3fea7ed8eb8 100644 (file)
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -486,6 +486,8 @@ struct perf_guest_info_callbacks {
  #include <linux/workqueue.h>
  #include <linux/ftrace.h>
  #include <linux/cpu.h>
+#include <linux/irq_work.h>
+#include <linux/jump_label_ref.h>
  #include <asm/atomic.h>
  #include <asm/local.h>
  
@@ -529,16 +531,22 @@ struct hw_perf_event {
                         int             last_cpu;
                 };
                 struct { /* software */
-                       s64             remaining;
                         struct hrtimer  hrtimer;
                 };
  #ifdef CONFIG_HAVE_HW_BREAKPOINT
                 struct { /* breakpoint */
                         struct arch_hw_breakpoint       info;
                         struct list_head                bp_list;
+                       /*
+                        * Crufty hack to avoid the chicken and egg
+                        * problem hw_breakpoint has with context
+                        * creation and event initalization.
+                        */
+                       struct task_struct              *bp_target;
                 };
  #endif
         };
+       int                             state;
         local64_t                       prev_count;
         u64                             sample_period;
         u64                             last_period;
@@ -550,6 +558,13 @@ struct hw_perf_event {
  #endif
  };
  
+/*
+ * hw_perf_event::state flags
+ */
+#define PERF_HES_STOPPED       0x01 /* the counter is stopped */
+#define PERF_HES_UPTODATE      0x02 /* event->count up-to-date */
+#define PERF_HES_ARCH          0x04
+
  struct perf_event;
  
  /*
@@ -561,36 +576,70 @@ struct perf_event;
   * struct pmu - generic performance monitoring unit
   */
  struct pmu {
-       int (*enable)                   (struct perf_event *event);
-       void (*disable)                 (struct perf_event *event);
-       int (*start)                    (struct perf_event *event);
-       void (*stop)                    (struct perf_event *event);
-       void (*read)                    (struct perf_event *event);
-       void (*unthrottle)              (struct perf_event *event);
+       struct list_head                entry;
+
+       int * __percpu                  pmu_disable_count;
+       struct perf_cpu_context * __percpu pmu_cpu_context;
+       int                             task_ctx_nr;
+
+       /*
+        * Fully disable/enable this PMU, can be used to protect from the PMI
+        * as well as for lazy/batch writing of the MSRs.
+        */
+       void (*pmu_enable)              (struct pmu *pmu); /* optional */
+       void (*pmu_disable)             (struct pmu *pmu); /* optional */
  
         /*
-        * Group events scheduling is treated as a transaction, add group
-        * events as a whole and perform one schedulability test. If the test
-        * fails, roll back the whole group
+        * Try and initialize the event for this PMU.
+        * Should return -ENOENT when the @event doesn't match this PMU.
          */
+       int (*event_init)               (struct perf_event *event);
+
+#define PERF_EF_START  0x01            /* start the counter when adding    */
+#define PERF_EF_RELOAD 0x02            /* reload the counter when starting */
+#define PERF_EF_UPDATE 0x04            /* update the counter when stopping */
  
         /*
-        * Start the transaction, after this ->enable() doesn't need
-        * to do schedulability tests.
+        * Adds/Removes a counter to/from the PMU, can be done inside
+        * a transaction, see the ->*_txn() methods.
          */
-       void (*start_txn)       (const struct pmu *pmu);
+       int  (*add)                     (struct perf_event *event, int flags);
+       void (*del)                     (struct perf_event *event, int flags);
+
         /*
-        * If ->start_txn() disabled the ->enable() schedulability test
+        * Starts/Stops a counter present on the PMU. The PMI handler
+        * should stop the counter when perf_event_overflow() returns
+        * !0. ->start() will be used to continue.
+        */
+       void (*start)                   (struct perf_event *event, int flags);
+       void (*stop)                    (struct perf_event *event, int flags);
+
+       /*
+        * Updates the counter value of the event.
+        */
+       void (*read)                    (struct perf_event *event);
+
+       /*
+        * Group events scheduling is treated as a transaction, add
+        * group events as a whole and perform one schedulability test.
+        * If the test fails, roll back the whole group
+        *
+        * Start the transaction, after this ->add() doesn't need to
+        * do schedulability tests.
+        */
+       void (*start_txn)       (struct pmu *pmu); /* optional */
+       /*
+        * If ->start_txn() disabled the ->add() schedulability test
          * then ->commit_txn() is required to perform one. On success
          * the transaction is closed. On error the transaction is kept
          * open until ->cancel_txn() is called.
          */
-       int  (*commit_txn)      (const struct pmu *pmu);
+       int  (*commit_txn)      (struct pmu *pmu); /* optional */
         /*
-        * Will cancel the transaction, assumes ->disable() is called for
-        * each successfull ->enable() during the transaction.
+        * Will cancel the transaction, assumes ->del() is called
+        * for each successfull ->add() during the transaction.
          */
-       void (*cancel_txn)      (const struct pmu *pmu);
+       void (*cancel_txn)      (struct pmu *pmu); /* optional */
  };
  
  /**
@@ -631,11 +680,6 @@ struct perf_buffer {
         void                            *data_pages[0];
  };
  
-struct perf_pending_entry {
-       struct perf_pending_entry *next;
-       void (*func)(struct perf_pending_entry *);
-};
-
  struct perf_sample_data;
  
  typedef void (*perf_overflow_handler_t)(struct perf_event *, int,
@@ -656,6 +700,7 @@ struct swevent_hlist {
  
  #define PERF_ATTACH_CONTEXT    0x01
  #define PERF_ATTACH_GROUP      0x02
+#define PERF_ATTACH_TASK       0x04
  
  /**
   * struct perf_event - performance event kernel representation:
@@ -669,7 +714,7 @@ struct perf_event {
         int                             nr_siblings;
         int                             group_flags;
         struct perf_event               *group_leader;
-       const struct pmu                *pmu;
+       struct pmu                      *pmu;
  
         enum perf_event_active_state    state;
         unsigned int                    attach_state;
@@ -743,7 +788,7 @@ struct perf_event {
         int                             pending_wakeup;
         int                             pending_kill;
         int                             pending_disable;
-       struct perf_pending_entry       pending;
+       struct irq_work                 pending;
  
         atomic_t                        event_limit;
  
@@ -763,12 +808,19 @@ struct perf_event {
  #endif /* CONFIG_PERF_EVENTS */
  };
  
+enum perf_event_context_type {
+       task_context,
+       cpu_context,
+};
+
  /**
   * struct perf_event_context - event context structure
   *
   * Used as a container for task events and CPU events as well:
   */
  struct perf_event_context {
+       enum perf_event_context_type    type;
+       struct pmu                      *pmu;
         /*
          * Protect the states of the events in the list,
          * nr_active, and the list:
@@ -808,6 +860,12 @@ struct perf_event_context {
         struct rcu_head                 rcu_head;
  };
  
+/*
+ * Number of contexts where an event can trigger:
+ *     task, softirq, hardirq, nmi.
+ */
+#define PERF_NR_CONTEXTS       4
+
  /**
   * struct perf_event_cpu_context - per cpu event context structure
   */
@@ -815,18 +873,9 @@ struct perf_cpu_context {
         struct perf_event_context       ctx;
         struct perf_event_context       *task_ctx;
         int                             active_oncpu;
-       int                             max_pertask;
         int                             exclusive;
-       struct swevent_hlist            *swevent_hlist;
-       struct mutex                    hlist_mutex;
-       int                             hlist_refcount;
-
-       /*
-        * Recursion avoidance:
-        *
-        * task, softirq, irq, nmi context
-        */
-       int                             recursion[4];
+       struct list_head                rotation_list;
+       int                             jiffies_interval;
  };
  
  struct perf_output_handle {
@@ -842,26 +891,34 @@ struct perf_output_handle {
  
  #ifdef CONFIG_PERF_EVENTS
  
-/*
- * Set by architecture code:
- */
-extern int perf_max_events;
+extern int perf_pmu_register(struct pmu *pmu);
+extern void perf_pmu_unregister(struct pmu *pmu);
+
+extern int perf_num_counters(void);
+extern const char *perf_pmu_name(void);
+extern void __perf_event_task_sched_in(struct task_struct *task);
+extern void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next);
  
-extern const struct pmu *hw_perf_event_init(struct perf_event *event);
+extern atomic_t perf_task_events;
+
+static inline void perf_event_task_sched_in(struct task_struct *task)
+{
+       COND_STMT(&perf_task_events, __perf_event_task_sched_in(task));
+}
+
+static inline
+void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next)
+{
+       COND_STMT(&perf_task_events, __perf_event_task_sched_out(task, next));
+}
  
-extern void perf_event_task_sched_in(struct task_struct *task);
-extern void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next);
-extern void perf_event_task_tick(struct task_struct *task);
  extern int perf_event_init_task(struct task_struct *child);
  extern void perf_event_exit_task(struct task_struct *child);
  extern void perf_event_free_task(struct task_struct *task);
-extern void set_perf_event_pending(void);
-extern void perf_event_do_pending(void);
+extern void perf_event_delayed_put(struct task_struct *task);
  extern void perf_event_print_debug(void);
-extern void __perf_disable(void);
-extern bool __perf_enable(void);
-extern void perf_disable(void);
-extern void perf_enable(void);
+extern void perf_pmu_disable(struct pmu *pmu);
+extern void perf_pmu_enable(struct pmu *pmu);
  extern int perf_event_task_disable(void);
  extern int perf_event_task_enable(void);
  extern void perf_event_update_userpage(struct perf_event *event);
@@ -869,7 +926,7 @@ extern int perf_event_release_kernel(struct perf_event *event);
  extern struct perf_event *
  perf_event_create_kernel_counter(struct perf_event_attr *attr,
                                 int cpu,
-                               pid_t pid,
+                               struct task_struct *task,
                                 perf_overflow_handler_t callback);
  extern u64 perf_event_read_value(struct perf_event *event,
                                  u64 *enabled, u64 *running);
@@ -920,14 +977,7 @@ extern int perf_event_overflow(struct perf_event *event, int nmi,
   */
  static inline int is_software_event(struct perf_event *event)
  {
-       switch (event->attr.type) {
-       case PERF_TYPE_SOFTWARE:
-       case PERF_TYPE_TRACEPOINT:
-       /* for now the breakpoint stuff also works as software event */
-       case PERF_TYPE_BREAKPOINT:
-               return 1;
-       }
-       return 0;
+       return event->pmu->task_ctx_nr == perf_sw_context;
  }
  
  extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
@@ -954,18 +1004,20 @@ static inline void perf_fetch_caller_regs(struct pt_regs *regs)
         perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
  }
  
-static inline void
+static __always_inline void
  perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
  {
-       if (atomic_read(&perf_swevent_enabled[event_id])) {
-               struct pt_regs hot_regs;
-
-               if (!regs) {
-                       perf_fetch_caller_regs(&hot_regs);
-                       regs = &hot_regs;
-               }
-               __perf_sw_event(event_id, nr, nmi, regs, addr);
+       struct pt_regs hot_regs;
+
+       JUMP_LABEL(&perf_swevent_enabled[event_id], have_event);
+       return;
+
+have_event:
+       if (!regs) {
+               perf_fetch_caller_regs(&hot_regs);
+               regs = &hot_regs;
         }
+       __perf_sw_event(event_id, nr, nmi, regs, addr);
  }
  
  extern void perf_event_mmap(struct vm_area_struct *vma);
@@ -976,7 +1028,21 @@ extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks
  extern void perf_event_comm(struct task_struct *tsk);
  extern void perf_event_fork(struct task_struct *tsk);
  
-extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
+/* Callchains */
+DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
+
+extern void perf_callchain_user(struct perf_callchain_entry *entry,
+                               struct pt_regs *regs);
+extern void perf_callchain_kernel(struct perf_callchain_entry *entry,
+                                 struct pt_regs *regs);
+
+
+static inline void
+perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
+{
+       if (entry->nr < PERF_MAX_STACK_DEPTH)
+               entry->ip[entry->nr++] = ip;
+}
  
  extern int sysctl_perf_event_paranoid;
  extern int sysctl_perf_event_mlock;
@@ -1019,21 +1085,18 @@ extern int perf_swevent_get_recursion_context(void);
  extern void perf_swevent_put_recursion_context(int rctx);
  extern void perf_event_enable(struct perf_event *event);
  extern void perf_event_disable(struct perf_event *event);
+extern void perf_event_task_tick(void);
  #else
  static inline void
  perf_event_task_sched_in(struct task_struct *task)                     { }
  static inline void
  perf_event_task_sched_out(struct task_struct *task,
                             struct task_struct *next)                   { }
-static inline void
-perf_event_task_tick(struct task_struct *task)                         { }
  static inline int perf_event_init_task(struct task_struct *child)      { return 0; }
  static inline void perf_event_exit_task(struct task_struct *child)     { }
  static inline void perf_event_free_task(struct task_struct *task)      { }
-static inline void perf_event_do_pending(void)                         { }
+static inline void perf_event_delayed_put(struct task_struct *task)    { }
  static inline void perf_event_print_debug(void)                                { }
-static inline void perf_disable(void)                                  { }
-static inline void perf_enable(void)                                   { }
  static inline int perf_event_task_disable(void)                                { return -EINVAL; }
  static inline int perf_event_task_enable(void)                         { return -EINVAL; }
  
@@ -1056,6 +1119,7 @@ static inline int  perf_swevent_get_recursion_context(void)               { return -1; }
  static inline void perf_swevent_put_recursion_context(int rctx)                { }
  static inline void perf_event_enable(struct perf_event *event)         { }
  static inline void perf_event_disable(struct perf_event *event)                { }
+static inline void perf_event_task_tick(void)                          { }
  #endif
  
  #define perf_output_put(handle, x) \
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h

index 634b8e674ac578e2916b110a28b992bf2e8dfd22..a39cbed9ee17a5d771f7c3e7ca129e3a05171220 100644 (file)
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -47,6 +47,8 @@ static inline void *radix_tree_indirect_to_ptr(void *ptr)
  {
         return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
  }
+#define radix_tree_indirect_to_ptr(ptr) \
+       radix_tree_indirect_to_ptr((void __force *)(ptr))
  
  static inline int radix_tree_is_indirect_ptr(void *ptr)
  {
@@ -61,7 +63,7 @@ static inline int radix_tree_is_indirect_ptr(void *ptr)
  struct radix_tree_root {
         unsigned int            height;
         gfp_t                   gfp_mask;
-       struct radix_tree_node  *rnode;
+       struct radix_tree_node  __rcu *rnode;
  };
  
  #define RADIX_TREE_INIT(mask)  {                                       \
diff --git a/include/linux/rculist.h b/include/linux/rculist.h

index 4ec3b38ce9c584049229b71bbf537770d6fbe263..f31ef61f1c650b585bd6faf969f7cec754dffe2d 100644 (file)
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -9,6 +9,21 @@
  #include <linux/list.h>
  #include <linux/rcupdate.h>
  
+/*
+ * Why is there no list_empty_rcu()?  Because list_empty() serves this
+ * purpose.  The list_empty() function fetches the RCU-protected pointer
+ * and compares it to the address of the list head, but neither dereferences
+ * this pointer itself nor provides this pointer to the caller.  Therefore,
+ * it is not necessary to use rcu_dereference(), so that list_empty() can
+ * be used anywhere you would want to use a list_empty_rcu().
+ */
+
+/*
+ * return the ->next pointer of a list_head in an rcu safe
+ * way, we must not access it directly
+ */
+#define list_next_rcu(list)    (*((struct list_head __rcu **)(&(list)->next)))
+
  /*
   * Insert a new entry between two known consecutive entries.
   *
@@ -20,7 +35,7 @@ static inline void __list_add_rcu(struct list_head *new,
  {
         new->next = next;
         new->prev = prev;
-       rcu_assign_pointer(prev->next, new);
+       rcu_assign_pointer(list_next_rcu(prev), new);
         next->prev = new;
  }
  
@@ -138,7 +153,7 @@ static inline void list_replace_rcu(struct list_head *old,
  {
         new->next = old->next;
         new->prev = old->prev;
-       rcu_assign_pointer(new->prev->next, new);
+       rcu_assign_pointer(list_next_rcu(new->prev), new);
         new->next->prev = new;
         old->prev = LIST_POISON2;
  }
@@ -193,7 +208,7 @@ static inline void list_splice_init_rcu(struct list_head *list,
          */
  
         last->next = at;
-       rcu_assign_pointer(head->next, first);
+       rcu_assign_pointer(list_next_rcu(head), first);
         first->prev = head;
         at->prev = last;
  }
@@ -208,7 +223,9 @@ static inline void list_splice_init_rcu(struct list_head *list,
   * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
   */
  #define list_entry_rcu(ptr, type, member) \
-       container_of(rcu_dereference_raw(ptr), type, member)
+       ({typeof (*ptr) __rcu *__ptr = (typeof (*ptr) __rcu __force *)ptr; \
+        container_of((typeof(ptr))rcu_dereference_raw(__ptr), type, member); \
+       })
  
  /**
   * list_first_entry_rcu - get the first element from a list
@@ -225,9 +242,9 @@ static inline void list_splice_init_rcu(struct list_head *list,
         list_entry_rcu((ptr)->next, type, member)
  
  #define __list_for_each_rcu(pos, head) \
-       for (pos = rcu_dereference_raw((head)->next); \
+       for (pos = rcu_dereference_raw(list_next_rcu(head)); \
                 pos != (head); \
-               pos = rcu_dereference_raw(pos->next))
+               pos = rcu_dereference_raw(list_next_rcu((pos)))
  
  /**
   * list_for_each_entry_rcu     -       iterate over rcu list of given type
@@ -257,9 +274,9 @@ static inline void list_splice_init_rcu(struct list_head *list,
   * as long as the traversal is guarded by rcu_read_lock().
   */
  #define list_for_each_continue_rcu(pos, head) \
-       for ((pos) = rcu_dereference_raw((pos)->next); \
+       for ((pos) = rcu_dereference_raw(list_next_rcu(pos)); \
                 prefetch((pos)->next), (pos) != (head); \
-               (pos) = rcu_dereference_raw((pos)->next))
+               (pos) = rcu_dereference_raw(list_next_rcu(pos)))
  
  /**
   * list_for_each_entry_continue_rcu - continue iteration over list of given type
@@ -314,12 +331,19 @@ static inline void hlist_replace_rcu(struct hlist_node *old,
  
         new->next = next;
         new->pprev = old->pprev;
-       rcu_assign_pointer(*new->pprev, new);
+       rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new);
         if (next)
                 new->next->pprev = &new->next;
         old->pprev = LIST_POISON2;
  }
  
+/*
+ * return the first or the next element in an RCU protected hlist
+ */
+#define hlist_first_rcu(head)  (*((struct hlist_node __rcu **)(&(head)->first)))
+#define hlist_next_rcu(node)   (*((struct hlist_node __rcu **)(&(node)->next)))
+#define hlist_pprev_rcu(node)  (*((struct hlist_node __rcu **)((node)->pprev)))
+
  /**
   * hlist_add_head_rcu
   * @n: the element to add to the hash list.
@@ -346,7 +370,7 @@ static inline void hlist_add_head_rcu(struct hlist_node *n,
  
         n->next = first;
         n->pprev = &h->first;
-       rcu_assign_pointer(h->first, n);
+       rcu_assign_pointer(hlist_first_rcu(h), n);
         if (first)
                 first->pprev = &n->next;
  }
@@ -374,7 +398,7 @@ static inline void hlist_add_before_rcu(struct hlist_node *n,
  {
         n->pprev = next->pprev;
         n->next = next;
-       rcu_assign_pointer(*(n->pprev), n);
+       rcu_assign_pointer(hlist_pprev_rcu(n), n);
         next->pprev = &n->next;
  }
  
@@ -401,15 +425,15 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
  {
         n->next = prev->next;
         n->pprev = &prev->next;
-       rcu_assign_pointer(prev->next, n);
+       rcu_assign_pointer(hlist_next_rcu(prev), n);
         if (n->next)
                 n->next->pprev = &n->next;
  }
  
-#define __hlist_for_each_rcu(pos, head)                        \
-       for (pos = rcu_dereference((head)->first);      \
-            pos && ({ prefetch(pos->next); 1; });      \
-            pos = rcu_dereference(pos->next))
+#define __hlist_for_each_rcu(pos, head)                                \
+       for (pos = rcu_dereference(hlist_first_rcu(head));      \
+            pos && ({ prefetch(pos->next); 1; });              \
+            pos = rcu_dereference(hlist_next_rcu(pos)))
  
  /**
   * hlist_for_each_entry_rcu - iterate over rcu list of given type
@@ -422,11 +446,11 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
   * the _rcu list-mutation primitives such as hlist_add_head_rcu()
   * as long as the traversal is guarded by rcu_read_lock().
   */
-#define hlist_for_each_entry_rcu(tpos, pos, head, member)               \
-       for (pos = rcu_dereference_raw((head)->first);                   \
+#define hlist_for_each_entry_rcu(tpos, pos, head, member)              \
+       for (pos = rcu_dereference_raw(hlist_first_rcu(head));          \
                 pos && ({ prefetch(pos->next); 1; }) &&                  \
                 ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
-               pos = rcu_dereference_raw(pos->next))
+               pos = rcu_dereference_raw(hlist_next_rcu(pos)))
  
  /**
   * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type
diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h

index b70ffe53cb9fe77a668f57c2bc584216f90f95b6..2ae13714828bc42568e77684fedfc2cf929c291d 100644 (file)
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -37,6 +37,12 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
         }
  }
  
+#define hlist_nulls_first_rcu(head) \
+       (*((struct hlist_nulls_node __rcu __force **)&(head)->first))
+
+#define hlist_nulls_next_rcu(node) \
+       (*((struct hlist_nulls_node __rcu __force **)&(node)->next))
+
  /**
   * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
   * @n: the element to delete from the hash list.
@@ -88,7 +94,7 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
  
         n->next = first;
         n->pprev = &h->first;
-       rcu_assign_pointer(h->first, n);
+       rcu_assign_pointer(hlist_nulls_first_rcu(h), n);
         if (!is_a_nulls(first))
                 first->pprev = &n->next;
  }
@@ -100,11 +106,11 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
   * @member:    the name of the hlist_nulls_node within the struct.
   *
   */
-#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \
-       for (pos = rcu_dereference_raw((head)->first);                   \
-               (!is_a_nulls(pos)) &&                   \
+#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member)                        \
+       for (pos = rcu_dereference_raw(hlist_nulls_first_rcu(head));            \
+               (!is_a_nulls(pos)) &&                                           \
                 ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \
-               pos = rcu_dereference_raw(pos->next))
+               pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)))
  
  #endif
  #endif
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h

index 83af1f8d8b746cde3b8369d7125f3cdfe07da6d3..03cda7bed98587b128c5a9953316644a8debb4d2 100644 (file)
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -41,11 +41,15 @@
  #include <linux/lockdep.h>
  #include <linux/completion.h>
  #include <linux/debugobjects.h>
+#include <linux/compiler.h>
  
  #ifdef CONFIG_RCU_TORTURE_TEST
  extern int rcutorture_runnable; /* for sysctl */
  #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
  
+#define ULONG_CMP_GE(a, b)     (ULONG_MAX / 2 >= (a) - (b))
+#define ULONG_CMP_LT(a, b)     (ULONG_MAX / 2 < (a) - (b))
+
  /**
   * struct rcu_head - callback structure for use with RCU
   * @next: next update requests in a list
@@ -57,29 +61,94 @@ struct rcu_head {
  };
  
  /* Exported common interfaces */
-extern void rcu_barrier(void);
+extern void call_rcu_sched(struct rcu_head *head,
+                          void (*func)(struct rcu_head *rcu));
+extern void synchronize_sched(void);
  extern void rcu_barrier_bh(void);
  extern void rcu_barrier_sched(void);
  extern void synchronize_sched_expedited(void);
  extern int sched_expedited_torture_stats(char *page);
  
+static inline void __rcu_read_lock_bh(void)
+{
+       local_bh_disable();
+}
+
+static inline void __rcu_read_unlock_bh(void)
+{
+       local_bh_enable();
+}
+
+#ifdef CONFIG_PREEMPT_RCU
+
+extern void __rcu_read_lock(void);
+extern void __rcu_read_unlock(void);
+void synchronize_rcu(void);
+
+/*
+ * Defined as a macro as it is a very low level header included from
+ * areas that don't even know about current.  This gives the rcu_read_lock()
+ * nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other
+ * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
+ */
+#define rcu_preempt_depth() (current->rcu_read_lock_nesting)
+
+#else /* #ifdef CONFIG_PREEMPT_RCU */
+
+static inline void __rcu_read_lock(void)
+{
+       preempt_disable();
+}
+
+static inline void __rcu_read_unlock(void)
+{
+       preempt_enable();
+}
+
+static inline void synchronize_rcu(void)
+{
+       synchronize_sched();
+}
+
+static inline int rcu_preempt_depth(void)
+{
+       return 0;
+}
+
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
  /* Internal to kernel */
  extern void rcu_init(void);
+extern void rcu_sched_qs(int cpu);
+extern void rcu_bh_qs(int cpu);
+extern void rcu_check_callbacks(int cpu, int user);
+struct notifier_block;
+
+#ifdef CONFIG_NO_HZ
+
+extern void rcu_enter_nohz(void);
+extern void rcu_exit_nohz(void);
+
+#else /* #ifdef CONFIG_NO_HZ */
+
+static inline void rcu_enter_nohz(void)
+{
+}
+
+static inline void rcu_exit_nohz(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_NO_HZ */
  
  #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
  #include <linux/rcutree.h>
-#elif defined(CONFIG_TINY_RCU)
+#elif defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU)
  #include <linux/rcutiny.h>
  #else
  #error "Unknown RCU implementation specified to kernel configuration"
  #endif
  
-#define RCU_HEAD_INIT  { .next = NULL, .func = NULL }
-#define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT
-#define INIT_RCU_HEAD(ptr) do { \
-       (ptr)->next = NULL; (ptr)->func = NULL; \
-} while (0)
-
  /*
   * init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic
   * initialization and destruction of rcu_head on the stack. rcu_head structures
@@ -120,14 +189,15 @@ extern struct lockdep_map rcu_sched_lock_map;
  extern int debug_lockdep_rcu_enabled(void);
  
  /**
- * rcu_read_lock_held - might we be in RCU read-side critical section?
+ * rcu_read_lock_held() - might we be in RCU read-side critical section?
   *
   * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU
   * read-side critical section.  In absence of CONFIG_DEBUG_LOCK_ALLOC,
   * this assumes we are in an RCU read-side critical section unless it can
- * prove otherwise.
+ * prove otherwise.  This is useful for debug checks in functions that
+ * require that they be called within an RCU read-side critical section.
   *
- * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
+ * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
   * and while lockdep is disabled.
   */
  static inline int rcu_read_lock_held(void)
@@ -144,14 +214,16 @@ static inline int rcu_read_lock_held(void)
  extern int rcu_read_lock_bh_held(void);
  
  /**
- * rcu_read_lock_sched_held - might we be in RCU-sched read-side critical section?
+ * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
   *
   * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an
   * RCU-sched read-side critical section.  In absence of
   * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
   * critical section unless it can prove otherwise.  Note that disabling
   * of preemption (including disabling irqs) counts as an RCU-sched
- * read-side critical section.
+ * read-side critical section.  This is useful for debug checks in functions
+ * that required that they be called within an RCU-sched read-side
+ * critical section.
   *
   * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
   * and while lockdep is disabled.
@@ -211,7 +283,11 @@ static inline int rcu_read_lock_sched_held(void)
  
  extern int rcu_my_thread_group_empty(void);
  
-#define __do_rcu_dereference_check(c)                                  \
+/**
+ * rcu_lockdep_assert - emit lockdep splat if specified condition not met
+ * @c: condition to check
+ */
+#define rcu_lockdep_assert(c)                                          \
         do {                                                            \
                 static bool __warned;                                   \
                 if (debug_lockdep_rcu_enabled() && !__warned && !(c)) { \
@@ -220,41 +296,163 @@ extern int rcu_my_thread_group_empty(void);
                 }                                                       \
         } while (0)
  
+#else /* #ifdef CONFIG_PROVE_RCU */
+
+#define rcu_lockdep_assert(c) do { } while (0)
+
+#endif /* #else #ifdef CONFIG_PROVE_RCU */
+
+/*
+ * Helper functions for rcu_dereference_check(), rcu_dereference_protected()
+ * and rcu_assign_pointer().  Some of these could be folded into their
+ * callers, but they are left separate in order to ease introduction of
+ * multiple flavors of pointers to match the multiple flavors of RCU
+ * (e.g., __rcu_bh, * __rcu_sched, and __srcu), should this make sense in
+ * the future.
+ */
+
+#ifdef __CHECKER__
+#define rcu_dereference_sparse(p, space) \
+       ((void)(((typeof(*p) space *)p) == p))
+#else /* #ifdef __CHECKER__ */
+#define rcu_dereference_sparse(p, space)
+#endif /* #else #ifdef __CHECKER__ */
+
+#define __rcu_access_pointer(p, space) \
+       ({ \
+               typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \
+               rcu_dereference_sparse(p, space); \
+               ((typeof(*p) __force __kernel *)(_________p1)); \
+       })
+#define __rcu_dereference_check(p, c, space) \
+       ({ \
+               typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \
+               rcu_lockdep_assert(c); \
+               rcu_dereference_sparse(p, space); \
+               smp_read_barrier_depends(); \
+               ((typeof(*p) __force __kernel *)(_________p1)); \
+       })
+#define __rcu_dereference_protected(p, c, space) \
+       ({ \
+               rcu_lockdep_assert(c); \
+               rcu_dereference_sparse(p, space); \
+               ((typeof(*p) __force __kernel *)(p)); \
+       })
+
+#define __rcu_dereference_index_check(p, c) \
+       ({ \
+               typeof(p) _________p1 = ACCESS_ONCE(p); \
+               rcu_lockdep_assert(c); \
+               smp_read_barrier_depends(); \
+               (_________p1); \
+       })
+#define __rcu_assign_pointer(p, v, space) \
+       ({ \
+               if (!__builtin_constant_p(v) || \
+                   ((v) != NULL)) \
+                       smp_wmb(); \
+               (p) = (typeof(*v) __force space *)(v); \
+       })
+
+
+/**
+ * rcu_access_pointer() - fetch RCU pointer with no dereferencing
+ * @p: The pointer to read
+ *
+ * Return the value of the specified RCU-protected pointer, but omit the
+ * smp_read_barrier_depends() and keep the ACCESS_ONCE().  This is useful
+ * when the value of this pointer is accessed, but the pointer is not
+ * dereferenced, for example, when testing an RCU-protected pointer against
+ * NULL.  Although rcu_access_pointer() may also be used in cases where
+ * update-side locks prevent the value of the pointer from changing, you
+ * should instead use rcu_dereference_protected() for this use case.
+ */
+#define rcu_access_pointer(p) __rcu_access_pointer((p), __rcu)
+
  /**
- * rcu_dereference_check - rcu_dereference with debug checking
+ * rcu_dereference_check() - rcu_dereference with debug checking
   * @p: The pointer to read, prior to dereferencing
   * @c: The conditions under which the dereference will take place
   *
   * Do an rcu_dereference(), but check that the conditions under which the
- * dereference will take place are correct.  Typically the conditions indicate
- * the various locking conditions that should be held at that point.  The check
- * should return true if the conditions are satisfied.
+ * dereference will take place are correct.  Typically the conditions
+ * indicate the various locking conditions that should be held at that
+ * point.  The check should return true if the conditions are satisfied.
+ * An implicit check for being in an RCU read-side critical section
+ * (rcu_read_lock()) is included.
   *
   * For example:
   *
- *     bar = rcu_dereference_check(foo->bar, rcu_read_lock_held() ||
- *                                           lockdep_is_held(&foo->lock));
+ *     bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock));
   *
   * could be used to indicate to lockdep that foo->bar may only be dereferenced
- * if either the RCU read lock is held, or that the lock required to replace
+ * if either rcu_read_lock() is held, or that the lock required to replace
   * the bar struct at foo->bar is held.
   *
   * Note that the list of conditions may also include indications of when a lock
   * need not be held, for example during initialisation or destruction of the
   * target struct:
   *
- *     bar = rcu_dereference_check(foo->bar, rcu_read_lock_held() ||
- *                                           lockdep_is_held(&foo->lock) ||
+ *     bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock) ||
   *                                           atomic_read(&foo->usage) == 0);
+ *
+ * Inserts memory barriers on architectures that require them
+ * (currently only the Alpha), prevents the compiler from refetching
+ * (and from merging fetches), and, more importantly, documents exactly
+ * which pointers are protected by RCU and checks that the pointer is
+ * annotated as __rcu.
   */
  #define rcu_dereference_check(p, c) \
-       ({ \
-               __do_rcu_dereference_check(c); \
-               rcu_dereference_raw(p); \
-       })
+       __rcu_dereference_check((p), rcu_read_lock_held() || (c), __rcu)
+
+/**
+ * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking
+ * @p: The pointer to read, prior to dereferencing
+ * @c: The conditions under which the dereference will take place
+ *
+ * This is the RCU-bh counterpart to rcu_dereference_check().
+ */
+#define rcu_dereference_bh_check(p, c) \
+       __rcu_dereference_check((p), rcu_read_lock_bh_held() || (c), __rcu)
  
  /**
- * rcu_dereference_protected - fetch RCU pointer when updates prevented
+ * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking
+ * @p: The pointer to read, prior to dereferencing
+ * @c: The conditions under which the dereference will take place
+ *
+ * This is the RCU-sched counterpart to rcu_dereference_check().
+ */
+#define rcu_dereference_sched_check(p, c) \
+       __rcu_dereference_check((p), rcu_read_lock_sched_held() || (c), \
+                               __rcu)
+
+#define rcu_dereference_raw(p) rcu_dereference_check(p, 1) /*@@@ needed? @@@*/
+
+/**
+ * rcu_dereference_index_check() - rcu_dereference for indices with debug checking
+ * @p: The pointer to read, prior to dereferencing
+ * @c: The conditions under which the dereference will take place
+ *
+ * Similar to rcu_dereference_check(), but omits the sparse checking.
+ * This allows rcu_dereference_index_check() to be used on integers,
+ * which can then be used as array indices.  Attempting to use
+ * rcu_dereference_check() on an integer will give compiler warnings
+ * because the sparse address-space mechanism relies on dereferencing
+ * the RCU-protected pointer.  Dereferencing integers is not something
+ * that even gcc will put up with.
+ *
+ * Note that this function does not implicitly check for RCU read-side
+ * critical sections.  If this function gains lots of uses, it might
+ * make sense to provide versions for each flavor of RCU, but it does
+ * not make sense as of early 2010.
+ */
+#define rcu_dereference_index_check(p, c) \
+       __rcu_dereference_index_check((p), (c))
+
+/**
+ * rcu_dereference_protected() - fetch RCU pointer when updates prevented
+ * @p: The pointer to read, prior to dereferencing
+ * @c: The conditions under which the dereference will take place
   *
   * Return the value of the specified RCU-protected pointer, but omit
   * both the smp_read_barrier_depends() and the ACCESS_ONCE().  This
@@ -263,35 +461,61 @@ extern int rcu_my_thread_group_empty(void);
   * prevent the compiler from repeating this reference or combining it
   * with other references, so it should not be used without protection
   * of appropriate locks.
+ *
+ * This function is only for update-side use.  Using this function
+ * when protected only by rcu_read_lock() will result in infrequent
+ * but very ugly failures.
   */
  #define rcu_dereference_protected(p, c) \
-       ({ \
-               __do_rcu_dereference_check(c); \
-               (p); \
-       })
+       __rcu_dereference_protected((p), (c), __rcu)
  
-#else /* #ifdef CONFIG_PROVE_RCU */
+/**
+ * rcu_dereference_bh_protected() - fetch RCU-bh pointer when updates prevented
+ * @p: The pointer to read, prior to dereferencing
+ * @c: The conditions under which the dereference will take place
+ *
+ * This is the RCU-bh counterpart to rcu_dereference_protected().
+ */
+#define rcu_dereference_bh_protected(p, c) \
+       __rcu_dereference_protected((p), (c), __rcu)
  
-#define rcu_dereference_check(p, c)    rcu_dereference_raw(p)
-#define rcu_dereference_protected(p, c) (p)
+/**
+ * rcu_dereference_sched_protected() - fetch RCU-sched pointer when updates prevented
+ * @p: The pointer to read, prior to dereferencing
+ * @c: The conditions under which the dereference will take place
+ *
+ * This is the RCU-sched counterpart to rcu_dereference_protected().
+ */
+#define rcu_dereference_sched_protected(p, c) \
+       __rcu_dereference_protected((p), (c), __rcu)
  
-#endif /* #else #ifdef CONFIG_PROVE_RCU */
  
  /**
- * rcu_access_pointer - fetch RCU pointer with no dereferencing
+ * rcu_dereference() - fetch RCU-protected pointer for dereferencing
+ * @p: The pointer to read, prior to dereferencing
   *
- * Return the value of the specified RCU-protected pointer, but omit the
- * smp_read_barrier_depends() and keep the ACCESS_ONCE().  This is useful
- * when the value of this pointer is accessed, but the pointer is not
- * dereferenced, for example, when testing an RCU-protected pointer against
- * NULL.  This may also be used in cases where update-side locks prevent
- * the value of the pointer from changing, but rcu_dereference_protected()
- * is a lighter-weight primitive for this use case.
+ * This is a simple wrapper around rcu_dereference_check().
+ */
+#define rcu_dereference(p) rcu_dereference_check(p, 0)
+
+/**
+ * rcu_dereference_bh() - fetch an RCU-bh-protected pointer for dereferencing
+ * @p: The pointer to read, prior to dereferencing
+ *
+ * Makes rcu_dereference_check() do the dirty work.
+ */
+#define rcu_dereference_bh(p) rcu_dereference_bh_check(p, 0)
+
+/**
+ * rcu_dereference_sched() - fetch RCU-sched-protected pointer for dereferencing
+ * @p: The pointer to read, prior to dereferencing
+ *
+ * Makes rcu_dereference_check() do the dirty work.
   */
-#define rcu_access_pointer(p)  ACCESS_ONCE(p)
+#define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0)
  
  /**
- * rcu_read_lock - mark the beginning of an RCU read-side critical section.
+ * rcu_read_lock() - mark the beginning of an RCU read-side critical section
   *
   * When synchronize_rcu() is invoked on one CPU while other CPUs
   * are within RCU read-side critical sections, then the
@@ -302,7 +526,7 @@ extern int rcu_my_thread_group_empty(void);
   * until after the all the other CPUs exit their critical sections.
   *
   * Note, however, that RCU callbacks are permitted to run concurrently
- * with RCU read-side critical sections.  One way that this can happen
+ * with new RCU read-side critical sections.  One way that this can happen
   * is via the following sequence of events: (1) CPU 0 enters an RCU
   * read-side critical section, (2) CPU 1 invokes call_rcu() to register
   * an RCU callback, (3) CPU 0 exits the RCU read-side critical section,
@@ -317,7 +541,20 @@ extern int rcu_my_thread_group_empty(void);
   * will be deferred until the outermost RCU read-side critical section
   * completes.
   *
- * It is illegal to block while in an RCU read-side critical section.
+ * You can avoid reading and understanding the next paragraph by
+ * following this rule: don't put anything in an rcu_read_lock() RCU
+ * read-side critical section that would block in a !PREEMPT kernel.
+ * But if you want the full story, read on!
+ *
+ * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU), it
+ * is illegal to block while in an RCU read-side critical section.  In
+ * preemptible RCU implementations (TREE_PREEMPT_RCU and TINY_PREEMPT_RCU)
+ * in CONFIG_PREEMPT kernel builds, RCU read-side critical sections may
+ * be preempted, but explicit blocking is illegal.  Finally, in preemptible
+ * RCU implementations in real-time (CONFIG_PREEMPT_RT) kernel builds,
+ * RCU read-side critical sections may be preempted and they may also
+ * block, but only when acquiring spinlocks that are subject to priority
+ * inheritance.
   */
  static inline void rcu_read_lock(void)
  {
@@ -337,7 +574,7 @@ static inline void rcu_read_lock(void)
   */
  
  /**
- * rcu_read_unlock - marks the end of an RCU read-side critical section.
+ * rcu_read_unlock() - marks the end of an RCU read-side critical section.
   *
   * See rcu_read_lock() for more information.
   */
@@ -349,15 +586,16 @@ static inline void rcu_read_unlock(void)
  }
  
  /**
- * rcu_read_lock_bh - mark the beginning of a softirq-only RCU critical section
+ * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section
   *
   * This is equivalent of rcu_read_lock(), but to be used when updates
- * are being done using call_rcu_bh(). Since call_rcu_bh() callbacks
- * consider completion of a softirq handler to be a quiescent state,
- * a process in RCU read-side critical section must be protected by
- * disabling softirqs. Read-side critical sections in interrupt context
- * can use just rcu_read_lock().
- *
+ * are being done using call_rcu_bh() or synchronize_rcu_bh(). Since
+ * both call_rcu_bh() and synchronize_rcu_bh() consider completion of a
+ * softirq handler to be a quiescent state, a process in RCU read-side
+ * critical section must be protected by disabling softirqs. Read-side
+ * critical sections in interrupt context can use just rcu_read_lock(),
+ * though this should at least be commented to avoid confusing people
+ * reading the code.
   */
  static inline void rcu_read_lock_bh(void)
  {
@@ -379,13 +617,12 @@ static inline void rcu_read_unlock_bh(void)
  }
  
  /**
- * rcu_read_lock_sched - mark the beginning of a RCU-classic critical section
+ * rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section
   *
- * Should be used with either
- * - synchronize_sched()
- * or
- * - call_rcu_sched() and rcu_barrier_sched()
- * on the write-side to insure proper synchronization.
+ * This is equivalent of rcu_read_lock(), but to be used when updates
+ * are being done using call_rcu_sched() or synchronize_rcu_sched().
+ * Read-side critical sections can also be introduced by anything that
+ * disables preemption, including local_irq_disable() and friends.
   */
  static inline void rcu_read_lock_sched(void)
  {
@@ -420,54 +657,14 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
         preempt_enable_notrace();
  }
  
-
  /**
- * rcu_dereference_raw - fetch an RCU-protected pointer
+ * rcu_assign_pointer() - assign to RCU-protected pointer
+ * @p: pointer to assign to
+ * @v: value to assign (publish)
   *
- * The caller must be within some flavor of RCU read-side critical
- * section, or must be otherwise preventing the pointer from changing,
- * for example, by holding an appropriate lock.  This pointer may later
- * be safely dereferenced.  It is the caller's responsibility to have
- * done the right thing, as this primitive does no checking of any kind.
- *
- * Inserts memory barriers on architectures that require them
- * (currently only the Alpha), and, more importantly, documents
- * exactly which pointers are protected by RCU.
- */
-#define rcu_dereference_raw(p) ({ \
-                               typeof(p) _________p1 = ACCESS_ONCE(p); \
-                               smp_read_barrier_depends(); \
-                               (_________p1); \
-                               })
-
-/**
- * rcu_dereference - fetch an RCU-protected pointer, checking for RCU
- *
- * Makes rcu_dereference_check() do the dirty work.
- */
-#define rcu_dereference(p) \
-       rcu_dereference_check(p, rcu_read_lock_held())
-
-/**
- * rcu_dereference_bh - fetch an RCU-protected pointer, checking for RCU-bh
- *
- * Makes rcu_dereference_check() do the dirty work.
- */
-#define rcu_dereference_bh(p) \
-               rcu_dereference_check(p, rcu_read_lock_bh_held() || irqs_disabled())
-
-/**
- * rcu_dereference_sched - fetch RCU-protected pointer, checking for RCU-sched
- *
- * Makes rcu_dereference_check() do the dirty work.
- */
-#define rcu_dereference_sched(p) \
-               rcu_dereference_check(p, rcu_read_lock_sched_held())
-
-/**
- * rcu_assign_pointer - assign (publicize) a pointer to a newly
- * initialized structure that will be dereferenced by RCU read-side
- * critical sections.  Returns the value assigned.
+ * Assigns the specified value to the specified RCU-protected
+ * pointer, ensuring that any concurrent RCU readers will see
+ * any prior initialization.  Returns the value assigned.
   *
   * Inserts memory barriers on architectures that require them
   * (pretty much all of them other than x86), and also prevents
@@ -476,14 +673,17 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
   * call documents which pointers will be dereferenced by RCU read-side
   * code.
   */
-
  #define rcu_assign_pointer(p, v) \
-       ({ \
-               if (!__builtin_constant_p(v) || \
-                   ((v) != NULL)) \
-                       smp_wmb(); \
-               (p) = (v); \
-       })
+       __rcu_assign_pointer((p), (v), __rcu)
+
+/**
+ * RCU_INIT_POINTER() - initialize an RCU protected pointer
+ *
+ * Initialize an RCU-protected pointer in such a way to avoid RCU-lockdep
+ * splats.
+ */
+#define RCU_INIT_POINTER(p, v) \
+               p = (typeof(*v) __force __rcu *)(v)
  
  /* Infrastructure to implement the synchronize_() primitives. */
  
@@ -494,26 +694,37 @@ struct rcu_synchronize {
  
  extern void wakeme_after_rcu(struct rcu_head  *head);
  
+#ifdef CONFIG_PREEMPT_RCU
+
  /**
- * call_rcu - Queue an RCU callback for invocation after a grace period.
+ * call_rcu() - Queue an RCU callback for invocation after a grace period.
   * @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
+ * @func: actual callback function to be invoked after the grace period
   *
- * The update function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed.  RCU read-side critical
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed.  However, the callback function
+ * might well execute concurrently with RCU read-side critical sections
+ * that started after call_rcu() was invoked.  RCU read-side critical
   * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
   * and may be nested.
   */
  extern void call_rcu(struct rcu_head *head,
                               void (*func)(struct rcu_head *head));
  
+#else /* #ifdef CONFIG_PREEMPT_RCU */
+
+/* In classic RCU, call_rcu() is just call_rcu_sched(). */
+#define        call_rcu        call_rcu_sched
+
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
  /**
- * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
+ * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
   * @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
+ * @func: actual callback function to be invoked after the grace period
   *
- * The update function will be invoked some time after a full grace
+ * The callback function will be invoked some time after a full grace
   * period elapses, in other words after all currently executing RCU
   * read-side critical sections have completed. call_rcu_bh() assumes
   * that the read-side critical sections end on completion of a softirq
@@ -566,37 +777,4 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
  }
  #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
  
-#ifndef CONFIG_PROVE_RCU
-#define __do_rcu_dereference_check(c) do { } while (0)
-#endif /* #ifdef CONFIG_PROVE_RCU */
-
-#define __rcu_dereference_index_check(p, c) \
-       ({ \
-               typeof(p) _________p1 = ACCESS_ONCE(p); \
-               __do_rcu_dereference_check(c); \
-               smp_read_barrier_depends(); \
-               (_________p1); \
-       })
-
-/**
- * rcu_dereference_index_check() - rcu_dereference for indices with debug checking
- * @p: The pointer to read, prior to dereferencing
- * @c: The conditions under which the dereference will take place
- *
- * Similar to rcu_dereference_check(), but omits the sparse checking.
- * This allows rcu_dereference_index_check() to be used on integers,
- * which can then be used as array indices.  Attempting to use
- * rcu_dereference_check() on an integer will give compiler warnings
- * because the sparse address-space mechanism relies on dereferencing
- * the RCU-protected pointer.  Dereferencing integers is not something
- * that even gcc will put up with.
- *
- * Note that this function does not implicitly check for RCU read-side
- * critical sections.  If this function gains lots of uses, it might
- * make sense to provide versions for each flavor of RCU, but it does
- * not make sense as of early 2010.
- */
-#define rcu_dereference_index_check(p, c) \
-       __rcu_dereference_index_check((p), (c))
-
  #endif /* __LINUX_RCUPDATE_H */
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h

index e2e893144a8450848cf50f8672368aa79c0a0001..13877cb93a6000043f11a6704f2d90b0cc04552d 100644 (file)
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -27,103 +27,101 @@
  
  #include <linux/cache.h>
  
-void rcu_sched_qs(int cpu);
-void rcu_bh_qs(int cpu);
-static inline void rcu_note_context_switch(int cpu)
-{
-       rcu_sched_qs(cpu);
-}
+#define rcu_init_sched()       do { } while (0)
  
-#define __rcu_read_lock()      preempt_disable()
-#define __rcu_read_unlock()    preempt_enable()
-#define __rcu_read_lock_bh()   local_bh_disable()
-#define __rcu_read_unlock_bh() local_bh_enable()
-#define call_rcu_sched         call_rcu
+#ifdef CONFIG_TINY_RCU
  
-#define rcu_init_sched()       do { } while (0)
-extern void rcu_check_callbacks(int cpu, int user);
+static inline void synchronize_rcu_expedited(void)
+{
+       synchronize_sched();    /* Only one CPU, so pretty fast anyway!!! */
+}
  
-static inline int rcu_needs_cpu(int cpu)
+static inline void rcu_barrier(void)
  {
-       return 0;
+       rcu_barrier_sched();  /* Only one CPU, so only one list of callbacks! */
  }
  
-/*
- * Return the number of grace periods.
- */
-static inline long rcu_batches_completed(void)
+#else /* #ifdef CONFIG_TINY_RCU */
+
+void rcu_barrier(void);
+void synchronize_rcu_expedited(void);
+
+#endif /* #else #ifdef CONFIG_TINY_RCU */
+
+static inline void synchronize_rcu_bh(void)
  {
-       return 0;
+       synchronize_sched();
  }
  
-/*
- * Return the number of bottom-half grace periods.
- */
-static inline long rcu_batches_completed_bh(void)
+static inline void synchronize_rcu_bh_expedited(void)
  {
-       return 0;
+       synchronize_sched();
  }
  
-static inline void rcu_force_quiescent_state(void)
+#ifdef CONFIG_TINY_RCU
+
+static inline void rcu_preempt_note_context_switch(void)
  {
  }
  
-static inline void rcu_bh_force_quiescent_state(void)
+static inline void exit_rcu(void)
  {
  }
  
-static inline void rcu_sched_force_quiescent_state(void)
+static inline int rcu_needs_cpu(int cpu)
  {
+       return 0;
  }
  
-extern void synchronize_sched(void);
+#else /* #ifdef CONFIG_TINY_RCU */
+
+void rcu_preempt_note_context_switch(void);
+extern void exit_rcu(void);
+int rcu_preempt_needs_cpu(void);
  
-static inline void synchronize_rcu(void)
+static inline int rcu_needs_cpu(int cpu)
  {
-       synchronize_sched();
+       return rcu_preempt_needs_cpu();
  }
  
-static inline void synchronize_rcu_bh(void)
+#endif /* #else #ifdef CONFIG_TINY_RCU */
+
+static inline void rcu_note_context_switch(int cpu)
  {
-       synchronize_sched();
+       rcu_sched_qs(cpu);
+       rcu_preempt_note_context_switch();
  }
  
-static inline void synchronize_rcu_expedited(void)
+/*
+ * Return the number of grace periods.
+ */
+static inline long rcu_batches_completed(void)
  {
-       synchronize_sched();
+       return 0;
  }
  
-static inline void synchronize_rcu_bh_expedited(void)
+/*
+ * Return the number of bottom-half grace periods.
+ */
+static inline long rcu_batches_completed_bh(void)
  {
-       synchronize_sched();
+       return 0;
  }
  
-struct notifier_block;
-
-#ifdef CONFIG_NO_HZ
-
-extern void rcu_enter_nohz(void);
-extern void rcu_exit_nohz(void);
-
-#else /* #ifdef CONFIG_NO_HZ */
-
-static inline void rcu_enter_nohz(void)
+static inline void rcu_force_quiescent_state(void)
  {
  }
  
-static inline void rcu_exit_nohz(void)
+static inline void rcu_bh_force_quiescent_state(void)
  {
  }
  
-#endif /* #else #ifdef CONFIG_NO_HZ */
-
-static inline void exit_rcu(void)
+static inline void rcu_sched_force_quiescent_state(void)
  {
  }
  
-static inline int rcu_preempt_depth(void)
+static inline void rcu_cpu_stall_reset(void)
  {
-       return 0;
  }
  
  #ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h

index c0ed1c056f290701def0e0116b6aad72ada30c91..95518e6287946177e0eceb5cbf201ebfcaf0e072 100644 (file)
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -30,64 +30,23 @@
  #ifndef __LINUX_RCUTREE_H
  #define __LINUX_RCUTREE_H
  
-struct notifier_block;
-
-extern void rcu_sched_qs(int cpu);
-extern void rcu_bh_qs(int cpu);
  extern void rcu_note_context_switch(int cpu);
  extern int rcu_needs_cpu(int cpu);
+extern void rcu_cpu_stall_reset(void);
  
  #ifdef CONFIG_TREE_PREEMPT_RCU
  
-extern void __rcu_read_lock(void);
-extern void __rcu_read_unlock(void);
-extern void synchronize_rcu(void);
  extern void exit_rcu(void);
  
-/*
- * Defined as macro as it is a very low level header
- * included from areas that don't even know about current
- */
-#define rcu_preempt_depth() (current->rcu_read_lock_nesting)
-
  #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
  
-static inline void __rcu_read_lock(void)
-{
-       preempt_disable();
-}
-
-static inline void __rcu_read_unlock(void)
-{
-       preempt_enable();
-}
-
-#define synchronize_rcu synchronize_sched
-
  static inline void exit_rcu(void)
  {
  }
  
-static inline int rcu_preempt_depth(void)
-{
-       return 0;
-}
-
  #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
  
-static inline void __rcu_read_lock_bh(void)
-{
-       local_bh_disable();
-}
-static inline void __rcu_read_unlock_bh(void)
-{
-       local_bh_enable();
-}
-
-extern void call_rcu_sched(struct rcu_head *head,
-                          void (*func)(struct rcu_head *rcu));
  extern void synchronize_rcu_bh(void);
-extern void synchronize_sched(void);
  extern void synchronize_rcu_expedited(void);
  
  static inline void synchronize_rcu_bh_expedited(void)
@@ -95,7 +54,7 @@ static inline void synchronize_rcu_bh_expedited(void)
         synchronize_sched_expedited();
  }
  
-extern void rcu_check_callbacks(int cpu, int user);
+extern void rcu_barrier(void);
  
  extern long rcu_batches_completed(void);
  extern long rcu_batches_completed_bh(void);
@@ -104,18 +63,6 @@ extern void rcu_force_quiescent_state(void);
  extern void rcu_bh_force_quiescent_state(void);
  extern void rcu_sched_force_quiescent_state(void);
  
-#ifdef CONFIG_NO_HZ
-void rcu_enter_nohz(void);
-void rcu_exit_nohz(void);
-#else /* CONFIG_NO_HZ */
-static inline void rcu_enter_nohz(void)
-{
-}
-static inline void rcu_exit_nohz(void)
-{
-}
-#endif /* CONFIG_NO_HZ */
-
  /* A context switch is a grace period for RCU-sched and RCU-bh. */
  static inline int rcu_blocking_is_gp(void)
  {
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 2cca9a92f5e59cd80eb212b28c1ff74729e40b81..0383601a927c48fa6ccba55de8ebf347573ef8ea 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1161,6 +1161,13 @@ struct sched_rt_entity {
  
  struct rcu_node;
  
+enum perf_event_task_context {
+       perf_invalid_context = -1,
+       perf_hw_context = 0,
+       perf_sw_context,
+       perf_nr_task_contexts,
+};
+
  struct task_struct {
         volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
         void *stack;
@@ -1203,11 +1210,13 @@ struct task_struct {
         unsigned int policy;
         cpumask_t cpus_allowed;
  
-#ifdef CONFIG_TREE_PREEMPT_RCU
+#ifdef CONFIG_PREEMPT_RCU
         int rcu_read_lock_nesting;
         char rcu_read_unlock_special;
-       struct rcu_node *rcu_blocked_node;
         struct list_head rcu_node_entry;
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
+#ifdef CONFIG_TREE_PREEMPT_RCU
+       struct rcu_node *rcu_blocked_node;
  #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
  
  #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -1289,9 +1298,9 @@ struct task_struct {
         struct list_head cpu_timers[3];
  
  /* process credentials */
-       const struct cred *real_cred;   /* objective and real subjective task
+       const struct cred __rcu *real_cred; /* objective and real subjective task
                                          * credentials (COW) */
-       const struct cred *cred;        /* effective (overridable) subjective task
+       const struct cred __rcu *cred;  /* effective (overridable) subjective task
                                          * credentials (COW) */
         struct mutex cred_guard_mutex;  /* guard against foreign influences on
                                          * credential calculations
@@ -1419,7 +1428,7 @@ struct task_struct {
  #endif
  #ifdef CONFIG_CGROUPS
         /* Control Group info protected by css_set_lock */
-       struct css_set *cgroups;
+       struct css_set __rcu *cgroups;
         /* cg_list protected by css_set_lock and tsk->alloc_lock */
         struct list_head cg_list;
  #endif
@@ -1432,7 +1441,7 @@ struct task_struct {
         struct futex_pi_state *pi_state_cache;
  #endif
  #ifdef CONFIG_PERF_EVENTS
-       struct perf_event_context *perf_event_ctxp;
+       struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
         struct mutex perf_event_mutex;
         struct list_head perf_event_list;
  #endif
@@ -1740,7 +1749,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
  #define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
  #define used_math() tsk_used_math(current)
  
-#ifdef CONFIG_TREE_PREEMPT_RCU
+#ifdef CONFIG_PREEMPT_RCU
  
  #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
  #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
@@ -1749,7 +1758,9 @@ static inline void rcu_copy_process(struct task_struct *p)
  {
         p->rcu_read_lock_nesting = 0;
         p->rcu_read_unlock_special = 0;
+#ifdef CONFIG_TREE_PREEMPT_RCU
         p->rcu_blocked_node = NULL;
+#endif
         INIT_LIST_HEAD(&p->rcu_node_entry);
  }
  
diff --git a/include/linux/security.h b/include/linux/security.h

index a22219afff092952bbe276cb9da1d0509ddf196c..b8246a8df7d2dc2ecc864ac192d694b369dd12d6 100644 (file)
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -74,7 +74,7 @@ extern int cap_file_mmap(struct file *file, unsigned long reqprot,
  extern int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags);
  extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
                           unsigned long arg4, unsigned long arg5);
-extern int cap_task_setscheduler(struct task_struct *p, int policy, struct sched_param *lp);
+extern int cap_task_setscheduler(struct task_struct *p);
  extern int cap_task_setioprio(struct task_struct *p, int ioprio);
  extern int cap_task_setnice(struct task_struct *p, int nice);
  extern int cap_syslog(int type, bool from_file);
@@ -959,6 +959,12 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
   *     Sets the new child socket's sid to the openreq sid.
   * @inet_conn_established:
   *     Sets the connection's peersid to the secmark on skb.
+ * @secmark_relabel_packet:
+ *     check if the process should be allowed to relabel packets to the given secid
+ * @security_secmark_refcount_inc
+ *     tells the LSM to increment the number of secmark labeling rules loaded
+ * @security_secmark_refcount_dec
+ *     tells the LSM to decrement the number of secmark labeling rules loaded
   * @req_classify_flow:
   *     Sets the flow's sid to the openreq sid.
   * @tun_dev_create:
@@ -1279,9 +1285,13 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
   *     Return 0 if permission is granted.
   *
   * @secid_to_secctx:
- *     Convert secid to security context.
+ *     Convert secid to security context.  If secdata is NULL the length of
+ *     the result will be returned in seclen, but no secdata will be returned.
+ *     This does mean that the length could change between calls to check the
+ *     length and the next call which actually allocates and returns the secdata.
   *     @secid contains the security ID.
   *     @secdata contains the pointer that stores the converted security context.
+ *     @seclen pointer which contains the length of the data
   * @secctx_to_secid:
   *     Convert security context to secid.
   *     @secid contains the pointer to the generated security ID.
@@ -1501,8 +1511,7 @@ struct security_operations {
         int (*task_getioprio) (struct task_struct *p);
         int (*task_setrlimit) (struct task_struct *p, unsigned int resource,
                         struct rlimit *new_rlim);
-       int (*task_setscheduler) (struct task_struct *p, int policy,
-                                 struct sched_param *lp);
+       int (*task_setscheduler) (struct task_struct *p);
         int (*task_getscheduler) (struct task_struct *p);
         int (*task_movememory) (struct task_struct *p);
         int (*task_kill) (struct task_struct *p,
@@ -1594,6 +1603,9 @@ struct security_operations {
                                   struct request_sock *req);
         void (*inet_csk_clone) (struct sock *newsk, const struct request_sock *req);
         void (*inet_conn_established) (struct sock *sk, struct sk_buff *skb);
+       int (*secmark_relabel_packet) (u32 secid);
+       void (*secmark_refcount_inc) (void);
+       void (*secmark_refcount_dec) (void);
         void (*req_classify_flow) (const struct request_sock *req, struct flowi *fl);
         int (*tun_dev_create)(void);
         void (*tun_dev_post_create)(struct sock *sk);
@@ -1752,8 +1764,7 @@ int security_task_setioprio(struct task_struct *p, int ioprio);
  int security_task_getioprio(struct task_struct *p);
  int security_task_setrlimit(struct task_struct *p, unsigned int resource,
                 struct rlimit *new_rlim);
-int security_task_setscheduler(struct task_struct *p,
-                               int policy, struct sched_param *lp);
+int security_task_setscheduler(struct task_struct *p);
  int security_task_getscheduler(struct task_struct *p);
  int security_task_movememory(struct task_struct *p);
  int security_task_kill(struct task_struct *p, struct siginfo *info,
@@ -2320,11 +2331,9 @@ static inline int security_task_setrlimit(struct task_struct *p,
         return 0;
  }
  
-static inline int security_task_setscheduler(struct task_struct *p,
-                                            int policy,
-                                            struct sched_param *lp)
+static inline int security_task_setscheduler(struct task_struct *p)
  {
-       return cap_task_setscheduler(p, policy, lp);
+       return cap_task_setscheduler(p);
  }
  
  static inline int security_task_getscheduler(struct task_struct *p)
@@ -2551,6 +2560,9 @@ void security_inet_csk_clone(struct sock *newsk,
                         const struct request_sock *req);
  void security_inet_conn_established(struct sock *sk,
                         struct sk_buff *skb);
+int security_secmark_relabel_packet(u32 secid);
+void security_secmark_refcount_inc(void);
+void security_secmark_refcount_dec(void);
  int security_tun_dev_create(void);
  void security_tun_dev_post_create(struct sock *sk);
  int security_tun_dev_attach(struct sock *sk);
@@ -2705,6 +2717,19 @@ static inline void security_inet_conn_established(struct sock *sk,
  {
  }
  
+static inline int security_secmark_relabel_packet(u32 secid)
+{
+       return 0;
+}
+
+static inline void security_secmark_refcount_inc(void)
+{
+}
+
+static inline void security_secmark_refcount_dec(void)
+{
+}
+
  static inline int security_tun_dev_create(void)
  {
         return 0;
diff --git a/include/linux/selinux.h b/include/linux/selinux.h

index 82e0f26a12996a1bcce6efddd07e734313162c19..44f4596126904f7d0357654c999ed2ae13258c82 100644 (file)
--- a/include/linux/selinux.h
+++ b/include/linux/selinux.h
@@ -20,75 +20,12 @@ struct kern_ipc_perm;
  
  #ifdef CONFIG_SECURITY_SELINUX
  
-/**
- *     selinux_string_to_sid - map a security context string to a security ID
- *     @str: the security context string to be mapped
- *     @sid: ID value returned via this.
- *
- *     Returns 0 if successful, with the SID stored in sid.  A value
- *     of zero for sid indicates no SID could be determined (but no error
- *     occurred).
- */
-int selinux_string_to_sid(char *str, u32 *sid);
-
-/**
- *     selinux_secmark_relabel_packet_permission - secmark permission check
- *     @sid: SECMARK ID value to be applied to network packet
- *
- *     Returns 0 if the current task is allowed to set the SECMARK label of
- *     packets with the supplied security ID.  Note that it is implicit that
- *     the packet is always being relabeled from the default unlabeled value,
- *     and that the access control decision is made in the AVC.
- */
-int selinux_secmark_relabel_packet_permission(u32 sid);
-
-/**
- *     selinux_secmark_refcount_inc - increments the secmark use counter
- *
- *     SELinux keeps track of the current SECMARK targets in use so it knows
- *     when to apply SECMARK label access checks to network packets.  This
- *     function incements this reference count to indicate that a new SECMARK
- *     target has been configured.
- */
-void selinux_secmark_refcount_inc(void);
-
-/**
- *     selinux_secmark_refcount_dec - decrements the secmark use counter
- *
- *     SELinux keeps track of the current SECMARK targets in use so it knows
- *     when to apply SECMARK label access checks to network packets.  This
- *     function decements this reference count to indicate that one of the
- *     existing SECMARK targets has been removed/flushed.
- */
-void selinux_secmark_refcount_dec(void);
-
  /**
   * selinux_is_enabled - is SELinux enabled?
   */
  bool selinux_is_enabled(void);
  #else
  
-static inline int selinux_string_to_sid(const char *str, u32 *sid)
-{
-       *sid = 0;
-       return 0;
-}
-
-static inline int selinux_secmark_relabel_packet_permission(u32 sid)
-{
-       return 0;
-}
-
-static inline void selinux_secmark_refcount_inc(void)
-{
-       return;
-}
-
-static inline void selinux_secmark_refcount_dec(void)
-{
-       return;
-}
-
  static inline bool selinux_is_enabled(void)
  {
         return false;
diff --git a/include/linux/srcu.h b/include/linux/srcu.h

index 4d5d2f546dbff11ee6a4abb6ad2cc53770d45aeb..58971e891f489950102d41f75bea118a22604f23 100644 (file)
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -108,19 +108,43 @@ static inline int srcu_read_lock_held(struct srcu_struct *sp)
  #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
  
  /**
- * srcu_dereference - fetch SRCU-protected pointer with checking
+ * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing
+ * @p: the pointer to fetch and protect for later dereferencing
+ * @sp: pointer to the srcu_struct, which is used to check that we
+ *     really are in an SRCU read-side critical section.
+ * @c: condition to check for update-side use
   *
- * Makes rcu_dereference_check() do the dirty work.
+ * If PROVE_RCU is enabled, invoking this outside of an RCU read-side
+ * critical section will result in an RCU-lockdep splat, unless @c evaluates
+ * to 1.  The @c argument will normally be a logical expression containing
+ * lockdep_is_held() calls.
   */
-#define srcu_dereference(p, sp) \
-               rcu_dereference_check(p, srcu_read_lock_held(sp))
+#define srcu_dereference_check(p, sp, c) \
+       __rcu_dereference_check((p), srcu_read_lock_held(sp) || (c), __rcu)
+
+/**
+ * srcu_dereference - fetch SRCU-protected pointer for later dereferencing
+ * @p: the pointer to fetch and protect for later dereferencing
+ * @sp: pointer to the srcu_struct, which is used to check that we
+ *     really are in an SRCU read-side critical section.
+ *
+ * Makes rcu_dereference_check() do the dirty work.  If PROVE_RCU
+ * is enabled, invoking this outside of an RCU read-side critical
+ * section will result in an RCU-lockdep splat.
+ */
+#define srcu_dereference(p, sp) srcu_dereference_check((p), (sp), 0)
  
  /**
   * srcu_read_lock - register a new reader for an SRCU-protected structure.
   * @sp: srcu_struct in which to register the new reader.
   *
   * Enter an SRCU read-side critical section.  Note that SRCU read-side
- * critical sections may be nested.
+ * critical sections may be nested.  However, it is illegal to
+ * call anything that waits on an SRCU grace period for the same
+ * srcu_struct, whether directly or indirectly.  Please note that
+ * one way to indirectly wait on an SRCU grace period is to acquire
+ * a mutex that is held elsewhere while calling synchronize_srcu() or
+ * synchronize_srcu_expedited().
   */
  static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp)
  {
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h

index 6b524a0d02e42b14419c5ae75133360a1f4874a5..1808960c50595908935455ca90a80fc02e421ef0 100644 (file)
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -126,8 +126,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
  
  #else   /* CONFIG_STOP_MACHINE && CONFIG_SMP */
  
-static inline int stop_machine(int (*fn)(void *), void *data,
-                              const struct cpumask *cpus)
+static inline int __stop_machine(int (*fn)(void *), void *data,
+                                const struct cpumask *cpus)
  {
         int ret;
         local_irq_disable();
@@ -136,5 +136,11 @@ static inline int stop_machine(int (*fn)(void *), void *data,
         return ret;
  }
  
+static inline int stop_machine(int (*fn)(void *), void *data,
+                              const struct cpumask *cpus)
+{
+       return __stop_machine(fn, data, cpus);
+}
+
  #endif /* CONFIG_STOP_MACHINE && CONFIG_SMP */
  #endif /* _LINUX_STOP_MACHINE */
diff --git a/include/linux/sunrpc/auth_gss.h b/include/linux/sunrpc/auth_gss.h

index 671538d25bc15b623155f2b7b7fd269ce394b5d6..8eee9dbbfe7aaddbdb5aaebfbaf5ad82669f846d 100644 (file)
--- a/include/linux/sunrpc/auth_gss.h
+++ b/include/linux/sunrpc/auth_gss.h
@@ -69,7 +69,7 @@ struct gss_cl_ctx {
         enum rpc_gss_proc       gc_proc;
         u32                     gc_seq;
         spinlock_t              gc_seq_lock;
-       struct gss_ctx          *gc_gss_ctx;
+       struct gss_ctx __rcu    *gc_gss_ctx;
         struct xdr_netobj       gc_wire_ctx;
         u32                     gc_win;
         unsigned long           gc_expiry;
@@ -80,7 +80,7 @@ struct gss_upcall_msg;
  struct gss_cred {
         struct rpc_cred         gc_base;
         enum rpc_gss_svc        gc_service;
-       struct gss_cl_ctx       *gc_ctx;
+       struct gss_cl_ctx __rcu *gc_ctx;
         struct gss_upcall_msg   *gc_upcall;
         unsigned long           gc_upcall_timestamp;
         unsigned char           gc_machine_cred : 1;
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h

index 103d1b61aacba635bb7c81cef9e32b70fe220bb7..a4a90b6726ce6129b43174609fb3e35a2bd088ae 100644 (file)
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -17,6 +17,7 @@
  #include <linux/errno.h>
  #include <linux/types.h>
  #include <linux/rcupdate.h>
+#include <linux/jump_label.h>
  
  struct module;
  struct tracepoint;
@@ -145,7 +146,9 @@ static inline void tracepoint_update_probe_range(struct tracepoint *begin,
         extern struct tracepoint __tracepoint_##name;                   \
         static inline void trace_##name(proto)                          \
         {                                                               \
-               if (unlikely(__tracepoint_##name.state))                \
+               JUMP_LABEL(&__tracepoint_##name.state, do_trace);       \
+               return;                                                 \
+do_trace:                                                              \
                         __DO_TRACE(&__tracepoint_##name,                \
                                 TP_PROTO(data_proto),                   \
                                 TP_ARGS(data_args));                    \
diff --git a/include/linux/types.h b/include/linux/types.h

index 01a082f56ef423065adb66f621b11db2bd1d88f1..357dbc19606f8920f0b060ad04fb1d4fb2dfca00 100644 (file)
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -121,7 +121,15 @@ typedef            __u64           u_int64_t;
  typedef                __s64           int64_t;
  #endif
  
-/* this is a special 64bit data type that is 8-byte aligned */
+/*
+ * aligned_u64 should be used in defining kernel<->userspace ABIs to avoid
+ * common 32/64-bit compat problems.
+ * 64-bit values align to 4-byte boundaries on x86_32 (and possibly other
+ * architectures) and to 8-byte boundaries on 64-bit architetures.  The new
+ * aligned_64 type enforces 8-byte alignment so that structs containing
+ * aligned_64 values have the same alignment on 32-bit and 64-bit architectures.
+ * No conversions are necessary between 32-bit user-space and a 64-bit kernel.
+ */
  #define aligned_u64 __u64 __attribute__((aligned(8)))
  #define aligned_be64 __be64 __attribute__((aligned(8)))
  #define aligned_le64 __le64 __attribute__((aligned(8)))
@@ -178,6 +186,11 @@ typedef __u64 __bitwise __be64;
  typedef __u16 __bitwise __sum16;
  typedef __u32 __bitwise __wsum;
  
+/* this is a special 64bit data type that is 8-byte aligned */
+#define __aligned_u64 __u64 __attribute__((aligned(8)))
+#define __aligned_be64 __be64 __attribute__((aligned(8)))
+#define __aligned_le64 __le64 __attribute__((aligned(8)))
+
  #ifdef __KERNEL__
  typedef unsigned __bitwise__ gfp_t;
  typedef unsigned __bitwise__ fmode_t;
diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h

index ef6c24a529e1a23188a27b60a0d6e73e84fb7003..a4dc5b027bd9cc7731b7fab1146508896c0b215c 100644 (file)
--- a/include/net/cls_cgroup.h
+++ b/include/net/cls_cgroup.h
@@ -51,7 +51,8 @@ static inline u32 task_cls_classid(struct task_struct *p)
                 return 0;
  
         rcu_read_lock();
-       id = rcu_dereference(net_cls_subsys_id);
+       id = rcu_dereference_index_check(net_cls_subsys_id,
+                                        rcu_read_lock_held());
         if (id >= 0)
                 classid = container_of(task_subsys_state(p, id),
                                        struct cgroup_cls_state, css)->classid;
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h

index e624dae54fa49b7d713b78641c7f18c2e3cfbae2..caf17db87dbc8983f4aeef52a54c2424a7d21054 100644 (file)
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -75,7 +75,7 @@ struct nf_conntrack_helper;
  /* nf_conn feature for connections that have a helper */
  struct nf_conn_help {
         /* Helper. if any */
-       struct nf_conntrack_helper *helper;
+       struct nf_conntrack_helper __rcu *helper;
  
         union nf_conntrack_help help;
  
diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h

index 0e4cfb694fe70630457af67e1b1bc568f56c9b09..6fa7cbab7d932c6649e9fbd4221615b6b4d0fd8d 100644 (file)
--- a/include/trace/events/irq.h
+++ b/include/trace/events/irq.h
@@ -5,7 +5,9 @@
  #define _TRACE_IRQ_H
  
  #include <linux/tracepoint.h>
-#include <linux/interrupt.h>
+
+struct irqaction;
+struct softirq_action;
  
  #define softirq_name(sirq) { sirq##_SOFTIRQ, #sirq }
  #define show_softirq_name(val)                         \
@@ -93,7 +95,10 @@ DECLARE_EVENT_CLASS(softirq,
         ),
  
         TP_fast_assign(
-               __entry->vec = (int)(h - vec);
+               if (vec)
+                       __entry->vec = (int)(h - vec);
+               else
+                       __entry->vec = (int)(long)h;
         ),
  
         TP_printk("vec=%d [action=%s]", __entry->vec,
@@ -136,6 +141,23 @@ DEFINE_EVENT(softirq, softirq_exit,
         TP_ARGS(h, vec)
  );
  
+/**
+ * softirq_raise - called immediately when a softirq is raised
+ * @h: pointer to struct softirq_action
+ * @vec: pointer to first struct softirq_action in softirq_vec array
+ *
+ * The @h parameter contains a pointer to the softirq vector number which is
+ * raised. @vec is NULL and it means @h includes vector number not
+ * softirq_action. When used in combination with the softirq_entry tracepoint
+ * we can determine the softirq raise latency.
+ */
+DEFINE_EVENT(softirq, softirq_raise,
+
+       TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
+
+       TP_ARGS(h, vec)
+);
+
  #endif /*  _TRACE_IRQ_H */
  
  /* This part must be outside protection */
diff --git a/include/trace/events/napi.h b/include/trace/events/napi.h

index 188deca2f3c7721a1baac60cc07e8d7006442c71..8fe1e93f531dd81a8e549689a1b8e9b551231e9e 100644 (file)
--- a/include/trace/events/napi.h
+++ b/include/trace/events/napi.h
@@ -6,10 +6,31 @@
  
  #include <linux/netdevice.h>
  #include <linux/tracepoint.h>
+#include <linux/ftrace.h>
+
+#define NO_DEV "(no_device)"
+
+TRACE_EVENT(napi_poll,
  
-DECLARE_TRACE(napi_poll,
         TP_PROTO(struct napi_struct *napi),
-       TP_ARGS(napi));
+
+       TP_ARGS(napi),
+
+       TP_STRUCT__entry(
+               __field(        struct napi_struct *,   napi)
+               __string(       dev_name, napi->dev ? napi->dev->name : NO_DEV)
+       ),
+
+       TP_fast_assign(
+               __entry->napi = napi;
+               __assign_str(dev_name, napi->dev ? napi->dev->name : NO_DEV);
+       ),
+
+       TP_printk("napi poll on napi struct %p for device %s",
+               __entry->napi, __get_str(dev_name))
+);
+
+#undef NO_DEV
  
  #endif /* _TRACE_NAPI_H_ */
  
diff --git a/include/trace/events/net.h b/include/trace/events/net.h

new file mode 100644 (file)

index 0000000..5f247f5
--- /dev/null
+++ b/include/trace/events/net.h
@@ -0,0 +1,82 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM net
+
+#if !defined(_TRACE_NET_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NET_H
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/ip.h>
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(net_dev_xmit,
+
+       TP_PROTO(struct sk_buff *skb,
+                int rc),
+
+       TP_ARGS(skb, rc),
+
+       TP_STRUCT__entry(
+               __field(        void *,         skbaddr         )
+               __field(        unsigned int,   len             )
+               __field(        int,            rc              )
+               __string(       name,           skb->dev->name  )
+       ),
+
+       TP_fast_assign(
+               __entry->skbaddr = skb;
+               __entry->len = skb->len;
+               __entry->rc = rc;
+               __assign_str(name, skb->dev->name);
+       ),
+
+       TP_printk("dev=%s skbaddr=%p len=%u rc=%d",
+               __get_str(name), __entry->skbaddr, __entry->len, __entry->rc)
+);
+
+DECLARE_EVENT_CLASS(net_dev_template,
+
+       TP_PROTO(struct sk_buff *skb),
+
+       TP_ARGS(skb),
+
+       TP_STRUCT__entry(
+               __field(        void *,         skbaddr         )
+               __field(        unsigned int,   len             )
+               __string(       name,           skb->dev->name  )
+       ),
+
+       TP_fast_assign(
+               __entry->skbaddr = skb;
+               __entry->len = skb->len;
+               __assign_str(name, skb->dev->name);
+       ),
+
+       TP_printk("dev=%s skbaddr=%p len=%u",
+               __get_str(name), __entry->skbaddr, __entry->len)
+)
+
+DEFINE_EVENT(net_dev_template, net_dev_queue,
+
+       TP_PROTO(struct sk_buff *skb),
+
+       TP_ARGS(skb)
+);
+
+DEFINE_EVENT(net_dev_template, netif_receive_skb,
+
+       TP_PROTO(struct sk_buff *skb),
+
+       TP_ARGS(skb)
+);
+
+DEFINE_EVENT(net_dev_template, netif_rx,
+
+       TP_PROTO(struct sk_buff *skb),
+
+       TP_ARGS(skb)
+);
+#endif /* _TRACE_NET_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/power.h b/include/trace/events/power.h

index 35a2a6e7bf1e74992b8b83b242c4a507fed4246f..286784d69b8f480343244d8046327a5a7d9883d9 100644 (file)
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -10,12 +10,17 @@
  #ifndef _TRACE_POWER_ENUM_
  #define _TRACE_POWER_ENUM_
  enum {
-       POWER_NONE = 0,
-       POWER_CSTATE = 1,
-       POWER_PSTATE = 2,
+       POWER_NONE      = 0,
+       POWER_CSTATE    = 1,    /* C-State */
+       POWER_PSTATE    = 2,    /* Fequency change or DVFS */
+       POWER_SSTATE    = 3,    /* Suspend */
  };
  #endif
  
+/*
+ * The power events are used for cpuidle & suspend (power_start, power_end)
+ *  and for cpufreq (power_frequency)
+ */
  DECLARE_EVENT_CLASS(power,
  
         TP_PROTO(unsigned int type, unsigned int state, unsigned int cpu_id),
@@ -70,6 +75,85 @@ TRACE_EVENT(power_end,
  
  );
  
+/*
+ * The clock events are used for clock enable/disable and for
+ *  clock rate change
+ */
+DECLARE_EVENT_CLASS(clock,
+
+       TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),
+
+       TP_ARGS(name, state, cpu_id),
+
+       TP_STRUCT__entry(
+               __string(       name,           name            )
+               __field(        u64,            state           )
+               __field(        u64,            cpu_id          )
+       ),
+
+       TP_fast_assign(
+               __assign_str(name, name);
+               __entry->state = state;
+               __entry->cpu_id = cpu_id;
+       ),
+
+       TP_printk("%s state=%lu cpu_id=%lu", __get_str(name),
+               (unsigned long)__entry->state, (unsigned long)__entry->cpu_id)
+);
+
+DEFINE_EVENT(clock, clock_enable,
+
+       TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),
+
+       TP_ARGS(name, state, cpu_id)
+);
+
+DEFINE_EVENT(clock, clock_disable,
+
+       TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),
+
+       TP_ARGS(name, state, cpu_id)
+);
+
+DEFINE_EVENT(clock, clock_set_rate,
+
+       TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),
+
+       TP_ARGS(name, state, cpu_id)
+);
+
+/*
+ * The power domain events are used for power domains transitions
+ */
+DECLARE_EVENT_CLASS(power_domain,
+
+       TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),
+
+       TP_ARGS(name, state, cpu_id),
+
+       TP_STRUCT__entry(
+               __string(       name,           name            )
+               __field(        u64,            state           )
+               __field(        u64,            cpu_id          )
+       ),
+
+       TP_fast_assign(
+               __assign_str(name, name);
+               __entry->state = state;
+               __entry->cpu_id = cpu_id;
+),
+
+       TP_printk("%s state=%lu cpu_id=%lu", __get_str(name),
+               (unsigned long)__entry->state, (unsigned long)__entry->cpu_id)
+);
+
+DEFINE_EVENT(power_domain, power_domain_target,
+
+       TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id),
+
+       TP_ARGS(name, state, cpu_id)
+);
+
  #endif /* _TRACE_POWER_H */
  
  /* This part must be outside protection */
diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h

index 4b2be6dc76f091647eb30f30a40a49218def8682..75ce9d500d8e3c62dbfffbc0acafca13d11d90aa 100644 (file)
--- a/include/trace/events/skb.h
+++ b/include/trace/events/skb.h
@@ -35,6 +35,23 @@ TRACE_EVENT(kfree_skb,
                 __entry->skbaddr, __entry->protocol, __entry->location)
  );
  
+TRACE_EVENT(consume_skb,
+
+       TP_PROTO(struct sk_buff *skb),
+
+       TP_ARGS(skb),
+
+       TP_STRUCT__entry(
+               __field(        void *, skbaddr )
+       ),
+
+       TP_fast_assign(
+               __entry->skbaddr = skb;
+       ),
+
+       TP_printk("skbaddr=%p", __entry->skbaddr)
+);
+
  TRACE_EVENT(skb_copy_datagram_iovec,
  
         TP_PROTO(const struct sk_buff *skb, int len),
diff --git a/init/Kconfig b/init/Kconfig

index 2de5b1cbadd9e47138f879d23cc4d2d5066d32d7..7b920aafa98a3bc5e8873bb192e8c0db59535556 100644 (file)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -21,6 +21,13 @@ config CONSTRUCTORS
         depends on !UML
         default y
  
+config HAVE_IRQ_WORK
+       bool
+
+config IRQ_WORK
+       bool
+       depends on HAVE_IRQ_WORK
+
  menu "General setup"
  
  config EXPERIMENTAL
@@ -340,6 +347,7 @@ choice
  
  config TREE_RCU
         bool "Tree-based hierarchical RCU"
+       depends on !PREEMPT && SMP
         help
           This option selects the RCU implementation that is
           designed for very large SMP system with hundreds or
@@ -347,7 +355,7 @@ config TREE_RCU
           smaller systems.
  
  config TREE_PREEMPT_RCU
-       bool "Preemptable tree-based hierarchical RCU"
+       bool "Preemptible tree-based hierarchical RCU"
         depends on PREEMPT
         help
           This option selects the RCU implementation that is
@@ -365,8 +373,22 @@ config TINY_RCU
           is not required.  This option greatly reduces the
           memory footprint of RCU.
  
+config TINY_PREEMPT_RCU
+       bool "Preemptible UP-only small-memory-footprint RCU"
+       depends on !SMP && PREEMPT
+       help
+         This option selects the RCU implementation that is designed
+         for real-time UP systems.  This option greatly reduces the
+         memory footprint of RCU.
+
  endchoice
  
+config PREEMPT_RCU
+       def_bool ( TREE_PREEMPT_RCU || TINY_PREEMPT_RCU )
+       help
+         This option enables preemptible-RCU code that is common between
+         the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations.
+
  config RCU_TRACE
         bool "Enable tracing for RCU"
         depends on TREE_RCU || TREE_PREEMPT_RCU
@@ -387,9 +409,12 @@ config RCU_FANOUT
         help
           This option controls the fanout of hierarchical implementations
           of RCU, allowing RCU to work efficiently on machines with
-         large numbers of CPUs.  This value must be at least the cube
-         root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
-         systems and up to 262,144 for 64-bit systems.
+         large numbers of CPUs.  This value must be at least the fourth
+         root of NR_CPUS, which allows NR_CPUS to be insanely large.
+         The default value of RCU_FANOUT should be used for production
+         systems, but if you are stress-testing the RCU implementation
+         itself, small RCU_FANOUT values allow you to test large-system
+         code paths on small(er) systems.
  
           Select a specific number if testing RCU itself.
           Take the default if unsure.
@@ -987,6 +1012,7 @@ config PERF_EVENTS
         default y if (PROFILING || PERF_COUNTERS)
         depends on HAVE_PERF_EVENTS
         select ANON_INODES
+       select IRQ_WORK
         help
           Enable kernel support for various performance events provided
           by software and hardware.
diff --git a/kernel/Makefile b/kernel/Makefile

index 0b72d1a74be07c25b99a8da670ec4cfb0963cf77..e2c9d52cfe9e52b31eb6a430195d63da22e8aa30 100644 (file)
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o \
             kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
             hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
             notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
-           async.o range.o
+           async.o range.o jump_label.o
  obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
  obj-y += groups.o
  
@@ -23,6 +23,7 @@ CFLAGS_REMOVE_rtmutex-debug.o = -pg
  CFLAGS_REMOVE_cgroup-debug.o = -pg
  CFLAGS_REMOVE_sched_clock.o = -pg
  CFLAGS_REMOVE_perf_event.o = -pg
+CFLAGS_REMOVE_irq_work.o = -pg
  endif
  
  obj-$(CONFIG_FREEZER) += freezer.o
@@ -86,6 +87,7 @@ obj-$(CONFIG_TREE_RCU) += rcutree.o
  obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
  obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
  obj-$(CONFIG_TINY_RCU) += rcutiny.o
+obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
  obj-$(CONFIG_RELAY) += relay.o
  obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
  obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -100,6 +102,7 @@ obj-$(CONFIG_TRACING) += trace/
  obj-$(CONFIG_X86_DS) += trace/
  obj-$(CONFIG_RING_BUFFER) += trace/
  obj-$(CONFIG_SMP) += sched_cpupri.o
+obj-$(CONFIG_IRQ_WORK) += irq_work.o
  obj-$(CONFIG_PERF_EVENTS) += perf_event.o
  obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
  obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c

index c9483d8f6140ed6cb4e06fa6139e2aeb907b3d47..291ba3d04beab08f0edc4a0239bae956941e50d9 100644 (file)
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -138,7 +138,7 @@ struct css_id {
          * is called after synchronize_rcu(). But for safe use, css_is_removed()
          * css_tryget() should be used for avoiding race.
          */
-       struct cgroup_subsys_state *css;
+       struct cgroup_subsys_state __rcu *css;
         /*
          * ID of this css.
          */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c

index b23c0979bbe7212a748a9aac70697e92ee54f448..51b143e2a07a49603d9aa462728f6a45032c01d9 100644 (file)
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1397,7 +1397,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
         if (tsk->flags & PF_THREAD_BOUND)
                 return -EINVAL;
  
-       ret = security_task_setscheduler(tsk, 0, NULL);
+       ret = security_task_setscheduler(tsk);
         if (ret)
                 return ret;
         if (threadgroup) {
@@ -1405,7 +1405,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
  
                 rcu_read_lock();
                 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-                       ret = security_task_setscheduler(c, 0, NULL);
+                       ret = security_task_setscheduler(c);
                         if (ret) {
                                 rcu_read_unlock();
                                 return ret;
diff --git a/kernel/exit.c b/kernel/exit.c

index 03120229db2802929065a210930e41c7fa701ba0..e2bdf37f9fdea71a15acb3523ec20b63a6aa42d3 100644 (file)
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -149,9 +149,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
  {
         struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
  
-#ifdef CONFIG_PERF_EVENTS
-       WARN_ON_ONCE(tsk->perf_event_ctxp);
-#endif
+       perf_event_delayed_put(tsk);
         trace_sched_process_free(tsk);
         put_task_struct(tsk);
  }
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c

index 1decafbb6b1a28197b768021cc987e230d352b5b..72206cf5c6cf854898d889a6a645e44febdd526f 100644 (file)
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -931,6 +931,7 @@ static inline int
  remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
  {
         if (hrtimer_is_queued(timer)) {
+               unsigned long state;
                 int reprogram;
  
                 /*
@@ -944,8 +945,13 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
                 debug_deactivate(timer);
                 timer_stats_hrtimer_clear_start_info(timer);
                 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
-               __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
-                                reprogram);
+               /*
+                * We must preserve the CALLBACK state flag here,
+                * otherwise we could move the timer base in
+                * switch_hrtimer_base.
+                */
+               state = timer->state & HRTIMER_STATE_CALLBACK;
+               __remove_hrtimer(timer, base, state, reprogram);
                 return 1;
         }
         return 0;
@@ -1231,6 +1237,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
                 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
                 enqueue_hrtimer(timer, base);
         }
+
+       WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
+
         timer->state &= ~HRTIMER_STATE_CALLBACK;
  }
  
diff --git a/kernel/hung_task.c b/kernel/hung_task.c

index 0c642d51aac2d8ab82c782a786eb13b90c0985c1..53ead174da2f0a7c7849df4ef81b60a96fe7c4f6 100644 (file)
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -98,7 +98,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
         printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
                         " disables this message.\n");
         sched_show_task(t);
-       __debug_show_held_locks(t);
+       debug_show_held_locks(t);
  
         touch_nmi_watchdog();
  
@@ -111,7 +111,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
   * periodically exit the critical section and enter a new one.
   *
   * For preemptible RCU it is sufficient to call rcu_read_unlock in order
- * exit the grace period. For classic RCU, a reschedule is required.
+ * to exit the grace period. For classic RCU, a reschedule is required.
   */
  static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
  {
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c

index c7c2aed9e2dcc2e52669e7d3880be0666c496bff..2c9120f0afca9872cc4edba57d4e5cfe9311788f 100644 (file)
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -113,12 +113,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
   */
  static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type)
  {
-       struct perf_event_context *ctx = bp->ctx;
+       struct task_struct *tsk = bp->hw.bp_target;
         struct perf_event *iter;
         int count = 0;
  
         list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
-               if (iter->ctx == ctx && find_slot_idx(iter) == type)
+               if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type)
                         count += hw_breakpoint_weight(iter);
         }
  
@@ -134,7 +134,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
                     enum bp_type_idx type)
  {
         int cpu = bp->cpu;
-       struct task_struct *tsk = bp->ctx->task;
+       struct task_struct *tsk = bp->hw.bp_target;
  
         if (cpu >= 0) {
                 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
@@ -213,7 +213,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
                int weight)
  {
         int cpu = bp->cpu;
-       struct task_struct *tsk = bp->ctx->task;
+       struct task_struct *tsk = bp->hw.bp_target;
  
         /* Pinned counter cpu profiling */
         if (!tsk) {
@@ -433,8 +433,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
                             perf_overflow_handler_t triggered,
                             struct task_struct *tsk)
  {
-       return perf_event_create_kernel_counter(attr, -1, task_pid_vnr(tsk),
-                                               triggered);
+       return perf_event_create_kernel_counter(attr, -1, tsk, triggered);
  }
  EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
  
@@ -516,7 +515,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
         get_online_cpus();
         for_each_online_cpu(cpu) {
                 pevent = per_cpu_ptr(cpu_events, cpu);
-               bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
+               bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered);
  
                 *pevent = bp;
  
@@ -566,6 +565,61 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
         .priority = 0x7fffffff
  };
  
+static void bp_perf_event_destroy(struct perf_event *event)
+{
+       release_bp_slot(event);
+}
+
+static int hw_breakpoint_event_init(struct perf_event *bp)
+{
+       int err;
+
+       if (bp->attr.type != PERF_TYPE_BREAKPOINT)
+               return -ENOENT;
+
+       err = register_perf_hw_breakpoint(bp);
+       if (err)
+               return err;
+
+       bp->destroy = bp_perf_event_destroy;
+
+       return 0;
+}
+
+static int hw_breakpoint_add(struct perf_event *bp, int flags)
+{
+       if (!(flags & PERF_EF_START))
+               bp->hw.state = PERF_HES_STOPPED;
+
+       return arch_install_hw_breakpoint(bp);
+}
+
+static void hw_breakpoint_del(struct perf_event *bp, int flags)
+{
+       arch_uninstall_hw_breakpoint(bp);
+}
+
+static void hw_breakpoint_start(struct perf_event *bp, int flags)
+{
+       bp->hw.state = 0;
+}
+
+static void hw_breakpoint_stop(struct perf_event *bp, int flags)
+{
+       bp->hw.state = PERF_HES_STOPPED;
+}
+
+static struct pmu perf_breakpoint = {
+       .task_ctx_nr    = perf_sw_context, /* could eventually get its own */
+
+       .event_init     = hw_breakpoint_event_init,
+       .add            = hw_breakpoint_add,
+       .del            = hw_breakpoint_del,
+       .start          = hw_breakpoint_start,
+       .stop           = hw_breakpoint_stop,
+       .read           = hw_breakpoint_pmu_read,
+};
+
  static int __init init_hw_breakpoint(void)
  {
         unsigned int **task_bp_pinned;
@@ -587,6 +641,8 @@ static int __init init_hw_breakpoint(void)
  
         constraints_initialized = 1;
  
+       perf_pmu_register(&perf_breakpoint);
+
         return register_die_notifier(&hw_breakpoint_exceptions_nb);
  
   err_alloc:
@@ -602,8 +658,3 @@ static int __init init_hw_breakpoint(void)
  core_initcall(init_hw_breakpoint);
  
  
-struct pmu perf_ops_bp = {
-       .enable         = arch_install_hw_breakpoint,
-       .disable        = arch_uninstall_hw_breakpoint,
-       .read           = hw_breakpoint_pmu_read,
-};
diff --git a/kernel/irq_work.c b/kernel/irq_work.c

new file mode 100644 (file)

index 0000000..f16763f
--- /dev/null
+++ b/kernel/irq_work.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * Provides a framework for enqueueing and running callbacks from hardirq
+ * context. The enqueueing is NMI-safe.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/irq_work.h>
+#include <linux/hardirq.h>
+
+/*
+ * An entry can be in one of four states:
+ *
+ * free             NULL, 0 -> {claimed}       : free to be used
+ * claimed   NULL, 3 -> {pending}       : claimed to be enqueued
+ * pending   next, 3 -> {busy}          : queued, pending callback
+ * busy      NULL, 2 -> {free, claimed} : callback in progress, can be claimed
+ *
+ * We use the lower two bits of the next pointer to keep PENDING and BUSY
+ * flags.
+ */
+
+#define IRQ_WORK_PENDING       1UL
+#define IRQ_WORK_BUSY          2UL
+#define IRQ_WORK_FLAGS         3UL
+
+static inline bool irq_work_is_set(struct irq_work *entry, int flags)
+{
+       return (unsigned long)entry->next & flags;
+}
+
+static inline struct irq_work *irq_work_next(struct irq_work *entry)
+{
+       unsigned long next = (unsigned long)entry->next;
+       next &= ~IRQ_WORK_FLAGS;
+       return (struct irq_work *)next;
+}
+
+static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
+{
+       unsigned long next = (unsigned long)entry;
+       next |= flags;
+       return (struct irq_work *)next;
+}
+
+static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
+
+/*
+ * Claim the entry so that no one else will poke at it.
+ */
+static bool irq_work_claim(struct irq_work *entry)
+{
+       struct irq_work *next, *nflags;
+
+       do {
+               next = entry->next;
+               if ((unsigned long)next & IRQ_WORK_PENDING)
+                       return false;
+               nflags = next_flags(next, IRQ_WORK_FLAGS);
+       } while (cmpxchg(&entry->next, next, nflags) != next);
+
+       return true;
+}
+
+
+void __weak arch_irq_work_raise(void)
+{
+       /*
+        * Lame architectures will get the timer tick callback
+        */
+}
+
+/*
+ * Queue the entry and raise the IPI if needed.
+ */
+static void __irq_work_queue(struct irq_work *entry)
+{
+       struct irq_work **head, *next;
+
+       head = &get_cpu_var(irq_work_list);
+
+       do {
+               next = *head;
+               /* Can assign non-atomic because we keep the flags set. */
+               entry->next = next_flags(next, IRQ_WORK_FLAGS);
+       } while (cmpxchg(head, next, entry) != next);
+
+       /* The list was empty, raise self-interrupt to start processing. */
+       if (!irq_work_next(entry))
+               arch_irq_work_raise();
+
+       put_cpu_var(irq_work_list);
+}
+
+/*
+ * Enqueue the irq_work @entry, returns true on success, failure when the
+ * @entry was already enqueued by someone else.
+ *
+ * Can be re-enqueued while the callback is still in progress.
+ */
+bool irq_work_queue(struct irq_work *entry)
+{
+       if (!irq_work_claim(entry)) {
+               /*
+                * Already enqueued, can't do!
+                */
+               return false;
+       }
+
+       __irq_work_queue(entry);
+       return true;
+}
+EXPORT_SYMBOL_GPL(irq_work_queue);
+
+/*
+ * Run the irq_work entries on this cpu. Requires to be ran from hardirq
+ * context with local IRQs disabled.
+ */
+void irq_work_run(void)
+{
+       struct irq_work *list, **head;
+
+       head = &__get_cpu_var(irq_work_list);
+       if (*head == NULL)
+               return;
+
+       BUG_ON(!in_irq());
+       BUG_ON(!irqs_disabled());
+
+       list = xchg(head, NULL);
+       while (list != NULL) {
+               struct irq_work *entry = list;
+
+               list = irq_work_next(list);
+
+               /*
+                * Clear the PENDING bit, after this point the @entry
+                * can be re-used.
+                */
+               entry->next = next_flags(NULL, IRQ_WORK_BUSY);
+               entry->func(entry);
+               /*
+                * Clear the BUSY bit and return to the free state if
+                * no-one else claimed it meanwhile.
+                */
+               cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL);
+       }
+}
+EXPORT_SYMBOL_GPL(irq_work_run);
+
+/*
+ * Synchronize against the irq_work @entry, ensures the entry is not
+ * currently in use.
+ */
+void irq_work_sync(struct irq_work *entry)
+{
+       WARN_ON_ONCE(irqs_disabled());
+
+       while (irq_work_is_set(entry, IRQ_WORK_BUSY))
+               cpu_relax();
+}
+EXPORT_SYMBOL_GPL(irq_work_sync);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c

new file mode 100644 (file)

index 0000000..7be868b
--- /dev/null
+++ b/kernel/jump_label.c
@@ -0,0 +1,429 @@
+/*
+ * jump label support
+ *
+ * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
+ *
+ */
+#include <linux/jump_label.h>
+#include <linux/memory.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+#include <linux/err.h>
+
+#ifdef HAVE_JUMP_LABEL
+
+#define JUMP_LABEL_HASH_BITS 6
+#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS)
+static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE];
+
+/* mutex to protect coming/going of the the jump_label table */
+static DEFINE_MUTEX(jump_label_mutex);
+
+struct jump_label_entry {
+       struct hlist_node hlist;
+       struct jump_entry *table;
+       int nr_entries;
+       /* hang modules off here */
+       struct hlist_head modules;
+       unsigned long key;
+};
+
+struct jump_label_module_entry {
+       struct hlist_node hlist;
+       struct jump_entry *table;
+       int nr_entries;
+       struct module *mod;
+};
+
+static int jump_label_cmp(const void *a, const void *b)
+{
+       const struct jump_entry *jea = a;
+       const struct jump_entry *jeb = b;
+
+       if (jea->key < jeb->key)
+               return -1;
+
+       if (jea->key > jeb->key)
+               return 1;
+
+       return 0;
+}
+
+static void
+sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop)
+{
+       unsigned long size;
+
+       size = (((unsigned long)stop - (unsigned long)start)
+                                       / sizeof(struct jump_entry));
+       sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
+}
+
+static struct jump_label_entry *get_jump_label_entry(jump_label_t key)
+{
+       struct hlist_head *head;
+       struct hlist_node *node;
+       struct jump_label_entry *e;
+       u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
+
+       head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
+       hlist_for_each_entry(e, node, head, hlist) {
+               if (key == e->key)
+                       return e;
+       }
+       return NULL;
+}
+
+static struct jump_label_entry *
+add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table)
+{
+       struct hlist_head *head;
+       struct jump_label_entry *e;
+       u32 hash;
+
+       e = get_jump_label_entry(key);
+       if (e)
+               return ERR_PTR(-EEXIST);
+
+       e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL);
+       if (!e)
+               return ERR_PTR(-ENOMEM);
+
+       hash = jhash((void *)&key, sizeof(jump_label_t), 0);
+       head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
+       e->key = key;
+       e->table = table;
+       e->nr_entries = nr_entries;
+       INIT_HLIST_HEAD(&(e->modules));
+       hlist_add_head(&e->hlist, head);
+       return e;
+}
+
+static int
+build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop)
+{
+       struct jump_entry *iter, *iter_begin;
+       struct jump_label_entry *entry;
+       int count;
+
+       sort_jump_label_entries(start, stop);
+       iter = start;
+       while (iter < stop) {
+               entry = get_jump_label_entry(iter->key);
+               if (!entry) {
+                       iter_begin = iter;
+                       count = 0;
+                       while ((iter < stop) &&
+                               (iter->key == iter_begin->key)) {
+                               iter++;
+                               count++;
+                       }
+                       entry = add_jump_label_entry(iter_begin->key,
+                                                       count, iter_begin);
+                       if (IS_ERR(entry))
+                               return PTR_ERR(entry);
+                } else {
+                       WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n");
+                       return -1;
+               }
+       }
+       return 0;
+}
+
+/***
+ * jump_label_update - update jump label text
+ * @key -  key value associated with a a jump label
+ * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE
+ *
+ * Will enable/disable the jump for jump label @key, depending on the
+ * value of @type.
+ *
+ */
+
+void jump_label_update(unsigned long key, enum jump_label_type type)
+{
+       struct jump_entry *iter;
+       struct jump_label_entry *entry;
+       struct hlist_node *module_node;
+       struct jump_label_module_entry *e_module;
+       int count;
+
+       mutex_lock(&jump_label_mutex);
+       entry = get_jump_label_entry((jump_label_t)key);
+       if (entry) {
+               count = entry->nr_entries;
+               iter = entry->table;
+               while (count--) {
+                       if (kernel_text_address(iter->code))
+                               arch_jump_label_transform(iter, type);
+                       iter++;
+               }
+               /* eanble/disable jump labels in modules */
+               hlist_for_each_entry(e_module, module_node, &(entry->modules),
+                                                       hlist) {
+                       count = e_module->nr_entries;
+                       iter = e_module->table;
+                       while (count--) {
+                               if (kernel_text_address(iter->code))
+                                       arch_jump_label_transform(iter, type);
+                               iter++;
+                       }
+               }
+       }
+       mutex_unlock(&jump_label_mutex);
+}
+
+static int addr_conflict(struct jump_entry *entry, void *start, void *end)
+{
+       if (entry->code <= (unsigned long)end &&
+               entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start)
+               return 1;
+
+       return 0;
+}
+
+#ifdef CONFIG_MODULES
+
+static int module_conflict(void *start, void *end)
+{
+       struct hlist_head *head;
+       struct hlist_node *node, *node_next, *module_node, *module_node_next;
+       struct jump_label_entry *e;
+       struct jump_label_module_entry *e_module;
+       struct jump_entry *iter;
+       int i, count;
+       int conflict = 0;
+
+       for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
+               head = &jump_label_table[i];
+               hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
+                       hlist_for_each_entry_safe(e_module, module_node,
+                                                       module_node_next,
+                                                       &(e->modules), hlist) {
+                               count = e_module->nr_entries;
+                               iter = e_module->table;
+                               while (count--) {
+                                       if (addr_conflict(iter, start, end)) {
+                                               conflict = 1;
+                                               goto out;
+                                       }
+                                       iter++;
+                               }
+                       }
+               }
+       }
+out:
+       return conflict;
+}
+
+#endif
+
+/***
+ * jump_label_text_reserved - check if addr range is reserved
+ * @start: start text addr
+ * @end: end text addr
+ *
+ * checks if the text addr located between @start and @end
+ * overlaps with any of the jump label patch addresses. Code
+ * that wants to modify kernel text should first verify that
+ * it does not overlap with any of the jump label addresses.
+ *
+ * returns 1 if there is an overlap, 0 otherwise
+ */
+int jump_label_text_reserved(void *start, void *end)
+{
+       struct jump_entry *iter;
+       struct jump_entry *iter_start = __start___jump_table;
+       struct jump_entry *iter_stop = __start___jump_table;
+       int conflict = 0;
+
+       mutex_lock(&jump_label_mutex);
+       iter = iter_start;
+       while (iter < iter_stop) {
+               if (addr_conflict(iter, start, end)) {
+                       conflict = 1;
+                       goto out;
+               }
+               iter++;
+       }
+
+       /* now check modules */
+#ifdef CONFIG_MODULES
+       conflict = module_conflict(start, end);
+#endif
+out:
+       mutex_unlock(&jump_label_mutex);
+       return conflict;
+}
+
+static __init int init_jump_label(void)
+{
+       int ret;
+       struct jump_entry *iter_start = __start___jump_table;
+       struct jump_entry *iter_stop = __stop___jump_table;
+       struct jump_entry *iter;
+
+       mutex_lock(&jump_label_mutex);
+       ret = build_jump_label_hashtable(__start___jump_table,
+                                        __stop___jump_table);
+       iter = iter_start;
+       while (iter < iter_stop) {
+               arch_jump_label_text_poke_early(iter->code);
+               iter++;
+       }
+       mutex_unlock(&jump_label_mutex);
+       return ret;
+}
+early_initcall(init_jump_label);
+
+#ifdef CONFIG_MODULES
+
+static struct jump_label_module_entry *
+add_jump_label_module_entry(struct jump_label_entry *entry,
+                           struct jump_entry *iter_begin,
+                           int count, struct module *mod)
+{
+       struct jump_label_module_entry *e;
+
+       e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL);
+       if (!e)
+               return ERR_PTR(-ENOMEM);
+       e->mod = mod;
+       e->nr_entries = count;
+       e->table = iter_begin;
+       hlist_add_head(&e->hlist, &entry->modules);
+       return e;
+}
+
+static int add_jump_label_module(struct module *mod)
+{
+       struct jump_entry *iter, *iter_begin;
+       struct jump_label_entry *entry;
+       struct jump_label_module_entry *module_entry;
+       int count;
+
+       /* if the module doesn't have jump label entries, just return */
+       if (!mod->num_jump_entries)
+               return 0;
+
+       sort_jump_label_entries(mod->jump_entries,
+                               mod->jump_entries + mod->num_jump_entries);
+       iter = mod->jump_entries;
+       while (iter < mod->jump_entries + mod->num_jump_entries) {
+               entry = get_jump_label_entry(iter->key);
+               iter_begin = iter;
+               count = 0;
+               while ((iter < mod->jump_entries + mod->num_jump_entries) &&
+                       (iter->key == iter_begin->key)) {
+                               iter++;
+                               count++;
+               }
+               if (!entry) {
+                       entry = add_jump_label_entry(iter_begin->key, 0, NULL);
+                       if (IS_ERR(entry))
+                               return PTR_ERR(entry);
+               }
+               module_entry = add_jump_label_module_entry(entry, iter_begin,
+                                                          count, mod);
+               if (IS_ERR(module_entry))
+                       return PTR_ERR(module_entry);
+       }
+       return 0;
+}
+
+static void remove_jump_label_module(struct module *mod)
+{
+       struct hlist_head *head;
+       struct hlist_node *node, *node_next, *module_node, *module_node_next;
+       struct jump_label_entry *e;
+       struct jump_label_module_entry *e_module;
+       int i;
+
+       /* if the module doesn't have jump label entries, just return */
+       if (!mod->num_jump_entries)
+               return;
+
+       for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
+               head = &jump_label_table[i];
+               hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
+                       hlist_for_each_entry_safe(e_module, module_node,
+                                                 module_node_next,
+                                                 &(e->modules), hlist) {
+                               if (e_module->mod == mod) {
+                                       hlist_del(&e_module->hlist);
+                                       kfree(e_module);
+                               }
+                       }
+                       if (hlist_empty(&e->modules) && (e->nr_entries == 0)) {
+                               hlist_del(&e->hlist);
+                               kfree(e);
+                       }
+               }
+       }
+}
+
+static int
+jump_label_module_notify(struct notifier_block *self, unsigned long val,
+                        void *data)
+{
+       struct module *mod = data;
+       int ret = 0;
+
+       switch (val) {
+       case MODULE_STATE_COMING:
+               mutex_lock(&jump_label_mutex);
+               ret = add_jump_label_module(mod);
+               if (ret)
+                       remove_jump_label_module(mod);
+               mutex_unlock(&jump_label_mutex);
+               break;
+       case MODULE_STATE_GOING:
+               mutex_lock(&jump_label_mutex);
+               remove_jump_label_module(mod);
+               mutex_unlock(&jump_label_mutex);
+               break;
+       }
+       return ret;
+}
+
+/***
+ * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
+ * @mod: module to patch
+ *
+ * Allow for run-time selection of the optimal nops. Before the module
+ * loads patch these with arch_get_jump_label_nop(), which is specified by
+ * the arch specific jump label code.
+ */
+void jump_label_apply_nops(struct module *mod)
+{
+       struct jump_entry *iter;
+
+       /* if the module doesn't have jump label entries, just return */
+       if (!mod->num_jump_entries)
+               return;
+
+       iter = mod->jump_entries;
+       while (iter < mod->jump_entries + mod->num_jump_entries) {
+               arch_jump_label_text_poke_early(iter->code);
+               iter++;
+       }
+}
+
+struct notifier_block jump_label_module_nb = {
+       .notifier_call = jump_label_module_notify,
+       .priority = 0,
+};
+
+static __init int init_jump_label_module(void)
+{
+       return register_module_notifier(&jump_label_module_nb);
+}
+early_initcall(init_jump_label_module);
+
+#endif /* CONFIG_MODULES */
+
+#endif
diff --git a/kernel/kprobes.c b/kernel/kprobes.c

index 282035f3ae964e1e288f352c370be8edd11d3078..ec4210c6501e1b549d18c2a89fa3cbe8725da35b 100644 (file)
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -47,6 +47,7 @@
  #include <linux/memory.h>
  #include <linux/ftrace.h>
  #include <linux/cpu.h>
+#include <linux/jump_label.h>
  
  #include <asm-generic/sections.h>
  #include <asm/cacheflush.h>
@@ -399,7 +400,7 @@ static inline int kprobe_optready(struct kprobe *p)
   * Return an optimized kprobe whose optimizing code replaces
   * instructions including addr (exclude breakpoint).
   */
-struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
+static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
  {
         int i;
         struct kprobe *p = NULL;
@@ -831,6 +832,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
  
  void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
                          struct hlist_head **head, unsigned long *flags)
+__acquires(hlist_lock)
  {
         unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
         spinlock_t *hlist_lock;
@@ -842,6 +844,7 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
  
  static void __kprobes kretprobe_table_lock(unsigned long hash,
         unsigned long *flags)
+__acquires(hlist_lock)
  {
         spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
         spin_lock_irqsave(hlist_lock, *flags);
@@ -849,6 +852,7 @@ static void __kprobes kretprobe_table_lock(unsigned long hash,
  
  void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
         unsigned long *flags)
+__releases(hlist_lock)
  {
         unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
         spinlock_t *hlist_lock;
@@ -857,7 +861,9 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
         spin_unlock_irqrestore(hlist_lock, *flags);
  }
  
-void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags)
+static void __kprobes kretprobe_table_unlock(unsigned long hash,
+       unsigned long *flags)
+__releases(hlist_lock)
  {
         spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
         spin_unlock_irqrestore(hlist_lock, *flags);
@@ -1141,7 +1147,8 @@ int __kprobes register_kprobe(struct kprobe *p)
         preempt_disable();
         if (!kernel_text_address((unsigned long) p->addr) ||
             in_kprobes_functions((unsigned long) p->addr) ||
-           ftrace_text_reserved(p->addr, p->addr)) {
+           ftrace_text_reserved(p->addr, p->addr) ||
+           jump_label_text_reserved(p->addr, p->addr)) {
                 preempt_enable();
                 return -EINVAL;
         }
@@ -1339,18 +1346,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
         if (num <= 0)
                 return -EINVAL;
         for (i = 0; i < num; i++) {
-               unsigned long addr;
+               unsigned long addr, offset;
                 jp = jps[i];
                 addr = arch_deref_entry_point(jp->entry);
  
-               if (!kernel_text_address(addr))
-                       ret = -EINVAL;
-               else {
-                       /* Todo: Verify probepoint is a function entry point */
+               /* Verify probepoint is a function entry point */
+               if (kallsyms_lookup_size_offset(addr, NULL, &offset) &&
+                   offset == 0) {
                         jp->kp.pre_handler = setjmp_pre_handler;
                         jp->kp.break_handler = longjmp_break_handler;
                         ret = register_kprobe(&jp->kp);
-               }
+               } else
+                       ret = -EINVAL;
+
                 if (ret < 0) {
                         if (i > 0)
                                 unregister_jprobes(jps, i);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c

index f2852a5102327f74c39531a517e488c0d1e673e5..42ba65dff7d99e4eeadfe6175d70c2f9665ad431 100644 (file)
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -639,6 +639,16 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
         }
  #endif
  
+       if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
+               debug_locks_off();
+               printk(KERN_ERR
+                       "BUG: looking up invalid subclass: %u\n", subclass);
+               printk(KERN_ERR
+                       "turning off the locking correctness validator.\n");
+               dump_stack();
+               return NULL;
+       }
+
         /*
          * Static locks do not have their class-keys yet - for them the key
          * is the lock object itself:
@@ -774,7 +784,9 @@ out_unlock_set:
         raw_local_irq_restore(flags);
  
         if (!subclass || force)
-               lock->class_cache = class;
+               lock->class_cache[0] = class;
+       else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
+               lock->class_cache[subclass] = class;
  
         if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
                 return NULL;
@@ -2679,7 +2691,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
  void lockdep_init_map(struct lockdep_map *lock, const char *name,
                       struct lock_class_key *key, int subclass)
  {
-       lock->class_cache = NULL;
+       int i;
+
+       for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
+               lock->class_cache[i] = NULL;
+
  #ifdef CONFIG_LOCK_STAT
         lock->cpu = raw_smp_processor_id();
  #endif
@@ -2739,21 +2755,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
         if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
                 return 0;
  
-       if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
-               debug_locks_off();
-               printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
-               printk("turning off the locking correctness validator.\n");
-               dump_stack();
-               return 0;
-       }
-
         if (lock->key == &__lockdep_no_validate__)
                 check = 1;
  
-       if (!subclass)
-               class = lock->class_cache;
+       if (subclass < NR_LOCKDEP_CACHING_CLASSES)
+               class = lock->class_cache[subclass];
         /*
-        * Not cached yet or subclass?
+        * Not cached?
          */
         if (unlikely(!class)) {
                 class = register_lock_class(lock, subclass, 0);
@@ -2918,7 +2926,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
                 return 1;
  
         if (hlock->references) {
-               struct lock_class *class = lock->class_cache;
+               struct lock_class *class = lock->class_cache[0];
  
                 if (!class)
                         class = look_up_lock_class(lock, 0);
@@ -3559,7 +3567,12 @@ void lockdep_reset_lock(struct lockdep_map *lock)
                 if (list_empty(head))
                         continue;
                 list_for_each_entry_safe(class, next, head, hash_entry) {
-                       if (unlikely(class == lock->class_cache)) {
+                       int match = 0;
+
+                       for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
+                               match |= class == lock->class_cache[j];
+
+                       if (unlikely(match)) {
                                 if (debug_locks_off_graph_unlock())
                                         WARN_ON(1);
                                 goto out_restore;
@@ -3775,7 +3788,7 @@ EXPORT_SYMBOL_GPL(debug_show_all_locks);
   * Careful: only use this function if you are sure that
   * the task cannot run in parallel!
   */
-void __debug_show_held_locks(struct task_struct *task)
+void debug_show_held_locks(struct task_struct *task)
  {
         if (unlikely(!debug_locks)) {
                 printk("INFO: lockdep is turned off.\n");
@@ -3783,12 +3796,6 @@ void __debug_show_held_locks(struct task_struct *task)
         }
         lockdep_print_held_locks(task);
  }
-EXPORT_SYMBOL_GPL(__debug_show_held_locks);
-
-void debug_show_held_locks(struct task_struct *task)
-{
-               __debug_show_held_locks(task);
-}
  EXPORT_SYMBOL_GPL(debug_show_held_locks);
  
  void lockdep_sys_exit(void)
diff --git a/kernel/module.c b/kernel/module.c

index ccd641991842f4990906946f895fd5062a0389c3..2df46301a7a407dcde3435542e38f6944358d7c1 100644 (file)
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -55,6 +55,7 @@
  #include <linux/async.h>
  #include <linux/percpu.h>
  #include <linux/kmemleak.h>
+#include <linux/jump_label.h>
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/module.h>
@@ -2309,6 +2310,11 @@ static void find_module_sections(struct module *mod, struct load_info *info)
                                         sizeof(*mod->tracepoints),
                                         &mod->num_tracepoints);
  #endif
+#ifdef HAVE_JUMP_LABEL
+       mod->jump_entries = section_objs(info, "__jump_table",
+                                       sizeof(*mod->jump_entries),
+                                       &mod->num_jump_entries);
+#endif
  #ifdef CONFIG_EVENT_TRACING
         mod->trace_events = section_objs(info, "_ftrace_events",
                                          sizeof(*mod->trace_events),
diff --git a/kernel/perf_event.c b/kernel/perf_event.c

index b98bed3d8182685ffff2b840c0ff4809b4f65cb8..f309e8014c7853105d1a38bc662f10164dc4d3d1 100644 (file)
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -31,24 +31,18 @@
  #include <linux/kernel_stat.h>
  #include <linux/perf_event.h>
  #include <linux/ftrace_event.h>
-#include <linux/hw_breakpoint.h>
  
  #include <asm/irq_regs.h>
  
-/*
- * Each CPU has a list of per CPU events:
- */
-static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
-
-int perf_max_events __read_mostly = 1;
-static int perf_reserved_percpu __read_mostly;
-static int perf_overcommit __read_mostly = 1;
-
-static atomic_t nr_events __read_mostly;
+atomic_t perf_task_events __read_mostly;
  static atomic_t nr_mmap_events __read_mostly;
  static atomic_t nr_comm_events __read_mostly;
  static atomic_t nr_task_events __read_mostly;
  
+static LIST_HEAD(pmus);
+static DEFINE_MUTEX(pmus_lock);
+static struct srcu_struct pmus_srcu;
+
  /*
   * perf event paranoia level:
   *  -1 - not paranoid at all
@@ -67,36 +61,43 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;
  
  static atomic64_t perf_event_id;
  
-/*
- * Lock for (sysadmin-configurable) event reservations:
- */
-static DEFINE_SPINLOCK(perf_resource_lock);
+void __weak perf_event_print_debug(void)       { }
  
-/*
- * Architecture provided APIs - weak aliases:
- */
-extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
+extern __weak const char *perf_pmu_name(void)
  {
-       return NULL;
+       return "pmu";
  }
  
-void __weak hw_perf_disable(void)              { barrier(); }
-void __weak hw_perf_enable(void)               { barrier(); }
-
-void __weak perf_event_print_debug(void)       { }
-
-static DEFINE_PER_CPU(int, perf_disable_count);
+void perf_pmu_disable(struct pmu *pmu)
+{
+       int *count = this_cpu_ptr(pmu->pmu_disable_count);
+       if (!(*count)++)
+               pmu->pmu_disable(pmu);
+}
  
-void perf_disable(void)
+void perf_pmu_enable(struct pmu *pmu)
  {
-       if (!__get_cpu_var(perf_disable_count)++)
-               hw_perf_disable();
+       int *count = this_cpu_ptr(pmu->pmu_disable_count);
+       if (!--(*count))
+               pmu->pmu_enable(pmu);
  }
  
-void perf_enable(void)
+static DEFINE_PER_CPU(struct list_head, rotation_list);
+
+/*
+ * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
+ * because they're strictly cpu affine and rotate_start is called with IRQs
+ * disabled, while rotate_context is called from IRQ context.
+ */
+static void perf_pmu_rotate_start(struct pmu *pmu)
  {
-       if (!--__get_cpu_var(perf_disable_count))
-               hw_perf_enable();
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+       struct list_head *head = &__get_cpu_var(rotation_list);
+
+       WARN_ON(!irqs_disabled());
+
+       if (list_empty(&cpuctx->rotation_list))
+               list_add(&cpuctx->rotation_list, head);
  }
  
  static void get_ctx(struct perf_event_context *ctx)
@@ -151,13 +152,13 @@ static u64 primary_event_id(struct perf_event *event)
   * the context could get moved to another task.
   */
  static struct perf_event_context *
-perf_lock_task_context(struct task_struct *task, unsigned long *flags)
+perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
  {
         struct perf_event_context *ctx;
  
         rcu_read_lock();
- retry:
-       ctx = rcu_dereference(task->perf_event_ctxp);
+retry:
+       ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
         if (ctx) {
                 /*
                  * If this context is a clone of another, it might
@@ -170,7 +171,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
                  * can't get swapped on us any more.
                  */
                 raw_spin_lock_irqsave(&ctx->lock, *flags);
-               if (ctx != rcu_dereference(task->perf_event_ctxp)) {
+               if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
                         goto retry;
                 }
@@ -189,12 +190,13 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
   * can't get swapped to another task.  This also increments its
   * reference count so that the context can't get freed.
   */
-static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
+static struct perf_event_context *
+perf_pin_task_context(struct task_struct *task, int ctxn)
  {
         struct perf_event_context *ctx;
         unsigned long flags;
  
-       ctx = perf_lock_task_context(task, &flags);
+       ctx = perf_lock_task_context(task, ctxn, &flags);
         if (ctx) {
                 ++ctx->pin_count;
                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -302,6 +304,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
         }
  
         list_add_rcu(&event->event_entry, &ctx->event_list);
+       if (!ctx->nr_events)
+               perf_pmu_rotate_start(ctx->pmu);
         ctx->nr_events++;
         if (event->attr.inherit_stat)
                 ctx->nr_stat++;
@@ -311,7 +315,12 @@ static void perf_group_attach(struct perf_event *event)
  {
         struct perf_event *group_leader = event->group_leader;
  
-       WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP);
+       /*
+        * We can have double attach due to group movement in perf_event_open.
+        */
+       if (event->attach_state & PERF_ATTACH_GROUP)
+               return;
+
         event->attach_state |= PERF_ATTACH_GROUP;
  
         if (group_leader == event)
@@ -408,8 +417,8 @@ event_filter_match(struct perf_event *event)
         return event->cpu == -1 || event->cpu == smp_processor_id();
  }
  
-static void
-event_sched_out(struct perf_event *event,
+static int
+__event_sched_out(struct perf_event *event,
                   struct perf_cpu_context *cpuctx,
                   struct perf_event_context *ctx)
  {
@@ -428,15 +437,14 @@ event_sched_out(struct perf_event *event,
         }
  
         if (event->state != PERF_EVENT_STATE_ACTIVE)
-               return;
+               return 0;
  
         event->state = PERF_EVENT_STATE_INACTIVE;
         if (event->pending_disable) {
                 event->pending_disable = 0;
                 event->state = PERF_EVENT_STATE_OFF;
         }
-       event->tstamp_stopped = ctx->time;
-       event->pmu->disable(event);
+       event->pmu->del(event, 0);
         event->oncpu = -1;
  
         if (!is_software_event(event))
@@ -444,6 +452,19 @@ event_sched_out(struct perf_event *event,
         ctx->nr_active--;
         if (event->attr.exclusive || !cpuctx->active_oncpu)
                 cpuctx->exclusive = 0;
+       return 1;
+}
+
+static void
+event_sched_out(struct perf_event *event,
+                 struct perf_cpu_context *cpuctx,
+                 struct perf_event_context *ctx)
+{
+       int ret;
+
+       ret = __event_sched_out(event, cpuctx, ctx);
+       if (ret)
+               event->tstamp_stopped = ctx->time;
  }
  
  static void
@@ -466,6 +487,12 @@ group_sched_out(struct perf_event *group_event,
                 cpuctx->exclusive = 0;
  }
  
+static inline struct perf_cpu_context *
+__get_cpu_context(struct perf_event_context *ctx)
+{
+       return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
+}
+
  /*
   * Cross CPU call to remove a performance event
   *
@@ -474,9 +501,9 @@ group_sched_out(struct perf_event *group_event,
   */
  static void __perf_event_remove_from_context(void *info)
  {
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
         struct perf_event *event = info;
         struct perf_event_context *ctx = event->ctx;
+       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
  
         /*
          * If this is a task context, we need to check whether it is
@@ -487,27 +514,11 @@ static void __perf_event_remove_from_context(void *info)
                 return;
  
         raw_spin_lock(&ctx->lock);
-       /*
-        * Protect the list operation against NMI by disabling the
-        * events on a global level.
-        */
-       perf_disable();
  
         event_sched_out(event, cpuctx, ctx);
  
         list_del_event(event, ctx);
  
-       if (!ctx->task) {
-               /*
-                * Allow more per task events with respect to the
-                * reservation:
-                */
-               cpuctx->max_pertask =
-                       min(perf_max_events - ctx->nr_events,
-                           perf_max_events - perf_reserved_percpu);
-       }
-
-       perf_enable();
         raw_spin_unlock(&ctx->lock);
  }
  
@@ -572,8 +583,8 @@ retry:
  static void __perf_event_disable(void *info)
  {
         struct perf_event *event = info;
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
         struct perf_event_context *ctx = event->ctx;
+       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
  
         /*
          * If this is a per-task event, need to check whether this
@@ -628,7 +639,7 @@ void perf_event_disable(struct perf_event *event)
                 return;
         }
  
- retry:
+retry:
         task_oncpu_function_call(task, __perf_event_disable, event);
  
         raw_spin_lock_irq(&ctx->lock);
@@ -653,7 +664,7 @@ void perf_event_disable(struct perf_event *event)
  }
  
  static int
-event_sched_in(struct perf_event *event,
+__event_sched_in(struct perf_event *event,
                  struct perf_cpu_context *cpuctx,
                  struct perf_event_context *ctx)
  {
@@ -667,14 +678,12 @@ event_sched_in(struct perf_event *event,
          */
         smp_wmb();
  
-       if (event->pmu->enable(event)) {
+       if (event->pmu->add(event, PERF_EF_START)) {
                 event->state = PERF_EVENT_STATE_INACTIVE;
                 event->oncpu = -1;
                 return -EAGAIN;
         }
  
-       event->tstamp_running += ctx->time - event->tstamp_stopped;
-
         if (!is_software_event(event))
                 cpuctx->active_oncpu++;
         ctx->nr_active++;
@@ -685,28 +694,56 @@ event_sched_in(struct perf_event *event,
         return 0;
  }
  
+static inline int
+event_sched_in(struct perf_event *event,
+                struct perf_cpu_context *cpuctx,
+                struct perf_event_context *ctx)
+{
+       int ret = __event_sched_in(event, cpuctx, ctx);
+       if (ret)
+               return ret;
+       event->tstamp_running += ctx->time - event->tstamp_stopped;
+       return 0;
+}
+
+static void
+group_commit_event_sched_in(struct perf_event *group_event,
+              struct perf_cpu_context *cpuctx,
+              struct perf_event_context *ctx)
+{
+       struct perf_event *event;
+       u64 now = ctx->time;
+
+       group_event->tstamp_running += now - group_event->tstamp_stopped;
+       /*
+        * Schedule in siblings as one group (if any):
+        */
+       list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+               event->tstamp_running += now - event->tstamp_stopped;
+       }
+}
+
  static int
  group_sched_in(struct perf_event *group_event,
                struct perf_cpu_context *cpuctx,
                struct perf_event_context *ctx)
  {
         struct perf_event *event, *partial_group = NULL;
-       const struct pmu *pmu = group_event->pmu;
-       bool txn = false;
+       struct pmu *pmu = group_event->pmu;
  
         if (group_event->state == PERF_EVENT_STATE_OFF)
                 return 0;
  
-       /* Check if group transaction availabe */
-       if (pmu->start_txn)
-               txn = true;
+       pmu->start_txn(pmu);
  
-       if (txn)
-               pmu->start_txn(pmu);
-
-       if (event_sched_in(group_event, cpuctx, ctx)) {
-               if (txn)
-                       pmu->cancel_txn(pmu);
+       /*
+        * use __event_sched_in() to delay updating tstamp_running
+        * until the transaction is committed. In case of failure
+        * we will keep an unmodified tstamp_running which is a
+        * requirement to get correct timing information
+        */
+       if (__event_sched_in(group_event, cpuctx, ctx)) {
+               pmu->cancel_txn(pmu);
                 return -EAGAIN;
         }
  
@@ -714,29 +751,33 @@ group_sched_in(struct perf_event *group_event,
          * Schedule in siblings as one group (if any):
          */
         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
-               if (event_sched_in(event, cpuctx, ctx)) {
+               if (__event_sched_in(event, cpuctx, ctx)) {
                         partial_group = event;
                         goto group_error;
                 }
         }
  
-       if (!txn || !pmu->commit_txn(pmu))
+       if (!pmu->commit_txn(pmu)) {
+               /* commit tstamp_running */
+               group_commit_event_sched_in(group_event, cpuctx, ctx);
                 return 0;
-
+       }
  group_error:
         /*
          * Groups can be scheduled in as one unit only, so undo any
          * partial group before returning:
+        *
+        * use __event_sched_out() to avoid updating tstamp_stopped
+        * because the event never actually ran
          */
         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
                 if (event == partial_group)
                         break;
-               event_sched_out(event, cpuctx, ctx);
+               __event_sched_out(event, cpuctx, ctx);
         }
-       event_sched_out(group_event, cpuctx, ctx);
+       __event_sched_out(group_event, cpuctx, ctx);
  
-       if (txn)
-               pmu->cancel_txn(pmu);
+       pmu->cancel_txn(pmu);
  
         return -EAGAIN;
  }
@@ -789,10 +830,10 @@ static void add_event_to_ctx(struct perf_event *event,
   */
  static void __perf_install_in_context(void *info)
  {
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
         struct perf_event *event = info;
         struct perf_event_context *ctx = event->ctx;
         struct perf_event *leader = event->group_leader;
+       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
         int err;
  
         /*
@@ -812,12 +853,6 @@ static void __perf_install_in_context(void *info)
         ctx->is_active = 1;
         update_context_time(ctx);
  
-       /*
-        * Protect the list operation against NMI by disabling the
-        * events on a global level. NOP for non NMI based events.
-        */
-       perf_disable();
-
         add_event_to_ctx(event, ctx);
  
         if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -855,12 +890,7 @@ static void __perf_install_in_context(void *info)
                 }
         }
  
-       if (!err && !ctx->task && cpuctx->max_pertask)
-               cpuctx->max_pertask--;
-
- unlock:
-       perf_enable();
-
+unlock:
         raw_spin_unlock(&ctx->lock);
  }
  
@@ -883,6 +913,8 @@ perf_install_in_context(struct perf_event_context *ctx,
  {
         struct task_struct *task = ctx->task;
  
+       event->ctx = ctx;
+
         if (!task) {
                 /*
                  * Per cpu events are installed via an smp call and
@@ -931,10 +963,12 @@ static void __perf_event_mark_enabled(struct perf_event *event,
  
         event->state = PERF_EVENT_STATE_INACTIVE;
         event->tstamp_enabled = ctx->time - event->total_time_enabled;
-       list_for_each_entry(sub, &event->sibling_list, group_entry)
-               if (sub->state >= PERF_EVENT_STATE_INACTIVE)
+       list_for_each_entry(sub, &event->sibling_list, group_entry) {
+               if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
                         sub->tstamp_enabled =
                                 ctx->time - sub->total_time_enabled;
+               }
+       }
  }
  
  /*
@@ -943,9 +977,9 @@ static void __perf_event_mark_enabled(struct perf_event *event,
  static void __perf_event_enable(void *info)
  {
         struct perf_event *event = info;
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
         struct perf_event_context *ctx = event->ctx;
         struct perf_event *leader = event->group_leader;
+       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
         int err;
  
         /*
@@ -979,12 +1013,10 @@ static void __perf_event_enable(void *info)
         if (!group_can_go_on(event, cpuctx, 1)) {
                 err = -EEXIST;
         } else {
-               perf_disable();
                 if (event == leader)
                         err = group_sched_in(event, cpuctx, ctx);
                 else
                         err = event_sched_in(event, cpuctx, ctx);
-               perf_enable();
         }
  
         if (err) {
@@ -1000,7 +1032,7 @@ static void __perf_event_enable(void *info)
                 }
         }
  
- unlock:
+unlock:
         raw_spin_unlock(&ctx->lock);
  }
  
@@ -1041,7 +1073,7 @@ void perf_event_enable(struct perf_event *event)
         if (event->state == PERF_EVENT_STATE_ERROR)
                 event->state = PERF_EVENT_STATE_OFF;
  
- retry:
+retry:
         raw_spin_unlock_irq(&ctx->lock);
         task_oncpu_function_call(task, __perf_event_enable, event);
  
@@ -1061,7 +1093,7 @@ void perf_event_enable(struct perf_event *event)
         if (event->state == PERF_EVENT_STATE_OFF)
                 __perf_event_mark_enabled(event, ctx);
  
- out:
+out:
         raw_spin_unlock_irq(&ctx->lock);
  }
  
@@ -1092,26 +1124,26 @@ static void ctx_sched_out(struct perf_event_context *ctx,
         struct perf_event *event;
  
         raw_spin_lock(&ctx->lock);
+       perf_pmu_disable(ctx->pmu);
         ctx->is_active = 0;
         if (likely(!ctx->nr_events))
                 goto out;
         update_context_time(ctx);
  
-       perf_disable();
         if (!ctx->nr_active)
-               goto out_enable;
+               goto out;
  
-       if (event_type & EVENT_PINNED)
+       if (event_type & EVENT_PINNED) {
                 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
                         group_sched_out(event, cpuctx, ctx);
+       }
  
-       if (event_type & EVENT_FLEXIBLE)
+       if (event_type & EVENT_FLEXIBLE) {
                 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
                         group_sched_out(event, cpuctx, ctx);
-
- out_enable:
-       perf_enable();
- out:
+       }
+out:
+       perf_pmu_enable(ctx->pmu);
         raw_spin_unlock(&ctx->lock);
  }
  
@@ -1209,34 +1241,25 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
         }
  }
  
-/*
- * Called from scheduler to remove the events of the current task,
- * with interrupts disabled.
- *
- * We stop each event and update the event value in event->count.
- *
- * This does not protect us against NMI, but disable()
- * sets the disabled bit in the control field of event _before_
- * accessing the event control register. If a NMI hits, then it will
- * not restart the event.
- */
-void perf_event_task_sched_out(struct task_struct *task,
-                                struct task_struct *next)
+void perf_event_context_sched_out(struct task_struct *task, int ctxn,
+                                 struct task_struct *next)
  {
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-       struct perf_event_context *ctx = task->perf_event_ctxp;
+       struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
         struct perf_event_context *next_ctx;
         struct perf_event_context *parent;
+       struct perf_cpu_context *cpuctx;
         int do_switch = 1;
  
-       perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
+       if (likely(!ctx))
+               return;
  
-       if (likely(!ctx || !cpuctx->task_ctx))
+       cpuctx = __get_cpu_context(ctx);
+       if (!cpuctx->task_ctx)
                 return;
  
         rcu_read_lock();
         parent = rcu_dereference(ctx->parent_ctx);
-       next_ctx = next->perf_event_ctxp;
+       next_ctx = next->perf_event_ctxp[ctxn];
         if (parent && next_ctx &&
             rcu_dereference(next_ctx->parent_ctx) == parent) {
                 /*
@@ -1255,8 +1278,8 @@ void perf_event_task_sched_out(struct task_struct *task,
                          * XXX do we need a memory barrier of sorts
                          * wrt to rcu_dereference() of perf_event_ctxp
                          */
-                       task->perf_event_ctxp = next_ctx;
-                       next->perf_event_ctxp = ctx;
+                       task->perf_event_ctxp[ctxn] = next_ctx;
+                       next->perf_event_ctxp[ctxn] = ctx;
                         ctx->task = next;
                         next_ctx->task = task;
                         do_switch = 0;
@@ -1274,10 +1297,35 @@ void perf_event_task_sched_out(struct task_struct *task,
         }
  }
  
+#define for_each_task_context_nr(ctxn)                                 \
+       for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
+
+/*
+ * Called from scheduler to remove the events of the current task,
+ * with interrupts disabled.
+ *
+ * We stop each event and update the event value in event->count.
+ *
+ * This does not protect us against NMI, but disable()
+ * sets the disabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * not restart the event.
+ */
+void __perf_event_task_sched_out(struct task_struct *task,
+                                struct task_struct *next)
+{
+       int ctxn;
+
+       perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
+
+       for_each_task_context_nr(ctxn)
+               perf_event_context_sched_out(task, ctxn, next);
+}
+
  static void task_ctx_sched_out(struct perf_event_context *ctx,
                                enum event_type_t event_type)
  {
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
  
         if (!cpuctx->task_ctx)
                 return;
@@ -1289,14 +1337,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
         cpuctx->task_ctx = NULL;
  }
  
-/*
- * Called with IRQs disabled
- */
-static void __perf_event_task_sched_out(struct perf_event_context *ctx)
-{
-       task_ctx_sched_out(ctx, EVENT_ALL);
-}
-
  /*
   * Called with IRQs disabled
   */
@@ -1350,9 +1390,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
                 if (event->cpu != -1 && event->cpu != smp_processor_id())
                         continue;
  
-               if (group_can_go_on(event, cpuctx, can_add_hw))
+               if (group_can_go_on(event, cpuctx, can_add_hw)) {
                         if (group_sched_in(event, cpuctx, ctx))
                                 can_add_hw = 0;
+               }
         }
  }
  
@@ -1368,8 +1409,6 @@ ctx_sched_in(struct perf_event_context *ctx,
  
         ctx->timestamp = perf_clock();
  
-       perf_disable();
-
         /*
          * First go through the list and put on any pinned groups
          * in order to give them the best chance of going on.
@@ -1381,8 +1420,7 @@ ctx_sched_in(struct perf_event_context *ctx,
         if (event_type & EVENT_FLEXIBLE)
                 ctx_flexible_sched_in(ctx, cpuctx);
  
-       perf_enable();
- out:
+out:
         raw_spin_unlock(&ctx->lock);
  }
  
@@ -1394,43 +1432,28 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
         ctx_sched_in(ctx, cpuctx, event_type);
  }
  
-static void task_ctx_sched_in(struct task_struct *task,
+static void task_ctx_sched_in(struct perf_event_context *ctx,
                               enum event_type_t event_type)
  {
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-       struct perf_event_context *ctx = task->perf_event_ctxp;
+       struct perf_cpu_context *cpuctx;
  
-       if (likely(!ctx))
-               return;
+               cpuctx = __get_cpu_context(ctx);
         if (cpuctx->task_ctx == ctx)
                 return;
+
         ctx_sched_in(ctx, cpuctx, event_type);
         cpuctx->task_ctx = ctx;
  }
-/*
- * Called from scheduler to add the events of the current task
- * with interrupts disabled.
- *
- * We restore the event value and then enable it.
- *
- * This does not protect us against NMI, but enable()
- * sets the enabled bit in the control field of event _before_
- * accessing the event control register. If a NMI hits, then it will
- * keep the event running.
- */
-void perf_event_task_sched_in(struct task_struct *task)
-{
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-       struct perf_event_context *ctx = task->perf_event_ctxp;
  
-       if (likely(!ctx))
-               return;
+void perf_event_context_sched_in(struct perf_event_context *ctx)
+{
+       struct perf_cpu_context *cpuctx;
  
+       cpuctx = __get_cpu_context(ctx);
         if (cpuctx->task_ctx == ctx)
                 return;
  
-       perf_disable();
-
+       perf_pmu_disable(ctx->pmu);
         /*
          * We want to keep the following priority order:
          * cpu pinned (that don't need to move), task pinned,
@@ -1444,7 +1467,37 @@ void perf_event_task_sched_in(struct task_struct *task)
  
         cpuctx->task_ctx = ctx;
  
-       perf_enable();
+       /*
+        * Since these rotations are per-cpu, we need to ensure the
+        * cpu-context we got scheduled on is actually rotating.
+        */
+       perf_pmu_rotate_start(ctx->pmu);
+       perf_pmu_enable(ctx->pmu);
+}
+
+/*
+ * Called from scheduler to add the events of the current task
+ * with interrupts disabled.
+ *
+ * We restore the event value and then enable it.
+ *
+ * This does not protect us against NMI, but enable()
+ * sets the enabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * keep the event running.
+ */
+void __perf_event_task_sched_in(struct task_struct *task)
+{
+       struct perf_event_context *ctx;
+       int ctxn;
+
+       for_each_task_context_nr(ctxn) {
+               ctx = task->perf_event_ctxp[ctxn];
+               if (likely(!ctx))
+                       continue;
+
+               perf_event_context_sched_in(ctx);
+       }
  }
  
  #define MAX_INTERRUPTS (~0ULL)
@@ -1524,22 +1577,6 @@ do {                                     \
         return div64_u64(dividend, divisor);
  }
  
-static void perf_event_stop(struct perf_event *event)
-{
-       if (!event->pmu->stop)
-               return event->pmu->disable(event);
-
-       return event->pmu->stop(event);
-}
-
-static int perf_event_start(struct perf_event *event)
-{
-       if (!event->pmu->start)
-               return event->pmu->enable(event);
-
-       return event->pmu->start(event);
-}
-
  static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
  {
         struct hw_perf_event *hwc = &event->hw;
@@ -1559,15 +1596,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
         hwc->sample_period = sample_period;
  
         if (local64_read(&hwc->period_left) > 8*sample_period) {
-               perf_disable();
-               perf_event_stop(event);
+               event->pmu->stop(event, PERF_EF_UPDATE);
                 local64_set(&hwc->period_left, 0);
-               perf_event_start(event);
-               perf_enable();
+               event->pmu->start(event, PERF_EF_RELOAD);
         }
  }
  
-static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
+static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
  {
         struct perf_event *event;
         struct hw_perf_event *hwc;
@@ -1592,23 +1627,19 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
                  */
                 if (interrupts == MAX_INTERRUPTS) {
                         perf_log_throttle(event, 1);
-                       perf_disable();
-                       event->pmu->unthrottle(event);
-                       perf_enable();
+                       event->pmu->start(event, 0);
                 }
  
                 if (!event->attr.freq || !event->attr.sample_freq)
                         continue;
  
-               perf_disable();
                 event->pmu->read(event);
                 now = local64_read(&event->count);
                 delta = now - hwc->freq_count_stamp;
                 hwc->freq_count_stamp = now;
  
                 if (delta > 0)
-                       perf_adjust_period(event, TICK_NSEC, delta);
-               perf_enable();
+                       perf_adjust_period(event, period, delta);
         }
         raw_spin_unlock(&ctx->lock);
  }
@@ -1626,32 +1657,38 @@ static void rotate_ctx(struct perf_event_context *ctx)
         raw_spin_unlock(&ctx->lock);
  }
  
-void perf_event_task_tick(struct task_struct *curr)
+/*
+ * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
+ * because they're strictly cpu affine and rotate_start is called with IRQs
+ * disabled, while rotate_context is called from IRQ context.
+ */
+static void perf_rotate_context(struct perf_cpu_context *cpuctx)
  {
-       struct perf_cpu_context *cpuctx;
-       struct perf_event_context *ctx;
-       int rotate = 0;
-
-       if (!atomic_read(&nr_events))
-               return;
+       u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
+       struct perf_event_context *ctx = NULL;
+       int rotate = 0, remove = 1;
  
-       cpuctx = &__get_cpu_var(perf_cpu_context);
-       if (cpuctx->ctx.nr_events &&
-           cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
-               rotate = 1;
+       if (cpuctx->ctx.nr_events) {
+               remove = 0;
+               if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
+                       rotate = 1;
+       }
  
-       ctx = curr->perf_event_ctxp;
-       if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active)
-               rotate = 1;
+       ctx = cpuctx->task_ctx;
+       if (ctx && ctx->nr_events) {
+               remove = 0;
+               if (ctx->nr_events != ctx->nr_active)
+                       rotate = 1;
+       }
  
-       perf_ctx_adjust_freq(&cpuctx->ctx);
+       perf_pmu_disable(cpuctx->ctx.pmu);
+       perf_ctx_adjust_freq(&cpuctx->ctx, interval);
         if (ctx)
-               perf_ctx_adjust_freq(ctx);
+               perf_ctx_adjust_freq(ctx, interval);
  
         if (!rotate)
-               return;
+               goto done;
  
-       perf_disable();
         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
         if (ctx)
                 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
@@ -1662,8 +1699,27 @@ void perf_event_task_tick(struct task_struct *curr)
  
         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
         if (ctx)
-               task_ctx_sched_in(curr, EVENT_FLEXIBLE);
-       perf_enable();
+               task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
+
+done:
+       if (remove)
+               list_del_init(&cpuctx->rotation_list);
+
+       perf_pmu_enable(cpuctx->ctx.pmu);
+}
+
+void perf_event_task_tick(void)
+{
+       struct list_head *head = &__get_cpu_var(rotation_list);
+       struct perf_cpu_context *cpuctx, *tmp;
+
+       WARN_ON(!irqs_disabled());
+
+       list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
+               if (cpuctx->jiffies_interval == 1 ||
+                               !(jiffies % cpuctx->jiffies_interval))
+                       perf_rotate_context(cpuctx);
+       }
  }
  
  static int event_enable_on_exec(struct perf_event *event,
@@ -1685,20 +1741,18 @@ static int event_enable_on_exec(struct perf_event *event,
   * Enable all of a task's events that have been marked enable-on-exec.
   * This expects task == current.
   */
-static void perf_event_enable_on_exec(struct task_struct *task)
+static void perf_event_enable_on_exec(struct perf_event_context *ctx)
  {
-       struct perf_event_context *ctx;
         struct perf_event *event;
         unsigned long flags;
         int enabled = 0;
         int ret;
  
         local_irq_save(flags);
-       ctx = task->perf_event_ctxp;
         if (!ctx || !ctx->nr_events)
                 goto out;
  
-       __perf_event_task_sched_out(ctx);
+       task_ctx_sched_out(ctx, EVENT_ALL);
  
         raw_spin_lock(&ctx->lock);
  
@@ -1722,8 +1776,8 @@ static void perf_event_enable_on_exec(struct task_struct *task)
  
         raw_spin_unlock(&ctx->lock);
  
-       perf_event_task_sched_in(task);
- out:
+       perf_event_context_sched_in(ctx);
+out:
         local_irq_restore(flags);
  }
  
@@ -1732,9 +1786,9 @@ static void perf_event_enable_on_exec(struct task_struct *task)
   */
  static void __perf_event_read(void *info)
  {
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
         struct perf_event *event = info;
         struct perf_event_context *ctx = event->ctx;
+       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
  
         /*
          * If this is a task context, we need to check whether it is
@@ -1773,7 +1827,13 @@ static u64 perf_event_read(struct perf_event *event)
                 unsigned long flags;
  
                 raw_spin_lock_irqsave(&ctx->lock, flags);
-               update_context_time(ctx);
+               /*
+                * may read while context is not active
+                * (e.g., thread is blocked), in that case
+                * we cannot update context time
+                */
+               if (ctx->is_active)
+                       update_context_time(ctx);
                 update_event_times(event);
                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
         }
@@ -1782,57 +1842,258 @@ static u64 perf_event_read(struct perf_event *event)
  }
  
  /*
- * Initialize the perf_event context in a task_struct:
+ * Callchain support
   */
-static void
-__perf_event_init_context(struct perf_event_context *ctx,
-                           struct task_struct *task)
+
+struct callchain_cpus_entries {
+       struct rcu_head                 rcu_head;
+       struct perf_callchain_entry     *cpu_entries[0];
+};
+
+static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
+static atomic_t nr_callchain_events;
+static DEFINE_MUTEX(callchain_mutex);
+struct callchain_cpus_entries *callchain_cpus_entries;
+
+
+__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
+                                 struct pt_regs *regs)
  {
-       raw_spin_lock_init(&ctx->lock);
-       mutex_init(&ctx->mutex);
-       INIT_LIST_HEAD(&ctx->pinned_groups);
-       INIT_LIST_HEAD(&ctx->flexible_groups);
-       INIT_LIST_HEAD(&ctx->event_list);
-       atomic_set(&ctx->refcount, 1);
-       ctx->task = task;
  }
  
-static struct perf_event_context *find_get_context(pid_t pid, int cpu)
+__weak void perf_callchain_user(struct perf_callchain_entry *entry,
+                               struct pt_regs *regs)
  {
-       struct perf_event_context *ctx;
-       struct perf_cpu_context *cpuctx;
-       struct task_struct *task;
-       unsigned long flags;
-       int err;
-
-       if (pid == -1 && cpu != -1) {
-               /* Must be root to operate on a CPU event: */
-               if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
-                       return ERR_PTR(-EACCES);
+}
  
-               if (cpu < 0 || cpu >= nr_cpumask_bits)
-                       return ERR_PTR(-EINVAL);
+static void release_callchain_buffers_rcu(struct rcu_head *head)
+{
+       struct callchain_cpus_entries *entries;
+       int cpu;
  
-               /*
-                * We could be clever and allow to attach a event to an
-                * offline CPU and activate it when the CPU comes up, but
-                * that's for later.
-                */
-               if (!cpu_online(cpu))
-                       return ERR_PTR(-ENODEV);
+       entries = container_of(head, struct callchain_cpus_entries, rcu_head);
  
-               cpuctx = &per_cpu(perf_cpu_context, cpu);
-               ctx = &cpuctx->ctx;
-               get_ctx(ctx);
+       for_each_possible_cpu(cpu)
+               kfree(entries->cpu_entries[cpu]);
  
-               return ctx;
+       kfree(entries);
+}
+
+static void release_callchain_buffers(void)
+{
+       struct callchain_cpus_entries *entries;
+
+       entries = callchain_cpus_entries;
+       rcu_assign_pointer(callchain_cpus_entries, NULL);
+       call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
+}
+
+static int alloc_callchain_buffers(void)
+{
+       int cpu;
+       int size;
+       struct callchain_cpus_entries *entries;
+
+       /*
+        * We can't use the percpu allocation API for data that can be
+        * accessed from NMI. Use a temporary manual per cpu allocation
+        * until that gets sorted out.
+        */
+       size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) *
+               num_possible_cpus();
+
+       entries = kzalloc(size, GFP_KERNEL);
+       if (!entries)
+               return -ENOMEM;
+
+       size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+
+       for_each_possible_cpu(cpu) {
+               entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
+                                                        cpu_to_node(cpu));
+               if (!entries->cpu_entries[cpu])
+                       goto fail;
+       }
+
+       rcu_assign_pointer(callchain_cpus_entries, entries);
+
+       return 0;
+
+fail:
+       for_each_possible_cpu(cpu)
+               kfree(entries->cpu_entries[cpu]);
+       kfree(entries);
+
+       return -ENOMEM;
+}
+
+static int get_callchain_buffers(void)
+{
+       int err = 0;
+       int count;
+
+       mutex_lock(&callchain_mutex);
+
+       count = atomic_inc_return(&nr_callchain_events);
+       if (WARN_ON_ONCE(count < 1)) {
+               err = -EINVAL;
+               goto exit;
+       }
+
+       if (count > 1) {
+               /* If the allocation failed, give up */
+               if (!callchain_cpus_entries)
+                       err = -ENOMEM;
+               goto exit;
+       }
+
+       err = alloc_callchain_buffers();
+       if (err)
+               release_callchain_buffers();
+exit:
+       mutex_unlock(&callchain_mutex);
+
+       return err;
+}
+
+static void put_callchain_buffers(void)
+{
+       if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
+               release_callchain_buffers();
+               mutex_unlock(&callchain_mutex);
+       }
+}
+
+static int get_recursion_context(int *recursion)
+{
+       int rctx;
+
+       if (in_nmi())
+               rctx = 3;
+       else if (in_irq())
+               rctx = 2;
+       else if (in_softirq())
+               rctx = 1;
+       else
+               rctx = 0;
+
+       if (recursion[rctx])
+               return -1;
+
+       recursion[rctx]++;
+       barrier();
+
+       return rctx;
+}
+
+static inline void put_recursion_context(int *recursion, int rctx)
+{
+       barrier();
+       recursion[rctx]--;
+}
+
+static struct perf_callchain_entry *get_callchain_entry(int *rctx)
+{
+       int cpu;
+       struct callchain_cpus_entries *entries;
+
+       *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
+       if (*rctx == -1)
+               return NULL;
+
+       entries = rcu_dereference(callchain_cpus_entries);
+       if (!entries)
+               return NULL;
+
+       cpu = smp_processor_id();
+
+       return &entries->cpu_entries[cpu][*rctx];
+}
+
+static void
+put_callchain_entry(int rctx)
+{
+       put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
+}
+
+static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+       int rctx;
+       struct perf_callchain_entry *entry;
+
+
+       entry = get_callchain_entry(&rctx);
+       if (rctx == -1)
+               return NULL;
+
+       if (!entry)
+               goto exit_put;
+
+       entry->nr = 0;
+
+       if (!user_mode(regs)) {
+               perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
+               perf_callchain_kernel(entry, regs);
+               if (current->mm)
+                       regs = task_pt_regs(current);
+               else
+                       regs = NULL;
+       }
+
+       if (regs) {
+               perf_callchain_store(entry, PERF_CONTEXT_USER);
+               perf_callchain_user(entry, regs);
+       }
+
+exit_put:
+       put_callchain_entry(rctx);
+
+       return entry;
+}
+
+/*
+ * Initialize the perf_event context in a task_struct:
+ */
+static void __perf_event_init_context(struct perf_event_context *ctx)
+{
+       raw_spin_lock_init(&ctx->lock);
+       mutex_init(&ctx->mutex);
+       INIT_LIST_HEAD(&ctx->pinned_groups);
+       INIT_LIST_HEAD(&ctx->flexible_groups);
+       INIT_LIST_HEAD(&ctx->event_list);
+       atomic_set(&ctx->refcount, 1);
+}
+
+static struct perf_event_context *
+alloc_perf_context(struct pmu *pmu, struct task_struct *task)
+{
+       struct perf_event_context *ctx;
+
+       ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
+       if (!ctx)
+               return NULL;
+
+       __perf_event_init_context(ctx);
+       if (task) {
+               ctx->task = task;
+               get_task_struct(task);
         }
+       ctx->pmu = pmu;
+
+       return ctx;
+}
+
+static struct task_struct *
+find_lively_task_by_vpid(pid_t vpid)
+{
+       struct task_struct *task;
+       int err;
  
         rcu_read_lock();
-       if (!pid)
+       if (!vpid)
                 task = current;
         else
-               task = find_task_by_vpid(pid);
+               task = find_task_by_vpid(vpid);
         if (task)
                 get_task_struct(task);
         rcu_read_unlock();
@@ -1852,36 +2113,78 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
         if (!ptrace_may_access(task, PTRACE_MODE_READ))
                 goto errout;
  
- retry:
-       ctx = perf_lock_task_context(task, &flags);
+       return task;
+errout:
+       put_task_struct(task);
+       return ERR_PTR(err);
+
+}
+
+static struct perf_event_context *
+find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
+{
+       struct perf_event_context *ctx;
+       struct perf_cpu_context *cpuctx;
+       unsigned long flags;
+       int ctxn, err;
+
+       if (!task && cpu != -1) {
+               /* Must be root to operate on a CPU event: */
+               if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+                       return ERR_PTR(-EACCES);
+
+               if (cpu < 0 || cpu >= nr_cpumask_bits)
+                       return ERR_PTR(-EINVAL);
+
+               /*
+                * We could be clever and allow to attach a event to an
+                * offline CPU and activate it when the CPU comes up, but
+                * that's for later.
+                */
+               if (!cpu_online(cpu))
+                       return ERR_PTR(-ENODEV);
+
+               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+               ctx = &cpuctx->ctx;
+               get_ctx(ctx);
+
+               return ctx;
+       }
+
+       err = -EINVAL;
+       ctxn = pmu->task_ctx_nr;
+       if (ctxn < 0)
+               goto errout;
+
+retry:
+       ctx = perf_lock_task_context(task, ctxn, &flags);
         if (ctx) {
                 unclone_ctx(ctx);
                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
         }
  
         if (!ctx) {
-               ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
+               ctx = alloc_perf_context(pmu, task);
                 err = -ENOMEM;
                 if (!ctx)
                         goto errout;
-               __perf_event_init_context(ctx, task);
+
                 get_ctx(ctx);
-               if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
+
+               if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) {
                         /*
                          * We raced with some other task; use
                          * the context they set.
                          */
+                       put_task_struct(task);
                         kfree(ctx);
                         goto retry;
                 }
-               get_task_struct(task);
         }
  
-       put_task_struct(task);
         return ctx;
  
- errout:
-       put_task_struct(task);
+errout:
         return ERR_PTR(err);
  }
  
@@ -1898,21 +2201,23 @@ static void free_event_rcu(struct rcu_head *head)
         kfree(event);
  }
  
-static void perf_pending_sync(struct perf_event *event);
  static void perf_buffer_put(struct perf_buffer *buffer);
  
  static void free_event(struct perf_event *event)
  {
-       perf_pending_sync(event);
+       irq_work_sync(&event->pending);
  
         if (!event->parent) {
-               atomic_dec(&nr_events);
+               if (event->attach_state & PERF_ATTACH_TASK)
+                       jump_label_dec(&perf_task_events);
                 if (event->attr.mmap || event->attr.mmap_data)
                         atomic_dec(&nr_mmap_events);
                 if (event->attr.comm)
                         atomic_dec(&nr_comm_events);
                 if (event->attr.task)
                         atomic_dec(&nr_task_events);
+               if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+                       put_callchain_buffers();
         }
  
         if (event->buffer) {
@@ -1923,7 +2228,9 @@ static void free_event(struct perf_event *event)
         if (event->destroy)
                 event->destroy(event);
  
-       put_ctx(event->ctx);
+       if (event->ctx)
+               put_ctx(event->ctx);
+
         call_rcu(&event->rcu_head, free_event_rcu);
  }
  
@@ -2342,6 +2649,9 @@ int perf_event_task_disable(void)
  
  static int perf_event_index(struct perf_event *event)
  {
+       if (event->hw.state & PERF_HES_STOPPED)
+               return 0;
+
         if (event->state != PERF_EVENT_STATE_ACTIVE)
                 return 0;
  
@@ -2845,16 +3155,7 @@ void perf_event_wakeup(struct perf_event *event)
         }
  }
  
-/*
- * Pending wakeups
- *
- * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
- *
- * The NMI bit means we cannot possibly take locks. Therefore, maintain a
- * single linked list and use cmpxchg() to add entries lockless.
- */
-
-static void perf_pending_event(struct perf_pending_entry *entry)
+static void perf_pending_event(struct irq_work *entry)
  {
         struct perf_event *event = container_of(entry,
                         struct perf_event, pending);
@@ -2870,99 +3171,6 @@ static void perf_pending_event(struct perf_pending_entry *entry)
         }
  }
  
-#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
-
-static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
-       PENDING_TAIL,
-};
-
-static void perf_pending_queue(struct perf_pending_entry *entry,
-                              void (*func)(struct perf_pending_entry *))
-{
-       struct perf_pending_entry **head;
-
-       if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
-               return;
-
-       entry->func = func;
-
-       head = &get_cpu_var(perf_pending_head);
-
-       do {
-               entry->next = *head;
-       } while (cmpxchg(head, entry->next, entry) != entry->next);
-
-       set_perf_event_pending();
-
-       put_cpu_var(perf_pending_head);
-}
-
-static int __perf_pending_run(void)
-{
-       struct perf_pending_entry *list;
-       int nr = 0;
-
-       list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
-       while (list != PENDING_TAIL) {
-               void (*func)(struct perf_pending_entry *);
-               struct perf_pending_entry *entry = list;
-
-               list = list->next;
-
-               func = entry->func;
-               entry->next = NULL;
-               /*
-                * Ensure we observe the unqueue before we issue the wakeup,
-                * so that we won't be waiting forever.
-                * -- see perf_not_pending().
-                */
-               smp_wmb();
-
-               func(entry);
-               nr++;
-       }
-
-       return nr;
-}
-
-static inline int perf_not_pending(struct perf_event *event)
-{
-       /*
-        * If we flush on whatever cpu we run, there is a chance we don't
-        * need to wait.
-        */
-       get_cpu();
-       __perf_pending_run();
-       put_cpu();
-
-       /*
-        * Ensure we see the proper queue state before going to sleep
-        * so that we do not miss the wakeup. -- see perf_pending_handle()
-        */
-       smp_rmb();
-       return event->pending.next == NULL;
-}
-
-static void perf_pending_sync(struct perf_event *event)
-{
-       wait_event(event->waitq, perf_not_pending(event));
-}
-
-void perf_event_do_pending(void)
-{
-       __perf_pending_run();
-}
-
-/*
- * Callchain support -- arch specific
- */
-
-__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
-       return NULL;
-}
-
-
  /*
   * We assume there is only KVM supporting the callbacks.
   * Later on, we might change it to a list if there is
@@ -3012,8 +3220,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
  
         if (handle->nmi) {
                 handle->event->pending_wakeup = 1;
-               perf_pending_queue(&handle->event->pending,
-                                  perf_pending_event);
+               irq_work_queue(&handle->event->pending);
         } else
                 perf_event_wakeup(handle->event);
  }
@@ -3069,7 +3276,7 @@ again:
         if (handle->wakeup != local_read(&buffer->wakeup))
                 perf_output_wakeup(handle);
  
- out:
+out:
         preempt_enable();
  }
  
@@ -3457,14 +3664,20 @@ static void perf_event_output(struct perf_event *event, int nmi,
         struct perf_output_handle handle;
         struct perf_event_header header;
  
+       /* protect the callchain buffers */
+       rcu_read_lock();
+
         perf_prepare_sample(&header, data, event, regs);
  
         if (perf_output_begin(&handle, event, header.size, nmi, 1))
-               return;
+               goto exit;
  
         perf_output_sample(&handle, &header, data, event);
  
         perf_output_end(&handle);
+
+exit:
+       rcu_read_unlock();
  }
  
  /*
@@ -3578,16 +3791,27 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
  static void perf_event_task_event(struct perf_task_event *task_event)
  {
         struct perf_cpu_context *cpuctx;
-       struct perf_event_context *ctx = task_event->task_ctx;
+       struct perf_event_context *ctx;
+       struct pmu *pmu;
+       int ctxn;
  
         rcu_read_lock();
-       cpuctx = &get_cpu_var(perf_cpu_context);
-       perf_event_task_ctx(&cpuctx->ctx, task_event);
-       if (!ctx)
-               ctx = rcu_dereference(current->perf_event_ctxp);
-       if (ctx)
-               perf_event_task_ctx(ctx, task_event);
-       put_cpu_var(perf_cpu_context);
+       list_for_each_entry_rcu(pmu, &pmus, entry) {
+               cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+               perf_event_task_ctx(&cpuctx->ctx, task_event);
+
+               ctx = task_event->task_ctx;
+               if (!ctx) {
+                       ctxn = pmu->task_ctx_nr;
+                       if (ctxn < 0)
+                               goto next;
+                       ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+               }
+               if (ctx)
+                       perf_event_task_ctx(ctx, task_event);
+next:
+               put_cpu_ptr(pmu->pmu_cpu_context);
+       }
         rcu_read_unlock();
  }
  
@@ -3692,8 +3916,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
  {
         struct perf_cpu_context *cpuctx;
         struct perf_event_context *ctx;
-       unsigned int size;
         char comm[TASK_COMM_LEN];
+       unsigned int size;
+       struct pmu *pmu;
+       int ctxn;
  
         memset(comm, 0, sizeof(comm));
         strlcpy(comm, comm_event->task->comm, sizeof(comm));
@@ -3705,22 +3931,37 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
  
         rcu_read_lock();
-       cpuctx = &get_cpu_var(perf_cpu_context);
-       perf_event_comm_ctx(&cpuctx->ctx, comm_event);
-       ctx = rcu_dereference(current->perf_event_ctxp);
-       if (ctx)
-               perf_event_comm_ctx(ctx, comm_event);
-       put_cpu_var(perf_cpu_context);
+       list_for_each_entry_rcu(pmu, &pmus, entry) {
+               cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+               perf_event_comm_ctx(&cpuctx->ctx, comm_event);
+
+               ctxn = pmu->task_ctx_nr;
+               if (ctxn < 0)
+                       goto next;
+
+               ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+               if (ctx)
+                       perf_event_comm_ctx(ctx, comm_event);
+next:
+               put_cpu_ptr(pmu->pmu_cpu_context);
+       }
         rcu_read_unlock();
  }
  
  void perf_event_comm(struct task_struct *task)
  {
         struct perf_comm_event comm_event;
+       struct perf_event_context *ctx;
+       int ctxn;
+
+       for_each_task_context_nr(ctxn) {
+               ctx = task->perf_event_ctxp[ctxn];
+               if (!ctx)
+                       continue;
+
+               perf_event_enable_on_exec(ctx);
+       }
  
-       if (task->perf_event_ctxp)
-               perf_event_enable_on_exec(task);
-
         if (!atomic_read(&nr_comm_events))
                 return;
  
@@ -3821,6 +4062,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
         char tmp[16];
         char *buf = NULL;
         const char *name;
+       struct pmu *pmu;
+       int ctxn;
  
         memset(tmp, 0, sizeof(tmp));
  
@@ -3873,12 +4116,23 @@ got_name:
         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
  
         rcu_read_lock();
-       cpuctx = &get_cpu_var(perf_cpu_context);
-       perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC);
-       ctx = rcu_dereference(current->perf_event_ctxp);
-       if (ctx)
-               perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC);
-       put_cpu_var(perf_cpu_context);
+       list_for_each_entry_rcu(pmu, &pmus, entry) {
+               cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+               perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
+                                       vma->vm_flags & VM_EXEC);
+
+               ctxn = pmu->task_ctx_nr;
+               if (ctxn < 0)
+                       goto next;
+
+               ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+               if (ctx) {
+                       perf_event_mmap_ctx(ctx, mmap_event,
+                                       vma->vm_flags & VM_EXEC);
+               }
+next:
+               put_cpu_ptr(pmu->pmu_cpu_context);
+       }
         rcu_read_unlock();
  
         kfree(buf);
@@ -3960,8 +4214,6 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
         struct hw_perf_event *hwc = &event->hw;
         int ret = 0;
  
-       throttle = (throttle && event->pmu->unthrottle != NULL);
-
         if (!throttle) {
                 hwc->interrupts++;
         } else {
@@ -4004,8 +4256,7 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
                 event->pending_kill = POLL_HUP;
                 if (nmi) {
                         event->pending_disable = 1;
-                       perf_pending_queue(&event->pending,
-                                          perf_pending_event);
+                       irq_work_queue(&event->pending);
                 } else
                         perf_event_disable(event);
         }
@@ -4029,6 +4280,17 @@ int perf_event_overflow(struct perf_event *event, int nmi,
   * Generic software event infrastructure
   */
  
+struct swevent_htable {
+       struct swevent_hlist            *swevent_hlist;
+       struct mutex                    hlist_mutex;
+       int                             hlist_refcount;
+
+       /* Recursion avoidance in each contexts */
+       int                             recursion[PERF_NR_CONTEXTS];
+};
+
+static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
+
  /*
   * We directly increment event->count and keep a second value in
   * event->hw.period_left to count intervals. This period event
@@ -4086,7 +4348,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
         }
  }
  
-static void perf_swevent_add(struct perf_event *event, u64 nr,
+static void perf_swevent_event(struct perf_event *event, u64 nr,
                                int nmi, struct perf_sample_data *data,
                                struct pt_regs *regs)
  {
@@ -4112,6 +4374,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
  static int perf_exclude_event(struct perf_event *event,
                               struct pt_regs *regs)
  {
+       if (event->hw.state & PERF_HES_STOPPED)
+               return 0;
+
         if (regs) {
                 if (event->attr.exclude_user && user_mode(regs))
                         return 1;
@@ -4158,11 +4423,11 @@ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
  
  /* For the read side: events when they trigger */
  static inline struct hlist_head *
-find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
  {
         struct swevent_hlist *hlist;
  
-       hlist = rcu_dereference(ctx->swevent_hlist);
+       hlist = rcu_dereference(swhash->swevent_hlist);
         if (!hlist)
                 return NULL;
  
@@ -4171,7 +4436,7 @@ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
  
  /* For the event head insertion and removal in the hlist */
  static inline struct hlist_head *
-find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
+find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
  {
         struct swevent_hlist *hlist;
         u32 event_id = event->attr.config;
@@ -4182,7 +4447,7 @@ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
          * and release. Which makes the protected version suitable here.
          * The context lock guarantees that.
          */
-       hlist = rcu_dereference_protected(ctx->swevent_hlist,
+       hlist = rcu_dereference_protected(swhash->swevent_hlist,
                                           lockdep_is_held(&event->ctx->lock));
         if (!hlist)
                 return NULL;
@@ -4195,23 +4460,19 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
                                     struct perf_sample_data *data,
                                     struct pt_regs *regs)
  {
-       struct perf_cpu_context *cpuctx;
+       struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
         struct perf_event *event;
         struct hlist_node *node;
         struct hlist_head *head;
  
-       cpuctx = &__get_cpu_var(perf_cpu_context);
-
         rcu_read_lock();
-
-       head = find_swevent_head_rcu(cpuctx, type, event_id);
-
+       head = find_swevent_head_rcu(swhash, type, event_id);
         if (!head)
                 goto end;
  
         hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
                 if (perf_swevent_match(event, type, event_id, data, regs))
-                       perf_swevent_add(event, nr, nmi, data, regs);
+                       perf_swevent_event(event, nr, nmi, data, regs);
         }
  end:
         rcu_read_unlock();
@@ -4219,33 +4480,17 @@ end:
  
  int perf_swevent_get_recursion_context(void)
  {
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-       int rctx;
-
-       if (in_nmi())
-               rctx = 3;
-       else if (in_irq())
-               rctx = 2;
-       else if (in_softirq())
-               rctx = 1;
-       else
-               rctx = 0;
-
-       if (cpuctx->recursion[rctx])
-               return -1;
-
-       cpuctx->recursion[rctx]++;
-       barrier();
+       struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
  
-       return rctx;
+       return get_recursion_context(swhash->recursion);
  }
  EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
  
  void inline perf_swevent_put_recursion_context(int rctx)
  {
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-       barrier();
-       cpuctx->recursion[rctx]--;
+       struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
+
+       put_recursion_context(swhash->recursion, rctx);
  }
  
  void __perf_sw_event(u32 event_id, u64 nr, int nmi,
@@ -4271,20 +4516,20 @@ static void perf_swevent_read(struct perf_event *event)
  {
  }
  
-static int perf_swevent_enable(struct perf_event *event)
+static int perf_swevent_add(struct perf_event *event, int flags)
  {
+       struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
         struct hw_perf_event *hwc = &event->hw;
-       struct perf_cpu_context *cpuctx;
         struct hlist_head *head;
  
-       cpuctx = &__get_cpu_var(perf_cpu_context);
-
         if (hwc->sample_period) {
                 hwc->last_period = hwc->sample_period;
                 perf_swevent_set_period(event);
         }
  
-       head = find_swevent_head(cpuctx, event);
+       hwc->state = !(flags & PERF_EF_START);
+
+       head = find_swevent_head(swhash, event);
         if (WARN_ON_ONCE(!head))
                 return -EINVAL;
  
@@ -4293,202 +4538,27 @@ static int perf_swevent_enable(struct perf_event *event)
         return 0;
  }
  
-static void perf_swevent_disable(struct perf_event *event)
+static void perf_swevent_del(struct perf_event *event, int flags)
  {
         hlist_del_rcu(&event->hlist_entry);
  }
  
-static void perf_swevent_void(struct perf_event *event)
-{
-}
-
-static int perf_swevent_int(struct perf_event *event)
-{
-       return 0;
-}
-
-static const struct pmu perf_ops_generic = {
-       .enable         = perf_swevent_enable,
-       .disable        = perf_swevent_disable,
-       .start          = perf_swevent_int,
-       .stop           = perf_swevent_void,
-       .read           = perf_swevent_read,
-       .unthrottle     = perf_swevent_void, /* hwc->interrupts already reset */
-};
-
-/*
- * hrtimer based swevent callback
- */
-
-static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
-{
-       enum hrtimer_restart ret = HRTIMER_RESTART;
-       struct perf_sample_data data;
-       struct pt_regs *regs;
-       struct perf_event *event;
-       u64 period;
-
-       event = container_of(hrtimer, struct perf_event, hw.hrtimer);
-       event->pmu->read(event);
-
-       perf_sample_data_init(&data, 0);
-       data.period = event->hw.last_period;
-       regs = get_irq_regs();
-
-       if (regs && !perf_exclude_event(event, regs)) {
-               if (!(event->attr.exclude_idle && current->pid == 0))
-                       if (perf_event_overflow(event, 0, &data, regs))
-                               ret = HRTIMER_NORESTART;
-       }
-
-       period = max_t(u64, 10000, event->hw.sample_period);
-       hrtimer_forward_now(hrtimer, ns_to_ktime(period));
-
-       return ret;
-}
-
-static void perf_swevent_start_hrtimer(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-       hwc->hrtimer.function = perf_swevent_hrtimer;
-       if (hwc->sample_period) {
-               u64 period;
-
-               if (hwc->remaining) {
-                       if (hwc->remaining < 0)
-                               period = 10000;
-                       else
-                               period = hwc->remaining;
-                       hwc->remaining = 0;
-               } else {
-                       period = max_t(u64, 10000, hwc->sample_period);
-               }
-               __hrtimer_start_range_ns(&hwc->hrtimer,
-                               ns_to_ktime(period), 0,
-                               HRTIMER_MODE_REL, 0);
-       }
-}
-
-static void perf_swevent_cancel_hrtimer(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-
-       if (hwc->sample_period) {
-               ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
-               hwc->remaining = ktime_to_ns(remaining);
-
-               hrtimer_cancel(&hwc->hrtimer);
-       }
-}
-
-/*
- * Software event: cpu wall time clock
- */
-
-static void cpu_clock_perf_event_update(struct perf_event *event)
-{
-       int cpu = raw_smp_processor_id();
-       s64 prev;
-       u64 now;
-
-       now = cpu_clock(cpu);
-       prev = local64_xchg(&event->hw.prev_count, now);
-       local64_add(now - prev, &event->count);
-}
-
-static int cpu_clock_perf_event_enable(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       int cpu = raw_smp_processor_id();
-
-       local64_set(&hwc->prev_count, cpu_clock(cpu));
-       perf_swevent_start_hrtimer(event);
-
-       return 0;
-}
-
-static void cpu_clock_perf_event_disable(struct perf_event *event)
+static void perf_swevent_start(struct perf_event *event, int flags)
  {
-       perf_swevent_cancel_hrtimer(event);
-       cpu_clock_perf_event_update(event);
-}
-
-static void cpu_clock_perf_event_read(struct perf_event *event)
-{
-       cpu_clock_perf_event_update(event);
-}
-
-static const struct pmu perf_ops_cpu_clock = {
-       .enable         = cpu_clock_perf_event_enable,
-       .disable        = cpu_clock_perf_event_disable,
-       .read           = cpu_clock_perf_event_read,
-};
-
-/*
- * Software event: task time clock
- */
-
-static void task_clock_perf_event_update(struct perf_event *event, u64 now)
-{
-       u64 prev;
-       s64 delta;
-
-       prev = local64_xchg(&event->hw.prev_count, now);
-       delta = now - prev;
-       local64_add(delta, &event->count);
-}
-
-static int task_clock_perf_event_enable(struct perf_event *event)
-{
-       struct hw_perf_event *hwc = &event->hw;
-       u64 now;
-
-       now = event->ctx->time;
-
-       local64_set(&hwc->prev_count, now);
-
-       perf_swevent_start_hrtimer(event);
-
-       return 0;
-}
-
-static void task_clock_perf_event_disable(struct perf_event *event)
-{
-       perf_swevent_cancel_hrtimer(event);
-       task_clock_perf_event_update(event, event->ctx->time);
-
+       event->hw.state = 0;
  }
  
-static void task_clock_perf_event_read(struct perf_event *event)
+static void perf_swevent_stop(struct perf_event *event, int flags)
  {
-       u64 time;
-
-       if (!in_nmi()) {
-               update_context_time(event->ctx);
-               time = event->ctx->time;
-       } else {
-               u64 now = perf_clock();
-               u64 delta = now - event->ctx->timestamp;
-               time = event->ctx->time + delta;
-       }
-
-       task_clock_perf_event_update(event, time);
+       event->hw.state = PERF_HES_STOPPED;
  }
  
-static const struct pmu perf_ops_task_clock = {
-       .enable         = task_clock_perf_event_enable,
-       .disable        = task_clock_perf_event_disable,
-       .read           = task_clock_perf_event_read,
-};
-
  /* Deref the hlist from the update side */
  static inline struct swevent_hlist *
-swevent_hlist_deref(struct perf_cpu_context *cpuctx)
+swevent_hlist_deref(struct swevent_htable *swhash)
  {
-       return rcu_dereference_protected(cpuctx->swevent_hlist,
-                                        lockdep_is_held(&cpuctx->hlist_mutex));
+       return rcu_dereference_protected(swhash->swevent_hlist,
+                                        lockdep_is_held(&swhash->hlist_mutex));
  }
  
  static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
@@ -4499,27 +4569,27 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
         kfree(hlist);
  }
  
-static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
+static void swevent_hlist_release(struct swevent_htable *swhash)
  {
-       struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx);
+       struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
  
         if (!hlist)
                 return;
  
-       rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
+       rcu_assign_pointer(swhash->swevent_hlist, NULL);
         call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
  }
  
  static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
  {
-       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+       struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
  
-       mutex_lock(&cpuctx->hlist_mutex);
+       mutex_lock(&swhash->hlist_mutex);
  
-       if (!--cpuctx->hlist_refcount)
-               swevent_hlist_release(cpuctx);
+       if (!--swhash->hlist_refcount)
+               swevent_hlist_release(swhash);
  
-       mutex_unlock(&cpuctx->hlist_mutex);
+       mutex_unlock(&swhash->hlist_mutex);
  }
  
  static void swevent_hlist_put(struct perf_event *event)
@@ -4537,12 +4607,12 @@ static void swevent_hlist_put(struct perf_event *event)
  
  static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
  {
-       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+       struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
         int err = 0;
  
-       mutex_lock(&cpuctx->hlist_mutex);
+       mutex_lock(&swhash->hlist_mutex);
  
-       if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) {
+       if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
                 struct swevent_hlist *hlist;
  
                 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -4550,11 +4620,11 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
                         err = -ENOMEM;
                         goto exit;
                 }
-               rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+               rcu_assign_pointer(swhash->swevent_hlist, hlist);
         }
-       cpuctx->hlist_refcount++;
- exit:
-       mutex_unlock(&cpuctx->hlist_mutex);
+       swhash->hlist_refcount++;
+exit:
+       mutex_unlock(&swhash->hlist_mutex);
  
         return err;
  }
@@ -4578,7 +4648,7 @@ static int swevent_hlist_get(struct perf_event *event)
         put_online_cpus();
  
         return 0;
- fail:
+fail:
         for_each_possible_cpu(cpu) {
                 if (cpu == failed_cpu)
                         break;
@@ -4589,17 +4659,64 @@ static int swevent_hlist_get(struct perf_event *event)
         return err;
  }
  
-#ifdef CONFIG_EVENT_TRACING
+atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
+
+static void sw_perf_event_destroy(struct perf_event *event)
+{
+       u64 event_id = event->attr.config;
+
+       WARN_ON(event->parent);
+
+       jump_label_dec(&perf_swevent_enabled[event_id]);
+       swevent_hlist_put(event);
+}
+
+static int perf_swevent_init(struct perf_event *event)
+{
+       int event_id = event->attr.config;
+
+       if (event->attr.type != PERF_TYPE_SOFTWARE)
+               return -ENOENT;
+
+       switch (event_id) {
+       case PERF_COUNT_SW_CPU_CLOCK:
+       case PERF_COUNT_SW_TASK_CLOCK:
+               return -ENOENT;
+
+       default:
+               break;
+       }
+
+       if (event_id > PERF_COUNT_SW_MAX)
+               return -ENOENT;
+
+       if (!event->parent) {
+               int err;
+
+               err = swevent_hlist_get(event);
+               if (err)
+                       return err;
+
+               jump_label_inc(&perf_swevent_enabled[event_id]);
+               event->destroy = sw_perf_event_destroy;
+       }
+
+       return 0;
+}
+
+static struct pmu perf_swevent = {
+       .task_ctx_nr    = perf_sw_context,
  
-static const struct pmu perf_ops_tracepoint = {
-       .enable         = perf_trace_enable,
-       .disable        = perf_trace_disable,
-       .start          = perf_swevent_int,
-       .stop           = perf_swevent_void,
+       .event_init     = perf_swevent_init,
+       .add            = perf_swevent_add,
+       .del            = perf_swevent_del,
+       .start          = perf_swevent_start,
+       .stop           = perf_swevent_stop,
         .read           = perf_swevent_read,
-       .unthrottle     = perf_swevent_void,
  };
  
+#ifdef CONFIG_EVENT_TRACING
+
  static int perf_tp_filter_match(struct perf_event *event,
                                 struct perf_sample_data *data)
  {
@@ -4643,7 +4760,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
  
         hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
                 if (perf_tp_event_match(event, &data, regs))
-                       perf_swevent_add(event, count, 1, &data, regs);
+                       perf_swevent_event(event, count, 1, &data, regs);
         }
  
         perf_swevent_put_recursion_context(rctx);
@@ -4655,10 +4772,13 @@ static void tp_perf_event_destroy(struct perf_event *event)
         perf_trace_destroy(event);
  }
  
-static const struct pmu *tp_perf_event_init(struct perf_event *event)
+static int perf_tp_event_init(struct perf_event *event)
  {
         int err;
  
+       if (event->attr.type != PERF_TYPE_TRACEPOINT)
+               return -ENOENT;
+
         /*
          * Raw tracepoint data is a severe data leak, only allow root to
          * have these.
@@ -4666,15 +4786,31 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
         if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
                         perf_paranoid_tracepoint_raw() &&
                         !capable(CAP_SYS_ADMIN))
-               return ERR_PTR(-EPERM);
+               return -EPERM;
  
         err = perf_trace_init(event);
         if (err)
-               return NULL;
+               return err;
  
         event->destroy = tp_perf_event_destroy;
  
-       return &perf_ops_tracepoint;
+       return 0;
+}
+
+static struct pmu perf_tracepoint = {
+       .task_ctx_nr    = perf_sw_context,
+
+       .event_init     = perf_tp_event_init,
+       .add            = perf_trace_add,
+       .del            = perf_trace_del,
+       .start          = perf_swevent_start,
+       .stop           = perf_swevent_stop,
+       .read           = perf_swevent_read,
+};
+
+static inline void perf_tp_register(void)
+{
+       perf_pmu_register(&perf_tracepoint);
  }
  
  static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4702,9 +4838,8 @@ static void perf_event_free_filter(struct perf_event *event)
  
  #else
  
-static const struct pmu *tp_perf_event_init(struct perf_event *event)
+static inline void perf_tp_register(void)
  {
-       return NULL;
  }
  
  static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4719,24 +4854,6 @@ static void perf_event_free_filter(struct perf_event *event)
  #endif /* CONFIG_EVENT_TRACING */
  
  #ifdef CONFIG_HAVE_HW_BREAKPOINT
-static void bp_perf_event_destroy(struct perf_event *event)
-{
-       release_bp_slot(event);
-}
-
-static const struct pmu *bp_perf_event_init(struct perf_event *bp)
-{
-       int err;
-
-       err = register_perf_hw_breakpoint(bp);
-       if (err)
-               return ERR_PTR(err);
-
-       bp->destroy = bp_perf_event_destroy;
-
-       return &perf_ops_bp;
-}
-
  void perf_bp_event(struct perf_event *bp, void *data)
  {
         struct perf_sample_data sample;
@@ -4744,81 +4861,383 @@ void perf_bp_event(struct perf_event *bp, void *data)
  
         perf_sample_data_init(&sample, bp->attr.bp_addr);
  
-       if (!perf_exclude_event(bp, regs))
-               perf_swevent_add(bp, 1, 1, &sample, regs);
-}
-#else
-static const struct pmu *bp_perf_event_init(struct perf_event *bp)
-{
-       return NULL;
-}
-
-void perf_bp_event(struct perf_event *bp, void *regs)
-{
+       if (!bp->hw.state && !perf_exclude_event(bp, regs))
+               perf_swevent_event(bp, 1, 1, &sample, regs);
  }
  #endif
  
-atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
+/*
+ * hrtimer based swevent callback
+ */
  
-static void sw_perf_event_destroy(struct perf_event *event)
+static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
  {
-       u64 event_id = event->attr.config;
+       enum hrtimer_restart ret = HRTIMER_RESTART;
+       struct perf_sample_data data;
+       struct pt_regs *regs;
+       struct perf_event *event;
+       u64 period;
  
-       WARN_ON(event->parent);
+       event = container_of(hrtimer, struct perf_event, hw.hrtimer);
+       event->pmu->read(event);
  
-       atomic_dec(&perf_swevent_enabled[event_id]);
-       swevent_hlist_put(event);
+       perf_sample_data_init(&data, 0);
+       data.period = event->hw.last_period;
+       regs = get_irq_regs();
+
+       if (regs && !perf_exclude_event(event, regs)) {
+               if (!(event->attr.exclude_idle && current->pid == 0))
+                       if (perf_event_overflow(event, 0, &data, regs))
+                               ret = HRTIMER_NORESTART;
+       }
+
+       period = max_t(u64, 10000, event->hw.sample_period);
+       hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+
+       return ret;
  }
  
-static const struct pmu *sw_perf_event_init(struct perf_event *event)
+static void perf_swevent_start_hrtimer(struct perf_event *event)
  {
-       const struct pmu *pmu = NULL;
-       u64 event_id = event->attr.config;
+       struct hw_perf_event *hwc = &event->hw;
+
+       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hwc->hrtimer.function = perf_swevent_hrtimer;
+       if (hwc->sample_period) {
+               s64 period = local64_read(&hwc->period_left);
+
+               if (period) {
+                       if (period < 0)
+                               period = 10000;
+
+                       local64_set(&hwc->period_left, 0);
+               } else {
+                       period = max_t(u64, 10000, hwc->sample_period);
+               }
+               __hrtimer_start_range_ns(&hwc->hrtimer,
+                               ns_to_ktime(period), 0,
+                               HRTIMER_MODE_REL_PINNED, 0);
+       }
+}
+
+static void perf_swevent_cancel_hrtimer(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (hwc->sample_period) {
+               ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
+               local64_set(&hwc->period_left, ktime_to_ns(remaining));
+
+               hrtimer_cancel(&hwc->hrtimer);
+       }
+}
+
+/*
+ * Software event: cpu wall time clock
+ */
+
+static void cpu_clock_event_update(struct perf_event *event)
+{
+       s64 prev;
+       u64 now;
+
+       now = local_clock();
+       prev = local64_xchg(&event->hw.prev_count, now);
+       local64_add(now - prev, &event->count);
+}
+
+static void cpu_clock_event_start(struct perf_event *event, int flags)
+{
+       local64_set(&event->hw.prev_count, local_clock());
+       perf_swevent_start_hrtimer(event);
+}
+
+static void cpu_clock_event_stop(struct perf_event *event, int flags)
+{
+       perf_swevent_cancel_hrtimer(event);
+       cpu_clock_event_update(event);
+}
+
+static int cpu_clock_event_add(struct perf_event *event, int flags)
+{
+       if (flags & PERF_EF_START)
+               cpu_clock_event_start(event, flags);
+
+       return 0;
+}
+
+static void cpu_clock_event_del(struct perf_event *event, int flags)
+{
+       cpu_clock_event_stop(event, flags);
+}
+
+static void cpu_clock_event_read(struct perf_event *event)
+{
+       cpu_clock_event_update(event);
+}
+
+static int cpu_clock_event_init(struct perf_event *event)
+{
+       if (event->attr.type != PERF_TYPE_SOFTWARE)
+               return -ENOENT;
+
+       if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
+               return -ENOENT;
+
+       return 0;
+}
+
+static struct pmu perf_cpu_clock = {
+       .task_ctx_nr    = perf_sw_context,
+
+       .event_init     = cpu_clock_event_init,
+       .add            = cpu_clock_event_add,
+       .del            = cpu_clock_event_del,
+       .start          = cpu_clock_event_start,
+       .stop           = cpu_clock_event_stop,
+       .read           = cpu_clock_event_read,
+};
+
+/*
+ * Software event: task time clock
+ */
+
+static void task_clock_event_update(struct perf_event *event, u64 now)
+{
+       u64 prev;
+       s64 delta;
+
+       prev = local64_xchg(&event->hw.prev_count, now);
+       delta = now - prev;
+       local64_add(delta, &event->count);
+}
  
+static void task_clock_event_start(struct perf_event *event, int flags)
+{
+       local64_set(&event->hw.prev_count, event->ctx->time);
+       perf_swevent_start_hrtimer(event);
+}
+
+static void task_clock_event_stop(struct perf_event *event, int flags)
+{
+       perf_swevent_cancel_hrtimer(event);
+       task_clock_event_update(event, event->ctx->time);
+}
+
+static int task_clock_event_add(struct perf_event *event, int flags)
+{
+       if (flags & PERF_EF_START)
+               task_clock_event_start(event, flags);
+
+       return 0;
+}
+
+static void task_clock_event_del(struct perf_event *event, int flags)
+{
+       task_clock_event_stop(event, PERF_EF_UPDATE);
+}
+
+static void task_clock_event_read(struct perf_event *event)
+{
+       u64 time;
+
+       if (!in_nmi()) {
+               update_context_time(event->ctx);
+               time = event->ctx->time;
+       } else {
+               u64 now = perf_clock();
+               u64 delta = now - event->ctx->timestamp;
+               time = event->ctx->time + delta;
+       }
+
+       task_clock_event_update(event, time);
+}
+
+static int task_clock_event_init(struct perf_event *event)
+{
+       if (event->attr.type != PERF_TYPE_SOFTWARE)
+               return -ENOENT;
+
+       if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
+               return -ENOENT;
+
+       return 0;
+}
+
+static struct pmu perf_task_clock = {
+       .task_ctx_nr    = perf_sw_context,
+
+       .event_init     = task_clock_event_init,
+       .add            = task_clock_event_add,
+       .del            = task_clock_event_del,
+       .start          = task_clock_event_start,
+       .stop           = task_clock_event_stop,
+       .read           = task_clock_event_read,
+};
+
+static void perf_pmu_nop_void(struct pmu *pmu)
+{
+}
+
+static int perf_pmu_nop_int(struct pmu *pmu)
+{
+       return 0;
+}
+
+static void perf_pmu_start_txn(struct pmu *pmu)
+{
+       perf_pmu_disable(pmu);
+}
+
+static int perf_pmu_commit_txn(struct pmu *pmu)
+{
+       perf_pmu_enable(pmu);
+       return 0;
+}
+
+static void perf_pmu_cancel_txn(struct pmu *pmu)
+{
+       perf_pmu_enable(pmu);
+}
+
+/*
+ * Ensures all contexts with the same task_ctx_nr have the same
+ * pmu_cpu_context too.
+ */
+static void *find_pmu_context(int ctxn)
+{
+       struct pmu *pmu;
+
+       if (ctxn < 0)
+               return NULL;
+
+       list_for_each_entry(pmu, &pmus, entry) {
+               if (pmu->task_ctx_nr == ctxn)
+                       return pmu->pmu_cpu_context;
+       }
+
+       return NULL;
+}
+
+static void free_pmu_context(void * __percpu cpu_context)
+{
+       struct pmu *pmu;
+
+       mutex_lock(&pmus_lock);
         /*
-        * Software events (currently) can't in general distinguish
-        * between user, kernel and hypervisor events.
-        * However, context switches and cpu migrations are considered
-        * to be kernel events, and page faults are never hypervisor
-        * events.
+        * Like a real lame refcount.
          */
-       switch (event_id) {
-       case PERF_COUNT_SW_CPU_CLOCK:
-               pmu = &perf_ops_cpu_clock;
+       list_for_each_entry(pmu, &pmus, entry) {
+               if (pmu->pmu_cpu_context == cpu_context)
+                       goto out;
+       }
  
-               break;
-       case PERF_COUNT_SW_TASK_CLOCK:
-               /*
-                * If the user instantiates this as a per-cpu event,
-                * use the cpu_clock event instead.
-                */
-               if (event->ctx->task)
-                       pmu = &perf_ops_task_clock;
-               else
-                       pmu = &perf_ops_cpu_clock;
+       free_percpu(cpu_context);
+out:
+       mutex_unlock(&pmus_lock);
+}
  
-               break;
-       case PERF_COUNT_SW_PAGE_FAULTS:
-       case PERF_COUNT_SW_PAGE_FAULTS_MIN:
-       case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
-       case PERF_COUNT_SW_CONTEXT_SWITCHES:
-       case PERF_COUNT_SW_CPU_MIGRATIONS:
-       case PERF_COUNT_SW_ALIGNMENT_FAULTS:
-       case PERF_COUNT_SW_EMULATION_FAULTS:
-               if (!event->parent) {
-                       int err;
-
-                       err = swevent_hlist_get(event);
-                       if (err)
-                               return ERR_PTR(err);
+int perf_pmu_register(struct pmu *pmu)
+{
+       int cpu, ret;
+
+       mutex_lock(&pmus_lock);
+       ret = -ENOMEM;
+       pmu->pmu_disable_count = alloc_percpu(int);
+       if (!pmu->pmu_disable_count)
+               goto unlock;
+
+       pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
+       if (pmu->pmu_cpu_context)
+               goto got_cpu_context;
+
+       pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
+       if (!pmu->pmu_cpu_context)
+               goto free_pdc;
+
+       for_each_possible_cpu(cpu) {
+               struct perf_cpu_context *cpuctx;
+
+               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+               __perf_event_init_context(&cpuctx->ctx);
+               cpuctx->ctx.type = cpu_context;
+               cpuctx->ctx.pmu = pmu;
+               cpuctx->jiffies_interval = 1;
+               INIT_LIST_HEAD(&cpuctx->rotation_list);
+       }
  
-                       atomic_inc(&perf_swevent_enabled[event_id]);
-                       event->destroy = sw_perf_event_destroy;
+got_cpu_context:
+       if (!pmu->start_txn) {
+               if (pmu->pmu_enable) {
+                       /*
+                        * If we have pmu_enable/pmu_disable calls, install
+                        * transaction stubs that use that to try and batch
+                        * hardware accesses.
+                        */
+                       pmu->start_txn  = perf_pmu_start_txn;
+                       pmu->commit_txn = perf_pmu_commit_txn;
+                       pmu->cancel_txn = perf_pmu_cancel_txn;
+               } else {
+                       pmu->start_txn  = perf_pmu_nop_void;
+                       pmu->commit_txn = perf_pmu_nop_int;
+                       pmu->cancel_txn = perf_pmu_nop_void;
                 }
-               pmu = &perf_ops_generic;
-               break;
         }
  
+       if (!pmu->pmu_enable) {
+               pmu->pmu_enable  = perf_pmu_nop_void;
+               pmu->pmu_disable = perf_pmu_nop_void;
+       }
+
+       list_add_rcu(&pmu->entry, &pmus);
+       ret = 0;
+unlock:
+       mutex_unlock(&pmus_lock);
+
+       return ret;
+
+free_pdc:
+       free_percpu(pmu->pmu_disable_count);
+       goto unlock;
+}
+
+void perf_pmu_unregister(struct pmu *pmu)
+{
+       mutex_lock(&pmus_lock);
+       list_del_rcu(&pmu->entry);
+       mutex_unlock(&pmus_lock);
+
+       /*
+        * We dereference the pmu list under both SRCU and regular RCU, so
+        * synchronize against both of those.
+        */
+       synchronize_srcu(&pmus_srcu);
+       synchronize_rcu();
+
+       free_percpu(pmu->pmu_disable_count);
+       free_pmu_context(pmu->pmu_cpu_context);
+}
+
+struct pmu *perf_init_event(struct perf_event *event)
+{
+       struct pmu *pmu = NULL;
+       int idx;
+
+       idx = srcu_read_lock(&pmus_srcu);
+       list_for_each_entry_rcu(pmu, &pmus, entry) {
+               int ret = pmu->event_init(event);
+               if (!ret)
+                       goto unlock;
+
+               if (ret != -ENOENT) {
+                       pmu = ERR_PTR(ret);
+                       goto unlock;
+               }
+       }
+       pmu = ERR_PTR(-ENOENT);
+unlock:
+       srcu_read_unlock(&pmus_srcu, idx);
+
         return pmu;
  }
  
@@ -4826,20 +5245,18 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
   * Allocate and initialize a event structure
   */
  static struct perf_event *
-perf_event_alloc(struct perf_event_attr *attr,
-                  int cpu,
-                  struct perf_event_context *ctx,
-                  struct perf_event *group_leader,
-                  struct perf_event *parent_event,
-                  perf_overflow_handler_t overflow_handler,
-                  gfp_t gfpflags)
-{
-       const struct pmu *pmu;
+perf_event_alloc(struct perf_event_attr *attr, int cpu,
+                struct task_struct *task,
+                struct perf_event *group_leader,
+                struct perf_event *parent_event,
+                perf_overflow_handler_t overflow_handler)
+{
+       struct pmu *pmu;
         struct perf_event *event;
         struct hw_perf_event *hwc;
         long err;
  
-       event = kzalloc(sizeof(*event), gfpflags);
+       event = kzalloc(sizeof(*event), GFP_KERNEL);
         if (!event)
                 return ERR_PTR(-ENOMEM);
  
@@ -4857,6 +5274,7 @@ perf_event_alloc(struct perf_event_attr *attr,
         INIT_LIST_HEAD(&event->event_entry);
         INIT_LIST_HEAD(&event->sibling_list);
         init_waitqueue_head(&event->waitq);
+       init_irq_work(&event->pending, perf_pending_event);
  
         mutex_init(&event->mmap_mutex);
  
@@ -4864,7 +5282,6 @@ perf_event_alloc(struct perf_event_attr *attr,
         event->attr             = *attr;
         event->group_leader     = group_leader;
         event->pmu              = NULL;
-       event->ctx              = ctx;
         event->oncpu            = -1;
  
         event->parent           = parent_event;
@@ -4874,6 +5291,17 @@ perf_event_alloc(struct perf_event_attr *attr,
  
         event->state            = PERF_EVENT_STATE_INACTIVE;
  
+       if (task) {
+               event->attach_state = PERF_ATTACH_TASK;
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+               /*
+                * hw_breakpoint is a bit difficult here..
+                */
+               if (attr->type == PERF_TYPE_BREAKPOINT)
+                       event->hw.bp_target = task;
+#endif
+       }
+
         if (!overflow_handler && parent_event)
                 overflow_handler = parent_event->overflow_handler;
         
@@ -4898,29 +5326,8 @@ perf_event_alloc(struct perf_event_attr *attr,
         if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
                 goto done;
  
-       switch (attr->type) {
-       case PERF_TYPE_RAW:
-       case PERF_TYPE_HARDWARE:
-       case PERF_TYPE_HW_CACHE:
-               pmu = hw_perf_event_init(event);
-               break;
-
-       case PERF_TYPE_SOFTWARE:
-               pmu = sw_perf_event_init(event);
-               break;
-
-       case PERF_TYPE_TRACEPOINT:
-               pmu = tp_perf_event_init(event);
-               break;
+       pmu = perf_init_event(event);
  
-       case PERF_TYPE_BREAKPOINT:
-               pmu = bp_perf_event_init(event);
-               break;
-
-
-       default:
-               break;
-       }
  done:
         err = 0;
         if (!pmu)
@@ -4938,13 +5345,21 @@ done:
         event->pmu = pmu;
  
         if (!event->parent) {
-               atomic_inc(&nr_events);
+               if (event->attach_state & PERF_ATTACH_TASK)
+                       jump_label_inc(&perf_task_events);
                 if (event->attr.mmap || event->attr.mmap_data)
                         atomic_inc(&nr_mmap_events);
                 if (event->attr.comm)
                         atomic_inc(&nr_comm_events);
                 if (event->attr.task)
                         atomic_inc(&nr_task_events);
+               if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
+                       err = get_callchain_buffers();
+                       if (err) {
+                               free_event(event);
+                               return ERR_PTR(err);
+                       }
+               }
         }
  
         return event;
@@ -5092,12 +5507,16 @@ SYSCALL_DEFINE5(perf_event_open,
                 struct perf_event_attr __user *, attr_uptr,
                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
  {
-       struct perf_event *event, *group_leader = NULL, *output_event = NULL;
+       struct perf_event *group_leader = NULL, *output_event = NULL;
+       struct perf_event *event, *sibling;
         struct perf_event_attr attr;
         struct perf_event_context *ctx;
         struct file *event_file = NULL;
         struct file *group_file = NULL;
+       struct task_struct *task = NULL;
+       struct pmu *pmu;
         int event_fd;
+       int move_group = 0;
         int fput_needed = 0;
         int err;
  
@@ -5123,20 +5542,11 @@ SYSCALL_DEFINE5(perf_event_open,
         if (event_fd < 0)
                 return event_fd;
  
-       /*
-        * Get the target context (task or percpu):
-        */
-       ctx = find_get_context(pid, cpu);
-       if (IS_ERR(ctx)) {
-               err = PTR_ERR(ctx);
-               goto err_fd;
-       }
-
         if (group_fd != -1) {
                 group_leader = perf_fget_light(group_fd, &fput_needed);
                 if (IS_ERR(group_leader)) {
                         err = PTR_ERR(group_leader);
-                       goto err_put_context;
+                       goto err_fd;
                 }
                 group_file = group_leader->filp;
                 if (flags & PERF_FLAG_FD_OUTPUT)
@@ -5145,6 +5555,58 @@ SYSCALL_DEFINE5(perf_event_open,
                         group_leader = NULL;
         }
  
+       if (pid != -1) {
+               task = find_lively_task_by_vpid(pid);
+               if (IS_ERR(task)) {
+                       err = PTR_ERR(task);
+                       goto err_group_fd;
+               }
+       }
+
+       event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL);
+       if (IS_ERR(event)) {
+               err = PTR_ERR(event);
+               goto err_task;
+       }
+
+       /*
+        * Special case software events and allow them to be part of
+        * any hardware group.
+        */
+       pmu = event->pmu;
+
+       if (group_leader &&
+           (is_software_event(event) != is_software_event(group_leader))) {
+               if (is_software_event(event)) {
+                       /*
+                        * If event and group_leader are not both a software
+                        * event, and event is, then group leader is not.
+                        *
+                        * Allow the addition of software events to !software
+                        * groups, this is safe because software events never
+                        * fail to schedule.
+                        */
+                       pmu = group_leader->pmu;
+               } else if (is_software_event(group_leader) &&
+                          (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
+                       /*
+                        * In case the group is a pure software group, and we
+                        * try to add a hardware event, move the whole group to
+                        * the hardware context.
+                        */
+                       move_group = 1;
+               }
+       }
+
+       /*
+        * Get the target context (task or percpu):
+        */
+       ctx = find_get_context(pmu, task, cpu);
+       if (IS_ERR(ctx)) {
+               err = PTR_ERR(ctx);
+               goto err_alloc;
+       }
+
         /*
          * Look up the group leader (we will attach this event to it):
          */
@@ -5156,42 +5618,66 @@ SYSCALL_DEFINE5(perf_event_open,
                  * becoming part of another group-sibling):
                  */
                 if (group_leader->group_leader != group_leader)
-                       goto err_put_context;
+                       goto err_context;
                 /*
                  * Do not allow to attach to a group in a different
                  * task or CPU context:
                  */
-               if (group_leader->ctx != ctx)
-                       goto err_put_context;
+               if (move_group) {
+                       if (group_leader->ctx->type != ctx->type)
+                               goto err_context;
+               } else {
+                       if (group_leader->ctx != ctx)
+                               goto err_context;
+               }
+
                 /*
                  * Only a group leader can be exclusive or pinned
                  */
                 if (attr.exclusive || attr.pinned)
-                       goto err_put_context;
-       }
-
-       event = perf_event_alloc(&attr, cpu, ctx, group_leader,
-                                    NULL, NULL, GFP_KERNEL);
-       if (IS_ERR(event)) {
-               err = PTR_ERR(event);
-               goto err_put_context;
+                       goto err_context;
         }
  
         if (output_event) {
                 err = perf_event_set_output(event, output_event);
                 if (err)
-                       goto err_free_put_context;
+                       goto err_context;
         }
  
         event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
         if (IS_ERR(event_file)) {
                 err = PTR_ERR(event_file);
-               goto err_free_put_context;
+               goto err_context;
+       }
+
+       if (move_group) {
+               struct perf_event_context *gctx = group_leader->ctx;
+
+               mutex_lock(&gctx->mutex);
+               perf_event_remove_from_context(group_leader);
+               list_for_each_entry(sibling, &group_leader->sibling_list,
+                                   group_entry) {
+                       perf_event_remove_from_context(sibling);
+                       put_ctx(gctx);
+               }
+               mutex_unlock(&gctx->mutex);
+               put_ctx(gctx);
         }
  
         event->filp = event_file;
         WARN_ON_ONCE(ctx->parent_ctx);
         mutex_lock(&ctx->mutex);
+
+       if (move_group) {
+               perf_install_in_context(ctx, group_leader, cpu);
+               get_ctx(ctx);
+               list_for_each_entry(sibling, &group_leader->sibling_list,
+                                   group_entry) {
+                       perf_install_in_context(ctx, sibling, cpu);
+                       get_ctx(ctx);
+               }
+       }
+
         perf_install_in_context(ctx, event, cpu);
         ++ctx->generation;
         mutex_unlock(&ctx->mutex);
@@ -5212,11 +5698,15 @@ SYSCALL_DEFINE5(perf_event_open,
         fd_install(event_fd, event_file);
         return event_fd;
  
-err_free_put_context:
+err_context:
+       put_ctx(ctx);
+err_alloc:
         free_event(event);
-err_put_context:
+err_task:
+       if (task)
+               put_task_struct(task);
+err_group_fd:
         fput_light(group_file, fput_needed);
-       put_ctx(ctx);
  err_fd:
         put_unused_fd(event_fd);
         return err;
@@ -5227,154 +5717,54 @@ err_fd:
   *
   * @attr: attributes of the counter to create
   * @cpu: cpu in which the counter is bound
- * @pid: task to profile
+ * @task: task to profile (NULL for percpu)
   */
  struct perf_event *
  perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
-                                pid_t pid,
+                                struct task_struct *task,
                                  perf_overflow_handler_t overflow_handler)
  {
-       struct perf_event *event;
         struct perf_event_context *ctx;
-       int err;
-
-       /*
-        * Get the target context (task or percpu):
-        */
-
-       ctx = find_get_context(pid, cpu);
-       if (IS_ERR(ctx)) {
-               err = PTR_ERR(ctx);
-               goto err_exit;
-       }
-
-       event = perf_event_alloc(attr, cpu, ctx, NULL,
-                                NULL, overflow_handler, GFP_KERNEL);
-       if (IS_ERR(event)) {
-               err = PTR_ERR(event);
-               goto err_put_context;
-       }
-
-       event->filp = NULL;
-       WARN_ON_ONCE(ctx->parent_ctx);
-       mutex_lock(&ctx->mutex);
-       perf_install_in_context(ctx, event, cpu);
-       ++ctx->generation;
-       mutex_unlock(&ctx->mutex);
-
-       event->owner = current;
-       get_task_struct(current);
-       mutex_lock(&current->perf_event_mutex);
-       list_add_tail(&event->owner_entry, &current->perf_event_list);
-       mutex_unlock(&current->perf_event_mutex);
-
-       return event;
-
- err_put_context:
-       put_ctx(ctx);
- err_exit:
-       return ERR_PTR(err);
-}
-EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
-
-/*
- * inherit a event from parent task to child task:
- */
-static struct perf_event *
-inherit_event(struct perf_event *parent_event,
-             struct task_struct *parent,
-             struct perf_event_context *parent_ctx,
-             struct task_struct *child,
-             struct perf_event *group_leader,
-             struct perf_event_context *child_ctx)
-{
-       struct perf_event *child_event;
-
-       /*
-        * Instead of creating recursive hierarchies of events,
-        * we link inherited events back to the original parent,
-        * which has a filp for sure, which we use as the reference
-        * count:
-        */
-       if (parent_event->parent)
-               parent_event = parent_event->parent;
-
-       child_event = perf_event_alloc(&parent_event->attr,
-                                          parent_event->cpu, child_ctx,
-                                          group_leader, parent_event,
-                                          NULL, GFP_KERNEL);
-       if (IS_ERR(child_event))
-               return child_event;
-       get_ctx(child_ctx);
-
-       /*
-        * Make the child state follow the state of the parent event,
-        * not its attr.disabled bit.  We hold the parent's mutex,
-        * so we won't race with perf_event_{en, dis}able_family.
-        */
-       if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
-               child_event->state = PERF_EVENT_STATE_INACTIVE;
-       else
-               child_event->state = PERF_EVENT_STATE_OFF;
-
-       if (parent_event->attr.freq) {
-               u64 sample_period = parent_event->hw.sample_period;
-               struct hw_perf_event *hwc = &child_event->hw;
-
-               hwc->sample_period = sample_period;
-               hwc->last_period   = sample_period;
-
-               local64_set(&hwc->period_left, sample_period);
-       }
-
-       child_event->overflow_handler = parent_event->overflow_handler;
-
-       /*
-        * Link it up in the child's context:
-        */
-       add_event_to_ctx(child_event, child_ctx);
-
-       /*
-        * Get a reference to the parent filp - we will fput it
-        * when the child event exits. This is safe to do because
-        * we are in the parent and we know that the filp still
-        * exists and has a nonzero count:
-        */
-       atomic_long_inc(&parent_event->filp->f_count);
-
-       /*
-        * Link this into the parent event's child list
-        */
-       WARN_ON_ONCE(parent_event->ctx->parent_ctx);
-       mutex_lock(&parent_event->child_mutex);
-       list_add_tail(&child_event->child_list, &parent_event->child_list);
-       mutex_unlock(&parent_event->child_mutex);
+       struct perf_event *event;
+       int err;
  
-       return child_event;
-}
+       /*
+        * Get the target context (task or percpu):
+        */
  
-static int inherit_group(struct perf_event *parent_event,
-             struct task_struct *parent,
-             struct perf_event_context *parent_ctx,
-             struct task_struct *child,
-             struct perf_event_context *child_ctx)
-{
-       struct perf_event *leader;
-       struct perf_event *sub;
-       struct perf_event *child_ctr;
+       event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler);
+       if (IS_ERR(event)) {
+               err = PTR_ERR(event);
+               goto err;
+       }
  
-       leader = inherit_event(parent_event, parent, parent_ctx,
-                                child, NULL, child_ctx);
-       if (IS_ERR(leader))
-               return PTR_ERR(leader);
-       list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
-               child_ctr = inherit_event(sub, parent, parent_ctx,
-                                           child, leader, child_ctx);
-               if (IS_ERR(child_ctr))
-                       return PTR_ERR(child_ctr);
+       ctx = find_get_context(event->pmu, task, cpu);
+       if (IS_ERR(ctx)) {
+               err = PTR_ERR(ctx);
+               goto err_free;
         }
-       return 0;
+
+       event->filp = NULL;
+       WARN_ON_ONCE(ctx->parent_ctx);
+       mutex_lock(&ctx->mutex);
+       perf_install_in_context(ctx, event, cpu);
+       ++ctx->generation;
+       mutex_unlock(&ctx->mutex);
+
+       event->owner = current;
+       get_task_struct(current);
+       mutex_lock(&current->perf_event_mutex);
+       list_add_tail(&event->owner_entry, &current->perf_event_list);
+       mutex_unlock(&current->perf_event_mutex);
+
+       return event;
+
+err_free:
+       free_event(event);
+err:
+       return ERR_PTR(err);
  }
+EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
  
  static void sync_child_event(struct perf_event *child_event,
                                struct task_struct *child)
@@ -5432,16 +5822,13 @@ __perf_event_exit_task(struct perf_event *child_event,
         }
  }
  
-/*
- * When a child task exits, feed back event values to parent events.
- */
-void perf_event_exit_task(struct task_struct *child)
+static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
  {
         struct perf_event *child_event, *tmp;
         struct perf_event_context *child_ctx;
         unsigned long flags;
  
-       if (likely(!child->perf_event_ctxp)) {
+       if (likely(!child->perf_event_ctxp[ctxn])) {
                 perf_event_task(child, NULL, 0);
                 return;
         }
@@ -5453,8 +5840,8 @@ void perf_event_exit_task(struct task_struct *child)
          * scheduled, so we are now safe from rescheduling changing
          * our context.
          */
-       child_ctx = child->perf_event_ctxp;
-       __perf_event_task_sched_out(child_ctx);
+       child_ctx = child->perf_event_ctxp[ctxn];
+       task_ctx_sched_out(child_ctx, EVENT_ALL);
  
         /*
          * Take the context lock here so that if find_get_context is
@@ -5462,7 +5849,7 @@ void perf_event_exit_task(struct task_struct *child)
          * incremented the context's refcount before we do put_ctx below.
          */
         raw_spin_lock(&child_ctx->lock);
-       child->perf_event_ctxp = NULL;
+       child->perf_event_ctxp[ctxn] = NULL;
         /*
          * If this context is a clone; unclone it so it can't get
          * swapped to another process while we're removing all
@@ -5515,6 +5902,17 @@ again:
         put_ctx(child_ctx);
  }
  
+/*
+ * When a child task exits, feed back event values to parent events.
+ */
+void perf_event_exit_task(struct task_struct *child)
+{
+       int ctxn;
+
+       for_each_task_context_nr(ctxn)
+               perf_event_exit_task_context(child, ctxn);
+}
+
  static void perf_free_event(struct perf_event *event,
                             struct perf_event_context *ctx)
  {
@@ -5536,48 +5934,166 @@ static void perf_free_event(struct perf_event *event,
  
  /*
   * free an unexposed, unused context as created by inheritance by
- * init_task below, used by fork() in case of fail.
+ * perf_event_init_task below, used by fork() in case of fail.
   */
  void perf_event_free_task(struct task_struct *task)
  {
-       struct perf_event_context *ctx = task->perf_event_ctxp;
+       struct perf_event_context *ctx;
         struct perf_event *event, *tmp;
+       int ctxn;
  
-       if (!ctx)
-               return;
+       for_each_task_context_nr(ctxn) {
+               ctx = task->perf_event_ctxp[ctxn];
+               if (!ctx)
+                       continue;
  
-       mutex_lock(&ctx->mutex);
+               mutex_lock(&ctx->mutex);
  again:
-       list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
-               perf_free_event(event, ctx);
+               list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
+                               group_entry)
+                       perf_free_event(event, ctx);
  
-       list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
-                                group_entry)
-               perf_free_event(event, ctx);
+               list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
+                               group_entry)
+                       perf_free_event(event, ctx);
  
-       if (!list_empty(&ctx->pinned_groups) ||
-           !list_empty(&ctx->flexible_groups))
-               goto again;
+               if (!list_empty(&ctx->pinned_groups) ||
+                               !list_empty(&ctx->flexible_groups))
+                       goto again;
  
-       mutex_unlock(&ctx->mutex);
+               mutex_unlock(&ctx->mutex);
  
-       put_ctx(ctx);
+               put_ctx(ctx);
+       }
+}
+
+void perf_event_delayed_put(struct task_struct *task)
+{
+       int ctxn;
+
+       for_each_task_context_nr(ctxn)
+               WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
+}
+
+/*
+ * inherit a event from parent task to child task:
+ */
+static struct perf_event *
+inherit_event(struct perf_event *parent_event,
+             struct task_struct *parent,
+             struct perf_event_context *parent_ctx,
+             struct task_struct *child,
+             struct perf_event *group_leader,
+             struct perf_event_context *child_ctx)
+{
+       struct perf_event *child_event;
+       unsigned long flags;
+
+       /*
+        * Instead of creating recursive hierarchies of events,
+        * we link inherited events back to the original parent,
+        * which has a filp for sure, which we use as the reference
+        * count:
+        */
+       if (parent_event->parent)
+               parent_event = parent_event->parent;
+
+       child_event = perf_event_alloc(&parent_event->attr,
+                                          parent_event->cpu,
+                                          child,
+                                          group_leader, parent_event,
+                                          NULL);
+       if (IS_ERR(child_event))
+               return child_event;
+       get_ctx(child_ctx);
+
+       /*
+        * Make the child state follow the state of the parent event,
+        * not its attr.disabled bit.  We hold the parent's mutex,
+        * so we won't race with perf_event_{en, dis}able_family.
+        */
+       if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
+               child_event->state = PERF_EVENT_STATE_INACTIVE;
+       else
+               child_event->state = PERF_EVENT_STATE_OFF;
+
+       if (parent_event->attr.freq) {
+               u64 sample_period = parent_event->hw.sample_period;
+               struct hw_perf_event *hwc = &child_event->hw;
+
+               hwc->sample_period = sample_period;
+               hwc->last_period   = sample_period;
+
+               local64_set(&hwc->period_left, sample_period);
+       }
+
+       child_event->ctx = child_ctx;
+       child_event->overflow_handler = parent_event->overflow_handler;
+
+       /*
+        * Link it up in the child's context:
+        */
+       raw_spin_lock_irqsave(&child_ctx->lock, flags);
+       add_event_to_ctx(child_event, child_ctx);
+       raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
+
+       /*
+        * Get a reference to the parent filp - we will fput it
+        * when the child event exits. This is safe to do because
+        * we are in the parent and we know that the filp still
+        * exists and has a nonzero count:
+        */
+       atomic_long_inc(&parent_event->filp->f_count);
+
+       /*
+        * Link this into the parent event's child list
+        */
+       WARN_ON_ONCE(parent_event->ctx->parent_ctx);
+       mutex_lock(&parent_event->child_mutex);
+       list_add_tail(&child_event->child_list, &parent_event->child_list);
+       mutex_unlock(&parent_event->child_mutex);
+
+       return child_event;
+}
+
+static int inherit_group(struct perf_event *parent_event,
+             struct task_struct *parent,
+             struct perf_event_context *parent_ctx,
+             struct task_struct *child,
+             struct perf_event_context *child_ctx)
+{
+       struct perf_event *leader;
+       struct perf_event *sub;
+       struct perf_event *child_ctr;
+
+       leader = inherit_event(parent_event, parent, parent_ctx,
+                                child, NULL, child_ctx);
+       if (IS_ERR(leader))
+               return PTR_ERR(leader);
+       list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
+               child_ctr = inherit_event(sub, parent, parent_ctx,
+                                           child, leader, child_ctx);
+               if (IS_ERR(child_ctr))
+                       return PTR_ERR(child_ctr);
+       }
+       return 0;
  }
  
  static int
  inherit_task_group(struct perf_event *event, struct task_struct *parent,
                    struct perf_event_context *parent_ctx,
-                  struct task_struct *child,
+                  struct task_struct *child, int ctxn,
                    int *inherited_all)
  {
         int ret;
-       struct perf_event_context *child_ctx = child->perf_event_ctxp;
+       struct perf_event_context *child_ctx;
  
         if (!event->attr.inherit) {
                 *inherited_all = 0;
                 return 0;
         }
  
+               child_ctx = child->perf_event_ctxp[ctxn];
         if (!child_ctx) {
                 /*
                  * This is executed from the parent task context, so
@@ -5586,14 +6102,11 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
                  * child.
                  */
  
-               child_ctx = kzalloc(sizeof(struct perf_event_context),
-                                   GFP_KERNEL);
+               child_ctx = alloc_perf_context(event->pmu, child);
                 if (!child_ctx)
                         return -ENOMEM;
  
-               __perf_event_init_context(child_ctx, child);
-               child->perf_event_ctxp = child_ctx;
-               get_task_struct(child);
+               child->perf_event_ctxp[ctxn] = child_ctx;
         }
  
         ret = inherit_group(event, parent, parent_ctx,
@@ -5605,11 +6118,10 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
         return ret;
  }
  
-
  /*
   * Initialize the perf_event context in task_struct
   */
-int perf_event_init_task(struct task_struct *child)
+int perf_event_init_context(struct task_struct *child, int ctxn)
  {
         struct perf_event_context *child_ctx, *parent_ctx;
         struct perf_event_context *cloned_ctx;
@@ -5618,19 +6130,19 @@ int perf_event_init_task(struct task_struct *child)
         int inherited_all = 1;
         int ret = 0;
  
-       child->perf_event_ctxp = NULL;
+       child->perf_event_ctxp[ctxn] = NULL;
  
         mutex_init(&child->perf_event_mutex);
         INIT_LIST_HEAD(&child->perf_event_list);
  
-       if (likely(!parent->perf_event_ctxp))
+       if (likely(!parent->perf_event_ctxp[ctxn]))
                 return 0;
  
         /*
          * If the parent's context is a clone, pin it so it won't get
          * swapped under us.
          */
-       parent_ctx = perf_pin_task_context(parent);
+       parent_ctx = perf_pin_task_context(parent, ctxn);
  
         /*
          * No need to check if parent_ctx != NULL here; since we saw
@@ -5650,20 +6162,20 @@ int perf_event_init_task(struct task_struct *child)
          * the list, not manipulating it:
          */
         list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
-               ret = inherit_task_group(event, parent, parent_ctx, child,
-                                        &inherited_all);
+               ret = inherit_task_group(event, parent, parent_ctx,
+                                        child, ctxn, &inherited_all);
                 if (ret)
                         break;
         }
  
         list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
-               ret = inherit_task_group(event, parent, parent_ctx, child,
-                                        &inherited_all);
+               ret = inherit_task_group(event, parent, parent_ctx,
+                                        child, ctxn, &inherited_all);
                 if (ret)
                         break;
         }
  
-       child_ctx = child->perf_event_ctxp;
+       child_ctx = child->perf_event_ctxp[ctxn];
  
         if (child_ctx && inherited_all) {
                 /*
@@ -5692,63 +6204,98 @@ int perf_event_init_task(struct task_struct *child)
         return ret;
  }
  
+/*
+ * Initialize the perf_event context in task_struct
+ */
+int perf_event_init_task(struct task_struct *child)
+{
+       int ctxn, ret;
+
+       for_each_task_context_nr(ctxn) {
+               ret = perf_event_init_context(child, ctxn);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
  static void __init perf_event_init_all_cpus(void)
  {
+       struct swevent_htable *swhash;
         int cpu;
-       struct perf_cpu_context *cpuctx;
  
         for_each_possible_cpu(cpu) {
-               cpuctx = &per_cpu(perf_cpu_context, cpu);
-               mutex_init(&cpuctx->hlist_mutex);
-               __perf_event_init_context(&cpuctx->ctx, NULL);
+               swhash = &per_cpu(swevent_htable, cpu);
+               mutex_init(&swhash->hlist_mutex);
+               INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
         }
  }
  
  static void __cpuinit perf_event_init_cpu(int cpu)
  {
-       struct perf_cpu_context *cpuctx;
-
-       cpuctx = &per_cpu(perf_cpu_context, cpu);
+       struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
  
-       spin_lock(&perf_resource_lock);
-       cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
-       spin_unlock(&perf_resource_lock);
-
-       mutex_lock(&cpuctx->hlist_mutex);
-       if (cpuctx->hlist_refcount > 0) {
+       mutex_lock(&swhash->hlist_mutex);
+       if (swhash->hlist_refcount > 0) {
                 struct swevent_hlist *hlist;
  
-               hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
-               WARN_ON_ONCE(!hlist);
-               rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+               hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
+               WARN_ON(!hlist);
+               rcu_assign_pointer(swhash->swevent_hlist, hlist);
         }
-       mutex_unlock(&cpuctx->hlist_mutex);
+       mutex_unlock(&swhash->hlist_mutex);
  }
  
  #ifdef CONFIG_HOTPLUG_CPU
-static void __perf_event_exit_cpu(void *info)
+static void perf_pmu_rotate_stop(struct pmu *pmu)
  {
-       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-       struct perf_event_context *ctx = &cpuctx->ctx;
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+       WARN_ON(!irqs_disabled());
+
+       list_del_init(&cpuctx->rotation_list);
+}
+
+static void __perf_event_exit_context(void *__info)
+{
+       struct perf_event_context *ctx = __info;
         struct perf_event *event, *tmp;
  
+       perf_pmu_rotate_stop(ctx->pmu);
+
         list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
                 __perf_event_remove_from_context(event);
         list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
                 __perf_event_remove_from_context(event);
  }
+
+static void perf_event_exit_cpu_context(int cpu)
+{
+       struct perf_event_context *ctx;
+       struct pmu *pmu;
+       int idx;
+
+       idx = srcu_read_lock(&pmus_srcu);
+       list_for_each_entry_rcu(pmu, &pmus, entry) {
+               ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
+
+               mutex_lock(&ctx->mutex);
+               smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+               mutex_unlock(&ctx->mutex);
+       }
+       srcu_read_unlock(&pmus_srcu, idx);
+}
+
  static void perf_event_exit_cpu(int cpu)
  {
-       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
-       struct perf_event_context *ctx = &cpuctx->ctx;
+       struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
  
-       mutex_lock(&cpuctx->hlist_mutex);
-       swevent_hlist_release(cpuctx);
-       mutex_unlock(&cpuctx->hlist_mutex);
+       mutex_lock(&swhash->hlist_mutex);
+       swevent_hlist_release(swhash);
+       mutex_unlock(&swhash->hlist_mutex);
  
-       mutex_lock(&ctx->mutex);
-       smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
-       mutex_unlock(&ctx->mutex);
+       perf_event_exit_cpu_context(cpu);
  }
  #else
  static inline void perf_event_exit_cpu(int cpu) { }
@@ -5778,118 +6325,13 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
         return NOTIFY_OK;
  }
  
-/*
- * This has to have a higher priority than migration_notifier in sched.c.
- */
-static struct notifier_block __cpuinitdata perf_cpu_nb = {
-       .notifier_call          = perf_cpu_notify,
-       .priority               = 20,
-};
-
  void __init perf_event_init(void)
  {
         perf_event_init_all_cpus();
-       perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
-                       (void *)(long)smp_processor_id());
-       perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
-                       (void *)(long)smp_processor_id());
-       register_cpu_notifier(&perf_cpu_nb);
-}
-
-static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
-                                       struct sysdev_class_attribute *attr,
-                                       char *buf)
-{
-       return sprintf(buf, "%d\n", perf_reserved_percpu);
-}
-
-static ssize_t
-perf_set_reserve_percpu(struct sysdev_class *class,
-                       struct sysdev_class_attribute *attr,
-                       const char *buf,
-                       size_t count)
-{
-       struct perf_cpu_context *cpuctx;
-       unsigned long val;
-       int err, cpu, mpt;
-
-       err = strict_strtoul(buf, 10, &val);
-       if (err)
-               return err;
-       if (val > perf_max_events)
-               return -EINVAL;
-
-       spin_lock(&perf_resource_lock);
-       perf_reserved_percpu = val;
-       for_each_online_cpu(cpu) {
-               cpuctx = &per_cpu(perf_cpu_context, cpu);
-               raw_spin_lock_irq(&cpuctx->ctx.lock);
-               mpt = min(perf_max_events - cpuctx->ctx.nr_events,
-                         perf_max_events - perf_reserved_percpu);
-               cpuctx->max_pertask = mpt;
-               raw_spin_unlock_irq(&cpuctx->ctx.lock);
-       }
-       spin_unlock(&perf_resource_lock);
-
-       return count;
-}
-
-static ssize_t perf_show_overcommit(struct sysdev_class *class,
-                                   struct sysdev_class_attribute *attr,
-                                   char *buf)
-{
-       return sprintf(buf, "%d\n", perf_overcommit);
-}
-
-static ssize_t
-perf_set_overcommit(struct sysdev_class *class,
-                   struct sysdev_class_attribute *attr,
-                   const char *buf, size_t count)
-{
-       unsigned long val;
-       int err;
-
-       err = strict_strtoul(buf, 10, &val);
-       if (err)
-               return err;
-       if (val > 1)
-               return -EINVAL;
-
-       spin_lock(&perf_resource_lock);
-       perf_overcommit = val;
-       spin_unlock(&perf_resource_lock);
-
-       return count;
-}
-
-static SYSDEV_CLASS_ATTR(
-                               reserve_percpu,
-                               0644,
-                               perf_show_reserve_percpu,
-                               perf_set_reserve_percpu
-                       );
-
-static SYSDEV_CLASS_ATTR(
-                               overcommit,
-                               0644,
-                               perf_show_overcommit,
-                               perf_set_overcommit
-                       );
-
-static struct attribute *perfclass_attrs[] = {
-       &attr_reserve_percpu.attr,
-       &attr_overcommit.attr,
-       NULL
-};
-
-static struct attribute_group perfclass_attr_group = {
-       .attrs                  = perfclass_attrs,
-       .name                   = "perf_events",
-};
-
-static int __init perf_event_sysfs_init(void)
-{
-       return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
-                                 &perfclass_attr_group);
+       init_srcu_struct(&pmus_srcu);
+       perf_pmu_register(&perf_swevent);
+       perf_pmu_register(&perf_cpu_clock);
+       perf_pmu_register(&perf_task_clock);
+       perf_tp_register();
+       perf_cpu_notifier(perf_cpu_notify);
  }
-device_initcall(perf_event_sysfs_init);
diff --git a/kernel/pid.c b/kernel/pid.c

index d55c6fb8d087a24a2d462886dfe1fc53bf9deced..39b65b69584f5b0f373e360d6ecc3118751840a9 100644 (file)
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -401,7 +401,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
         struct task_struct *result = NULL;
         if (pid) {
                 struct hlist_node *first;
-               first = rcu_dereference_check(pid->tasks[type].first,
+               first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
                                               rcu_read_lock_held() ||
                                               lockdep_tasklist_lock_is_held());
                 if (first)
@@ -416,6 +416,7 @@ EXPORT_SYMBOL(pid_task);
   */
  struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
  {
+       rcu_lockdep_assert(rcu_read_lock_held());
         return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
  }
  
diff --git a/kernel/printk.c b/kernel/printk.c

index 8fe465ac008aebdcdbb0e9db9fcd644d5b12c231..2531017795f63e7d9c60d1db976e2b227c98ffda 100644 (file)
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -85,7 +85,7 @@ EXPORT_SYMBOL(oops_in_progress);
   * provides serialisation for access to the entire console
   * driver system.
   */
-static DECLARE_MUTEX(console_sem);
+static DEFINE_SEMAPHORE(console_sem);
  struct console *console_drivers;
  EXPORT_SYMBOL_GPL(console_drivers);
  
@@ -556,7 +556,7 @@ static void zap_locks(void)
         /* If a crash is occurring, make sure we can't deadlock */
         spin_lock_init(&logbuf_lock);
         /* And make sure that we print immediately */
-       init_MUTEX(&console_sem);
+       sema_init(&console_sem, 1);
  }
  
  #if defined(CONFIG_PRINTK_TIME)
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c

index 4d169835fb362dcd6eb52a916a0d8e85e48c8fb8..a23a57a976d1a46f69cb2c32ecc38dd78e5e8b08 100644 (file)
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -73,12 +73,14 @@ int debug_lockdep_rcu_enabled(void)
  EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
  
  /**
- * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section?
+ * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
   *
   * Check for bottom half being disabled, which covers both the
   * CONFIG_PROVE_RCU and not cases.  Note that if someone uses
   * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
- * will show the situation.
+ * will show the situation.  This is useful for debug checks in functions
+ * that require that they be called within an RCU read-side critical
+ * section.
   *
   * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
   */
@@ -86,7 +88,7 @@ int rcu_read_lock_bh_held(void)
  {
         if (!debug_lockdep_rcu_enabled())
                 return 1;
-       return in_softirq();
+       return in_softirq() || irqs_disabled();
  }
  EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
  
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c

index 196ec02f8be0c1ad884b7d69599005a6fc3a4429..d806735342acb10bc3e3ae787e62ade34f1d5955 100644 (file)
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -59,6 +59,14 @@ int rcu_scheduler_active __read_mostly;
  EXPORT_SYMBOL_GPL(rcu_scheduler_active);
  #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
  
+/* Forward declarations for rcutiny_plugin.h. */
+static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
+static void __call_rcu(struct rcu_head *head,
+                      void (*func)(struct rcu_head *rcu),
+                      struct rcu_ctrlblk *rcp);
+
+#include "rcutiny_plugin.h"
+
  #ifdef CONFIG_NO_HZ
  
  static long rcu_dynticks_nesting = 1;
@@ -140,6 +148,7 @@ void rcu_check_callbacks(int cpu, int user)
                 rcu_sched_qs(cpu);
         else if (!in_softirq())
                 rcu_bh_qs(cpu);
+       rcu_preempt_check_callbacks();
  }
  
  /*
@@ -162,6 +171,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
         *rcp->donetail = NULL;
         if (rcp->curtail == rcp->donetail)
                 rcp->curtail = &rcp->rcucblist;
+       rcu_preempt_remove_callbacks(rcp);
         rcp->donetail = &rcp->rcucblist;
         local_irq_restore(flags);
  
@@ -182,6 +192,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
  {
         __rcu_process_callbacks(&rcu_sched_ctrlblk);
         __rcu_process_callbacks(&rcu_bh_ctrlblk);
+       rcu_preempt_process_callbacks();
  }
  
  /*
@@ -223,15 +234,15 @@ static void __call_rcu(struct rcu_head *head,
  }
  
  /*
- * Post an RCU callback to be invoked after the end of an RCU grace
+ * Post an RCU callback to be invoked after the end of an RCU-sched grace
   * period.  But since we have but one CPU, that would be after any
   * quiescent state.
   */
-void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
  {
         __call_rcu(head, func, &rcu_sched_ctrlblk);
  }
-EXPORT_SYMBOL_GPL(call_rcu);
+EXPORT_SYMBOL_GPL(call_rcu_sched);
  
  /*
   * Post an RCU bottom-half callback to be invoked after any subsequent
@@ -243,20 +254,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
  }
  EXPORT_SYMBOL_GPL(call_rcu_bh);
  
-void rcu_barrier(void)
-{
-       struct rcu_synchronize rcu;
-
-       init_rcu_head_on_stack(&rcu.head);
-       init_completion(&rcu.completion);
-       /* Will wake me after RCU finished. */
-       call_rcu(&rcu.head, wakeme_after_rcu);
-       /* Wait for it. */
-       wait_for_completion(&rcu.completion);
-       destroy_rcu_head_on_stack(&rcu.head);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier);
-
  void rcu_barrier_bh(void)
  {
         struct rcu_synchronize rcu;
@@ -289,5 +286,3 @@ void __init rcu_init(void)
  {
         open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
  }
-
-#include "rcutiny_plugin.h"
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h

index d223a92bc7427ffd098f8c49d524974eb12902fe..6ceca4f745ffa1f4535c69467ea59704e2ddbe97 100644 (file)
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -1,7 +1,7 @@
  /*
- * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
   * Internal non-public definitions that provide either classic
- * or preemptable semantics.
+ * or preemptible semantics.
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
@@ -17,11 +17,587 @@
   * along with this program; if not, write to the Free Software
   * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
   *
- * Copyright IBM Corporation, 2009
+ * Copyright (c) 2010 Linaro
   *
   * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
   */
  
+#ifdef CONFIG_TINY_PREEMPT_RCU
+
+#include <linux/delay.h>
+
+/* Global control variables for preemptible RCU. */
+struct rcu_preempt_ctrlblk {
+       struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */
+       struct rcu_head **nexttail;
+                               /* Tasks blocked in a preemptible RCU */
+                               /*  read-side critical section while an */
+                               /*  preemptible-RCU grace period is in */
+                               /*  progress must wait for a later grace */
+                               /*  period.  This pointer points to the */
+                               /*  ->next pointer of the last task that */
+                               /*  must wait for a later grace period, or */
+                               /*  to &->rcb.rcucblist if there is no */
+                               /*  such task. */
+       struct list_head blkd_tasks;
+                               /* Tasks blocked in RCU read-side critical */
+                               /*  section.  Tasks are placed at the head */
+                               /*  of this list and age towards the tail. */
+       struct list_head *gp_tasks;
+                               /* Pointer to the first task blocking the */
+                               /*  current grace period, or NULL if there */
+                               /*  is not such task. */
+       struct list_head *exp_tasks;
+                               /* Pointer to first task blocking the */
+                               /*  current expedited grace period, or NULL */
+                               /*  if there is no such task.  If there */
+                               /*  is no current expedited grace period, */
+                               /*  then there cannot be any such task. */
+       u8 gpnum;               /* Current grace period. */
+       u8 gpcpu;               /* Last grace period blocked by the CPU. */
+       u8 completed;           /* Last grace period completed. */
+                               /*  If all three are equal, RCU is idle. */
+};
+
+static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
+       .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist,
+       .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
+       .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
+       .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
+};
+
+static int rcu_preempted_readers_exp(void);
+static void rcu_report_exp_done(void);
+
+/*
+ * Return true if the CPU has not yet responded to the current grace period.
+ */
+static int rcu_cpu_blocking_cur_gp(void)
+{
+       return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum;
+}
+
+/*
+ * Check for a running RCU reader.  Because there is only one CPU,
+ * there can be but one running RCU reader at a time.  ;-)
+ */
+static int rcu_preempt_running_reader(void)
+{
+       return current->rcu_read_lock_nesting;
+}
+
+/*
+ * Check for preempted RCU readers blocking any grace period.
+ * If the caller needs a reliable answer, it must disable hard irqs.
+ */
+static int rcu_preempt_blocked_readers_any(void)
+{
+       return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks);
+}
+
+/*
+ * Check for preempted RCU readers blocking the current grace period.
+ * If the caller needs a reliable answer, it must disable hard irqs.
+ */
+static int rcu_preempt_blocked_readers_cgp(void)
+{
+       return rcu_preempt_ctrlblk.gp_tasks != NULL;
+}
+
+/*
+ * Return true if another preemptible-RCU grace period is needed.
+ */
+static int rcu_preempt_needs_another_gp(void)
+{
+       return *rcu_preempt_ctrlblk.rcb.curtail != NULL;
+}
+
+/*
+ * Return true if a preemptible-RCU grace period is in progress.
+ * The caller must disable hardirqs.
+ */
+static int rcu_preempt_gp_in_progress(void)
+{
+       return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum;
+}
+
+/*
+ * Record a preemptible-RCU quiescent state for the specified CPU.  Note
+ * that this just means that the task currently running on the CPU is
+ * in a quiescent state.  There might be any number of tasks blocked
+ * while in an RCU read-side critical section.
+ *
+ * Unlike the other rcu_*_qs() functions, callers to this function
+ * must disable irqs in order to protect the assignment to
+ * ->rcu_read_unlock_special.
+ *
+ * Because this is a single-CPU implementation, the only way a grace
+ * period can end is if the CPU is in a quiescent state.  The reason is
+ * that a blocked preemptible-RCU reader can exit its critical section
+ * only if the CPU is running it at the time.  Therefore, when the
+ * last task blocking the current grace period exits its RCU read-side
+ * critical section, neither the CPU nor blocked tasks will be stopping
+ * the current grace period.  (In contrast, SMP implementations
+ * might have CPUs running in RCU read-side critical sections that
+ * block later grace periods -- but this is not possible given only
+ * one CPU.)
+ */
+static void rcu_preempt_cpu_qs(void)
+{
+       /* Record both CPU and task as having responded to current GP. */
+       rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
+       current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
+
+       /*
+        * If there is no GP, or if blocked readers are still blocking GP,
+        * then there is nothing more to do.
+        */
+       if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp())
+               return;
+
+       /* Advance callbacks. */
+       rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum;
+       rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail;
+       rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail;
+
+       /* If there are no blocked readers, next GP is done instantly. */
+       if (!rcu_preempt_blocked_readers_any())
+               rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
+
+       /* If there are done callbacks, make RCU_SOFTIRQ process them. */
+       if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
+               raise_softirq(RCU_SOFTIRQ);
+}
+
+/*
+ * Start a new RCU grace period if warranted.  Hard irqs must be disabled.
+ */
+static void rcu_preempt_start_gp(void)
+{
+       if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) {
+
+               /* Official start of GP. */
+               rcu_preempt_ctrlblk.gpnum++;
+
+               /* Any blocked RCU readers block new GP. */
+               if (rcu_preempt_blocked_readers_any())
+                       rcu_preempt_ctrlblk.gp_tasks =
+                               rcu_preempt_ctrlblk.blkd_tasks.next;
+
+               /* If there is no running reader, CPU is done with GP. */
+               if (!rcu_preempt_running_reader())
+                       rcu_preempt_cpu_qs();
+       }
+}
+
+/*
+ * We have entered the scheduler, and the current task might soon be
+ * context-switched away from.  If this task is in an RCU read-side
+ * critical section, we will no longer be able to rely on the CPU to
+ * record that fact, so we enqueue the task on the blkd_tasks list.
+ * If the task started after the current grace period began, as recorded
+ * by ->gpcpu, we enqueue at the beginning of the list.  Otherwise
+ * before the element referenced by ->gp_tasks (or at the tail if
+ * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element.
+ * The task will dequeue itself when it exits the outermost enclosing
+ * RCU read-side critical section.  Therefore, the current grace period
+ * cannot be permitted to complete until the ->gp_tasks pointer becomes
+ * NULL.
+ *
+ * Caller must disable preemption.
+ */
+void rcu_preempt_note_context_switch(void)
+{
+       struct task_struct *t = current;
+       unsigned long flags;
+
+       local_irq_save(flags); /* must exclude scheduler_tick(). */
+       if (rcu_preempt_running_reader() &&
+           (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
+
+               /* Possibly blocking in an RCU read-side critical section. */
+               t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
+
+               /*
+                * If this CPU has already checked in, then this task
+                * will hold up the next grace period rather than the
+                * current grace period.  Queue the task accordingly.
+                * If the task is queued for the current grace period
+                * (i.e., this CPU has not yet passed through a quiescent
+                * state for the current grace period), then as long
+                * as that task remains queued, the current grace period
+                * cannot end.
+                */
+               list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
+               if (rcu_cpu_blocking_cur_gp())
+                       rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
+       }
+
+       /*
+        * Either we were not in an RCU read-side critical section to
+        * begin with, or we have now recorded that critical section
+        * globally.  Either way, we can now note a quiescent state
+        * for this CPU.  Again, if we were in an RCU read-side critical
+        * section, and if that critical section was blocking the current
+        * grace period, then the fact that the task has been enqueued
+        * means that current grace period continues to be blocked.
+        */
+       rcu_preempt_cpu_qs();
+       local_irq_restore(flags);
+}
+
+/*
+ * Tiny-preemptible RCU implementation for rcu_read_lock().
+ * Just increment ->rcu_read_lock_nesting, shared state will be updated
+ * if we block.
+ */
+void __rcu_read_lock(void)
+{
+       current->rcu_read_lock_nesting++;
+       barrier();  /* needed if we ever invoke rcu_read_lock in rcutiny.c */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_lock);
+
+/*
+ * Handle special cases during rcu_read_unlock(), such as needing to
+ * notify RCU core processing or task having blocked during the RCU
+ * read-side critical section.
+ */
+static void rcu_read_unlock_special(struct task_struct *t)
+{
+       int empty;
+       int empty_exp;
+       unsigned long flags;
+       struct list_head *np;
+       int special;
+
+       /*
+        * NMI handlers cannot block and cannot safely manipulate state.
+        * They therefore cannot possibly be special, so just leave.
+        */
+       if (in_nmi())
+               return;
+
+       local_irq_save(flags);
+
+       /*
+        * If RCU core is waiting for this CPU to exit critical section,
+        * let it know that we have done so.
+        */
+       special = t->rcu_read_unlock_special;
+       if (special & RCU_READ_UNLOCK_NEED_QS)
+               rcu_preempt_cpu_qs();
+
+       /* Hardware IRQ handlers cannot block. */
+       if (in_irq()) {
+               local_irq_restore(flags);
+               return;
+       }
+
+       /* Clean up if blocked during RCU read-side critical section. */
+       if (special & RCU_READ_UNLOCK_BLOCKED) {
+               t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
+
+               /*
+                * Remove this task from the ->blkd_tasks list and adjust
+                * any pointers that might have been referencing it.
+                */
+               empty = !rcu_preempt_blocked_readers_cgp();
+               empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
+               np = t->rcu_node_entry.next;
+               if (np == &rcu_preempt_ctrlblk.blkd_tasks)
+                       np = NULL;
+               list_del(&t->rcu_node_entry);
+               if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
+                       rcu_preempt_ctrlblk.gp_tasks = np;
+               if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
+                       rcu_preempt_ctrlblk.exp_tasks = np;
+               INIT_LIST_HEAD(&t->rcu_node_entry);
+
+               /*
+                * If this was the last task on the current list, and if
+                * we aren't waiting on the CPU, report the quiescent state
+                * and start a new grace period if needed.
+                */
+               if (!empty && !rcu_preempt_blocked_readers_cgp()) {
+                       rcu_preempt_cpu_qs();
+                       rcu_preempt_start_gp();
+               }
+
+               /*
+                * If this was the last task on the expedited lists,
+                * then we need wake up the waiting task.
+                */
+               if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
+                       rcu_report_exp_done();
+       }
+       local_irq_restore(flags);
+}
+
+/*
+ * Tiny-preemptible RCU implementation for rcu_read_unlock().
+ * Decrement ->rcu_read_lock_nesting.  If the result is zero (outermost
+ * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
+ * invoke rcu_read_unlock_special() to clean up after a context switch
+ * in an RCU read-side critical section and other special cases.
+ */
+void __rcu_read_unlock(void)
+{
+       struct task_struct *t = current;
+
+       barrier();  /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
+       --t->rcu_read_lock_nesting;
+       barrier();  /* decrement before load of ->rcu_read_unlock_special */
+       if (t->rcu_read_lock_nesting == 0 &&
+           unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
+               rcu_read_unlock_special(t);
+#ifdef CONFIG_PROVE_LOCKING
+       WARN_ON_ONCE(t->rcu_read_lock_nesting < 0);
+#endif /* #ifdef CONFIG_PROVE_LOCKING */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_unlock);
+
+/*
+ * Check for a quiescent state from the current CPU.  When a task blocks,
+ * the task is recorded in the rcu_preempt_ctrlblk structure, which is
+ * checked elsewhere.  This is called from the scheduling-clock interrupt.
+ *
+ * Caller must disable hard irqs.
+ */
+static void rcu_preempt_check_callbacks(void)
+{
+       struct task_struct *t = current;
+
+       if (rcu_preempt_gp_in_progress() &&
+           (!rcu_preempt_running_reader() ||
+            !rcu_cpu_blocking_cur_gp()))
+               rcu_preempt_cpu_qs();
+       if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
+           rcu_preempt_ctrlblk.rcb.donetail)
+               raise_softirq(RCU_SOFTIRQ);
+       if (rcu_preempt_gp_in_progress() &&
+           rcu_cpu_blocking_cur_gp() &&
+           rcu_preempt_running_reader())
+               t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
+}
+
+/*
+ * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
+ * update, so this is invoked from __rcu_process_callbacks() to
+ * handle that case.  Of course, it is invoked for all flavors of
+ * RCU, but RCU callbacks can appear only on one of the lists, and
+ * neither ->nexttail nor ->donetail can possibly be NULL, so there
+ * is no need for an explicit check.
+ */
+static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
+{
+       if (rcu_preempt_ctrlblk.nexttail == rcp->donetail)
+               rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist;
+}
+
+/*
+ * Process callbacks for preemptible RCU.
+ */
+static void rcu_preempt_process_callbacks(void)
+{
+       __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
+}
+
+/*
+ * Queue a preemptible -RCU callback for invocation after a grace period.
+ */
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+       unsigned long flags;
+
+       debug_rcu_head_queue(head);
+       head->func = func;
+       head->next = NULL;
+
+       local_irq_save(flags);
+       *rcu_preempt_ctrlblk.nexttail = head;
+       rcu_preempt_ctrlblk.nexttail = &head->next;
+       rcu_preempt_start_gp();  /* checks to see if GP needed. */
+       local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+void rcu_barrier(void)
+{
+       struct rcu_synchronize rcu;
+
+       init_rcu_head_on_stack(&rcu.head);
+       init_completion(&rcu.completion);
+       /* Will wake me after RCU finished. */
+       call_rcu(&rcu.head, wakeme_after_rcu);
+       /* Wait for it. */
+       wait_for_completion(&rcu.completion);
+       destroy_rcu_head_on_stack(&rcu.head);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier);
+
+/*
+ * synchronize_rcu - wait until a grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full grace
+ * period has elapsed, in other words after all currently executing RCU
+ * read-side critical sections have completed.  RCU read-side critical
+ * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * and may be nested.
+ */
+void synchronize_rcu(void)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+       if (!rcu_scheduler_active)
+               return;
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+       WARN_ON_ONCE(rcu_preempt_running_reader());
+       if (!rcu_preempt_blocked_readers_any())
+               return;
+
+       /* Once we get past the fastpath checks, same code as rcu_barrier(). */
+       rcu_barrier();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu);
+
+static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
+static unsigned long sync_rcu_preempt_exp_count;
+static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
+
+/*
+ * Return non-zero if there are any tasks in RCU read-side critical
+ * sections blocking the current preemptible-RCU expedited grace period.
+ * If there is no preemptible-RCU expedited grace period currently in
+ * progress, returns zero unconditionally.
+ */
+static int rcu_preempted_readers_exp(void)
+{
+       return rcu_preempt_ctrlblk.exp_tasks != NULL;
+}
+
+/*
+ * Report the exit from RCU read-side critical section for the last task
+ * that queued itself during or before the current expedited preemptible-RCU
+ * grace period.
+ */
+static void rcu_report_exp_done(void)
+{
+       wake_up(&sync_rcu_preempt_exp_wq);
+}
+
+/*
+ * Wait for an rcu-preempt grace period, but expedite it.  The basic idea
+ * is to rely in the fact that there is but one CPU, and that it is
+ * illegal for a task to invoke synchronize_rcu_expedited() while in a
+ * preemptible-RCU read-side critical section.  Therefore, any such
+ * critical sections must correspond to blocked tasks, which must therefore
+ * be on the ->blkd_tasks list.  So just record the current head of the
+ * list in the ->exp_tasks pointer, and wait for all tasks including and
+ * after the task pointed to by ->exp_tasks to drain.
+ */
+void synchronize_rcu_expedited(void)
+{
+       unsigned long flags;
+       struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk;
+       unsigned long snap;
+
+       barrier(); /* ensure prior action seen before grace period. */
+
+       WARN_ON_ONCE(rcu_preempt_running_reader());
+
+       /*
+        * Acquire lock so that there is only one preemptible RCU grace
+        * period in flight.  Of course, if someone does the expedited
+        * grace period for us while we are acquiring the lock, just leave.
+        */
+       snap = sync_rcu_preempt_exp_count + 1;
+       mutex_lock(&sync_rcu_preempt_exp_mutex);
+       if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count))
+               goto unlock_mb_ret; /* Others did our work for us. */
+
+       local_irq_save(flags);
+
+       /*
+        * All RCU readers have to already be on blkd_tasks because
+        * we cannot legally be executing in an RCU read-side critical
+        * section.
+        */
+
+       /* Snapshot current head of ->blkd_tasks list. */
+       rpcp->exp_tasks = rpcp->blkd_tasks.next;
+       if (rpcp->exp_tasks == &rpcp->blkd_tasks)
+               rpcp->exp_tasks = NULL;
+       local_irq_restore(flags);
+
+       /* Wait for tail of ->blkd_tasks list to drain. */
+       if (rcu_preempted_readers_exp())
+               wait_event(sync_rcu_preempt_exp_wq,
+                          !rcu_preempted_readers_exp());
+
+       /* Clean up and exit. */
+       barrier(); /* ensure expedited GP seen before counter increment. */
+       sync_rcu_preempt_exp_count++;
+unlock_mb_ret:
+       mutex_unlock(&sync_rcu_preempt_exp_mutex);
+       barrier(); /* ensure subsequent action seen after grace period. */
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+/*
+ * Does preemptible RCU need the CPU to stay out of dynticks mode?
+ */
+int rcu_preempt_needs_cpu(void)
+{
+       if (!rcu_preempt_running_reader())
+               rcu_preempt_cpu_qs();
+       return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
+}
+
+/*
+ * Check for a task exiting while in a preemptible -RCU read-side
+ * critical section, clean up if so.  No need to issue warnings,
+ * as debug_check_no_locks_held() already does this if lockdep
+ * is enabled.
+ */
+void exit_rcu(void)
+{
+       struct task_struct *t = current;
+
+       if (t->rcu_read_lock_nesting == 0)
+               return;
+       t->rcu_read_lock_nesting = 1;
+       rcu_read_unlock();
+}
+
+#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
+
+/*
+ * Because preemptible RCU does not exist, it never has any callbacks
+ * to check.
+ */
+static void rcu_preempt_check_callbacks(void)
+{
+}
+
+/*
+ * Because preemptible RCU does not exist, it never has any callbacks
+ * to remove.
+ */
+static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
+{
+}
+
+/*
+ * Because preemptible RCU does not exist, it never has any callbacks
+ * to process.
+ */
+static void rcu_preempt_process_callbacks(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
+
  #ifdef CONFIG_DEBUG_LOCK_ALLOC
  
  #include <linux/kernel_stat.h>
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c

index 2e2726d790b98eff18d88d71dbeba8caf8ec7f7a..9d8e8fb2515f4e4801c214841a7f8c95b8b45ffe 100644 (file)
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -120,7 +120,7 @@ struct rcu_torture {
  };
  
  static LIST_HEAD(rcu_torture_freelist);
-static struct rcu_torture *rcu_torture_current;
+static struct rcu_torture __rcu *rcu_torture_current;
  static long rcu_torture_current_version;
  static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
  static DEFINE_SPINLOCK(rcu_torture_lock);
@@ -153,8 +153,10 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
  #define FULLSTOP_SHUTDOWN 1    /* System shutdown with rcutorture running. */
  #define FULLSTOP_RMMOD    2    /* Normal rmmod of rcutorture. */
  static int fullstop = FULLSTOP_RMMOD;
-DEFINE_MUTEX(fullstop_mutex);  /* Protect fullstop transitions and spawning */
-                               /*  of kthreads. */
+/*
+ * Protect fullstop transitions and spawning of kthreads.
+ */
+static DEFINE_MUTEX(fullstop_mutex);
  
  /*
   * Detect and respond to a system shutdown.
@@ -303,6 +305,10 @@ static void rcu_read_delay(struct rcu_random_state *rrsp)
                 mdelay(longdelay_ms);
         if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
                 udelay(shortdelay_us);
+#ifdef CONFIG_PREEMPT
+       if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000)))
+               preempt_schedule();  /* No QS if preempt_disable() in effect */
+#endif
  }
  
  static void rcu_torture_read_unlock(int idx) __releases(RCU)
@@ -536,6 +542,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
         delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick);
         if (!delay)
                 schedule_timeout_interruptible(longdelay);
+       else
+               rcu_read_delay(rrsp);
  }
  
  static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
@@ -731,7 +739,8 @@ rcu_torture_writer(void *arg)
                         continue;
                 rp->rtort_pipe_count = 0;
                 udelay(rcu_random(&rand) & 0x3ff);
-               old_rp = rcu_torture_current;
+               old_rp = rcu_dereference_check(rcu_torture_current,
+                                              current == writer_task);
                 rp->rtort_mbtest = 1;
                 rcu_assign_pointer(rcu_torture_current, rp);
                 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
diff --git a/kernel/rcutree.c b/kernel/rcutree.c

index d5bc43976c5ad202fa41be0456797d40bdde0c0d..ccdc04c479815addc8dbacea69643174a4636670 100644 (file)
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -143,6 +143,11 @@ module_param(blimit, int, 0);
  module_param(qhimark, int, 0);
  module_param(qlowmark, int, 0);
  
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT;
+module_param(rcu_cpu_stall_suppress, int, 0644);
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
  static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
  static int rcu_pending(int cpu);
  
@@ -450,7 +455,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
  
  #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
  
-int rcu_cpu_stall_panicking __read_mostly;
+int rcu_cpu_stall_suppress __read_mostly;
  
  static void record_gp_stall_check_time(struct rcu_state *rsp)
  {
@@ -482,8 +487,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
         rcu_print_task_stall(rnp);
         raw_spin_unlock_irqrestore(&rnp->lock, flags);
  
-       /* OK, time to rat on our buddy... */
-
+       /*
+        * OK, time to rat on our buddy...
+        * See Documentation/RCU/stallwarn.txt for info on how to debug
+        * RCU CPU stall warnings.
+        */
         printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
                rsp->name);
         rcu_for_each_leaf_node(rsp, rnp) {
@@ -512,6 +520,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
         unsigned long flags;
         struct rcu_node *rnp = rcu_get_root(rsp);
  
+       /*
+        * OK, time to rat on ourselves...
+        * See Documentation/RCU/stallwarn.txt for info on how to debug
+        * RCU CPU stall warnings.
+        */
         printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
                rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
         trigger_all_cpu_backtrace();
@@ -530,11 +543,11 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
         long delta;
         struct rcu_node *rnp;
  
-       if (rcu_cpu_stall_panicking)
+       if (rcu_cpu_stall_suppress)
                 return;
-       delta = jiffies - rsp->jiffies_stall;
+       delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
         rnp = rdp->mynode;
-       if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
+       if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) {
  
                 /* We haven't checked in, so go dump stack. */
                 print_cpu_stall(rsp);
@@ -548,10 +561,26 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
  
  static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
  {
-       rcu_cpu_stall_panicking = 1;
+       rcu_cpu_stall_suppress = 1;
         return NOTIFY_DONE;
  }
  
+/**
+ * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
+ *
+ * Set the stall-warning timeout way off into the future, thus preventing
+ * any RCU CPU stall-warning messages from appearing in the current set of
+ * RCU grace periods.
+ *
+ * The caller must disable hard irqs.
+ */
+void rcu_cpu_stall_reset(void)
+{
+       rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
+       rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
+       rcu_preempt_stall_reset();
+}
+
  static struct notifier_block rcu_panic_block = {
         .notifier_call = rcu_panic,
  };
@@ -571,6 +600,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
  {
  }
  
+void rcu_cpu_stall_reset(void)
+{
+}
+
  static void __init check_cpu_stall_init(void)
  {
  }
@@ -712,7 +745,7 @@ static void
  rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
         __releases(rcu_get_root(rsp)->lock)
  {
-       struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+       struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
         struct rcu_node *rnp = rcu_get_root(rsp);
  
         if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
@@ -960,7 +993,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
  static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
  {
         int i;
-       struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+       struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
  
         if (rdp->nxtlist == NULL)
                 return;  /* irqs disabled, so comparison is stable. */
@@ -971,6 +1004,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
         for (i = 0; i < RCU_NEXT_SIZE; i++)
                 rdp->nxttail[i] = &rdp->nxtlist;
         rsp->orphan_qlen += rdp->qlen;
+       rdp->n_cbs_orphaned += rdp->qlen;
         rdp->qlen = 0;
         raw_spin_unlock(&rsp->onofflock);  /* irqs remain disabled. */
  }
@@ -984,7 +1018,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
         struct rcu_data *rdp;
  
         raw_spin_lock_irqsave(&rsp->onofflock, flags);
-       rdp = rsp->rda[smp_processor_id()];
+       rdp = this_cpu_ptr(rsp->rda);
         if (rsp->orphan_cbs_list == NULL) {
                 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
                 return;
@@ -992,6 +1026,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
         *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
         rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
         rdp->qlen += rsp->orphan_qlen;
+       rdp->n_cbs_adopted += rsp->orphan_qlen;
         rsp->orphan_cbs_list = NULL;
         rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
         rsp->orphan_qlen = 0;
@@ -1007,7 +1042,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
         unsigned long flags;
         unsigned long mask;
         int need_report = 0;
-       struct rcu_data *rdp = rsp->rda[cpu];
+       struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
         struct rcu_node *rnp;
  
         /* Exclude any attempts to start a new grace period. */
@@ -1123,6 +1158,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
  
         /* Update count, and requeue any remaining callbacks. */
         rdp->qlen -= count;
+       rdp->n_cbs_invoked += count;
         if (list != NULL) {
                 *tail = rdp->nxtlist;
                 rdp->nxtlist = list;
@@ -1226,7 +1262,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
                 cpu = rnp->grplo;
                 bit = 1;
                 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
-                       if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
+                       if ((rnp->qsmask & bit) != 0 &&
+                           f(per_cpu_ptr(rsp->rda, cpu)))
                                 mask |= bit;
                 }
                 if (mask != 0) {
@@ -1402,7 +1439,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
          * a quiescent state betweentimes.
          */
         local_irq_save(flags);
-       rdp = rsp->rda[smp_processor_id()];
+       rdp = this_cpu_ptr(rsp->rda);
         rcu_process_gp_end(rsp, rdp);
         check_for_new_grace_period(rsp, rdp);
  
@@ -1701,7 +1738,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
  {
         unsigned long flags;
         int i;
-       struct rcu_data *rdp = rsp->rda[cpu];
+       struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
         struct rcu_node *rnp = rcu_get_root(rsp);
  
         /* Set up local state, ensuring consistent view of global state. */
@@ -1729,7 +1766,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
  {
         unsigned long flags;
         unsigned long mask;
-       struct rcu_data *rdp = rsp->rda[cpu];
+       struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
         struct rcu_node *rnp = rcu_get_root(rsp);
  
         /* Set up local state, ensuring consistent view of global state. */
@@ -1865,7 +1902,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
  /*
   * Helper function for rcu_init() that initializes one rcu_state structure.
   */
-static void __init rcu_init_one(struct rcu_state *rsp)
+static void __init rcu_init_one(struct rcu_state *rsp,
+               struct rcu_data __percpu *rda)
  {
         static char *buf[] = { "rcu_node_level_0",
                                "rcu_node_level_1",
@@ -1918,37 +1956,23 @@ static void __init rcu_init_one(struct rcu_state *rsp)
                 }
         }
  
+       rsp->rda = rda;
         rnp = rsp->level[NUM_RCU_LVLS - 1];
         for_each_possible_cpu(i) {
                 while (i > rnp->grphi)
                         rnp++;
-               rsp->rda[i]->mynode = rnp;
+               per_cpu_ptr(rsp->rda, i)->mynode = rnp;
                 rcu_boot_init_percpu_data(i, rsp);
         }
  }
  
-/*
- * Helper macro for __rcu_init() and __rcu_init_preempt().  To be used
- * nowhere else!  Assigns leaf node pointers into each CPU's rcu_data
- * structure.
- */
-#define RCU_INIT_FLAVOR(rsp, rcu_data) \
-do { \
-       int i; \
-       \
-       for_each_possible_cpu(i) { \
-               (rsp)->rda[i] = &per_cpu(rcu_data, i); \
-       } \
-       rcu_init_one(rsp); \
-} while (0)
-
  void __init rcu_init(void)
  {
         int cpu;
  
         rcu_bootup_announce();
-       RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
-       RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
+       rcu_init_one(&rcu_sched_state, &rcu_sched_data);
+       rcu_init_one(&rcu_bh_state, &rcu_bh_data);
         __rcu_init_preempt();
         open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
  
diff --git a/kernel/rcutree.h b/kernel/rcutree.h

index 14c040b18ed04a23f34448d9278d9ad55e5ab0c1..91d4170c5c13afd2e8997bd59b28e7cc2a4385e8 100644 (file)
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -202,6 +202,9 @@ struct rcu_data {
         long            qlen;           /* # of queued callbacks */
         long            qlen_last_fqs_check;
                                         /* qlen at last check for QS forcing */
+       unsigned long   n_cbs_invoked;  /* count of RCU cbs invoked. */
+       unsigned long   n_cbs_orphaned; /* RCU cbs sent to orphanage. */
+       unsigned long   n_cbs_adopted;  /* RCU cbs adopted from orphanage. */
         unsigned long   n_force_qs_snap;
                                         /* did other CPU force QS recently? */
         long            blimit;         /* Upper limit on a processed batch */
@@ -254,19 +257,23 @@ struct rcu_data {
  #define RCU_STALL_DELAY_DELTA         0
  #endif
  
-#define RCU_SECONDS_TILL_STALL_CHECK   (10 * HZ + RCU_STALL_DELAY_DELTA)
+#define RCU_SECONDS_TILL_STALL_CHECK   (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
+                                       RCU_STALL_DELAY_DELTA)
                                                 /* for rsp->jiffies_stall */
-#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA)
+#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30)
                                                 /* for rsp->jiffies_stall */
  #define RCU_STALL_RAT_DELAY            2       /* Allow other CPUs time */
                                                 /*  to take at least one */
                                                 /*  scheduling clock irq */
                                                 /*  before ratting on them. */
  
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE
+#define RCU_CPU_STALL_SUPPRESS_INIT 0
+#else
+#define RCU_CPU_STALL_SUPPRESS_INIT 1
+#endif
  
-#define ULONG_CMP_GE(a, b)     (ULONG_MAX / 2 >= (a) - (b))
-#define ULONG_CMP_LT(a, b)     (ULONG_MAX / 2 < (a) - (b))
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
  
  /*
   * RCU global state, including node hierarchy.  This hierarchy is
@@ -283,7 +290,7 @@ struct rcu_state {
         struct rcu_node *level[NUM_RCU_LVLS];   /* Hierarchy levels. */
         u32 levelcnt[MAX_RCU_LVLS + 1];         /* # nodes in each level. */
         u8 levelspread[NUM_RCU_LVLS];           /* kids/node in each level. */
-       struct rcu_data *rda[NR_CPUS];          /* array of rdp pointers. */
+       struct rcu_data __percpu *rda;          /* pointer of percu rcu_data. */
  
         /* The following fields are guarded by the root rcu_node's lock. */
  
@@ -365,6 +372,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
  #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
  static void rcu_print_detail_task_stall(struct rcu_state *rsp);
  static void rcu_print_task_stall(struct rcu_node *rnp);
+static void rcu_preempt_stall_reset(void);
  #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
  static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
  #ifdef CONFIG_HOTPLUG_CPU
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h

index 0e4f420245d97369b7fdf6bb99815789ffe13233..71a4147473f95f51d2b2e88db4c14372dafe375f 100644 (file)
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -57,7 +57,7 @@ static void __init rcu_bootup_announce_oddness(void)
         printk(KERN_INFO
                "\tRCU-based detection of stalled CPUs is disabled.\n");
  #endif
-#ifndef CONFIG_RCU_CPU_STALL_VERBOSE
+#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
         printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
  #endif
  #if NUM_RCU_LVL_4 != 0
@@ -154,7 +154,7 @@ static void rcu_preempt_note_context_switch(int cpu)
             (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
  
                 /* Possibly blocking in an RCU read-side critical section. */
-               rdp = rcu_preempt_state.rda[cpu];
+               rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
                 rnp = rdp->mynode;
                 raw_spin_lock_irqsave(&rnp->lock, flags);
                 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -201,7 +201,7 @@ static void rcu_preempt_note_context_switch(int cpu)
   */
  void __rcu_read_lock(void)
  {
-       ACCESS_ONCE(current->rcu_read_lock_nesting)++;
+       current->rcu_read_lock_nesting++;
         barrier();  /* needed if we ever invoke rcu_read_lock in rcutree.c */
  }
  EXPORT_SYMBOL_GPL(__rcu_read_lock);
@@ -344,7 +344,9 @@ void __rcu_read_unlock(void)
         struct task_struct *t = current;
  
         barrier();  /* needed if we ever invoke rcu_read_unlock in rcutree.c */
-       if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
+       --t->rcu_read_lock_nesting;
+       barrier();  /* decrement before load of ->rcu_read_unlock_special */
+       if (t->rcu_read_lock_nesting == 0 &&
             unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
                 rcu_read_unlock_special(t);
  #ifdef CONFIG_PROVE_LOCKING
@@ -417,6 +419,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
         }
  }
  
+/*
+ * Suppress preemptible RCU's CPU stall warnings by pushing the
+ * time of the next stall-warning message comfortably far into the
+ * future.
+ */
+static void rcu_preempt_stall_reset(void)
+{
+       rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
+}
+
  #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
  
  /*
@@ -546,9 +558,11 @@ EXPORT_SYMBOL_GPL(call_rcu);
   *
   * Control will return to the caller some time after a full grace
   * period has elapsed, in other words after all currently executing RCU
- * read-side critical sections have completed.  RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
- * and may be nested.
+ * read-side critical sections have completed.  Note, however, that
+ * upon return from synchronize_rcu(), the caller might well be executing
+ * concurrently with new RCU read-side critical sections that began while
+ * synchronize_rcu() was waiting.  RCU read-side critical sections are
+ * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
   */
  void synchronize_rcu(void)
  {
@@ -771,7 +785,7 @@ static void rcu_preempt_send_cbs_to_orphanage(void)
   */
  static void __init __rcu_init_preempt(void)
  {
-       RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
+       rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
  }
  
  /*
@@ -865,6 +879,14 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
  {
  }
  
+/*
+ * Because preemptible RCU does not exist, there is no need to suppress
+ * its CPU stall warnings.
+ */
+static void rcu_preempt_stall_reset(void)
+{
+}
+
  #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
  
  /*
@@ -918,15 +940,6 @@ static void rcu_preempt_process_callbacks(void)
  {
  }
  
-/*
- * In classic RCU, call_rcu() is just call_rcu_sched().
- */
-void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
-{
-       call_rcu_sched(head, func);
-}
-EXPORT_SYMBOL_GPL(call_rcu);
-
  /*
   * Wait for an rcu-preempt grace period, but make it happen quickly.
   * But because preemptable RCU does not exist, map to rcu-sched.
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c

index 36c95b45738ed7f74fb78b901ddcff7fc1488498..d15430b9d122f4d619e76fb6b5069aa1f494a575 100644 (file)
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -64,7 +64,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
                    rdp->dynticks_fqs);
  #endif /* #ifdef CONFIG_NO_HZ */
         seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
-       seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit);
+       seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit);
+       seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
+                  rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
  }
  
  #define PRINT_RCU_DATA(name, func, m) \
@@ -119,7 +121,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
                    rdp->dynticks_fqs);
  #endif /* #ifdef CONFIG_NO_HZ */
         seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
-       seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit);
+       seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit);
+       seq_printf(m, ",%lu,%lu,%lu\n",
+                  rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
  }
  
  static int show_rcudata_csv(struct seq_file *m, void *unused)
@@ -128,7 +132,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
  #ifdef CONFIG_NO_HZ
         seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
  #endif /* #ifdef CONFIG_NO_HZ */
-       seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
+       seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n");
  #ifdef CONFIG_TREE_PREEMPT_RCU
         seq_puts(m, "\"rcu_preempt:\"\n");
         PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
@@ -262,7 +266,7 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
         struct rcu_data *rdp;
  
         for_each_possible_cpu(cpu) {
-               rdp = rsp->rda[cpu];
+               rdp = per_cpu_ptr(rsp->rda, cpu);
                 if (rdp->beenonline)
                         print_one_rcu_pending(m, rdp);
         }
diff --git a/kernel/sched.c b/kernel/sched.c

index 5998222f901c84cbc2984fb87961a389f62e3ce1..d42992bccdfae88569559f3e88f81bbcea8e9494 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3714,7 +3714,7 @@ void scheduler_tick(void)
         curr->sched_class->task_tick(rq, curr, 0);
         raw_spin_unlock(&rq->lock);
  
-       perf_event_task_tick(curr);
+       perf_event_task_tick();
  
  #ifdef CONFIG_SMP
         rq->idle_at_tick = idle_cpu(cpu);
@@ -4772,7 +4772,7 @@ recheck:
         }
  
         if (user) {
-               retval = security_task_setscheduler(p, policy, param);
+               retval = security_task_setscheduler(p);
                 if (retval)
                         return retval;
         }
@@ -5023,7 +5023,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
         if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
                 goto out_unlock;
  
-       retval = security_task_setscheduler(p, 0, NULL);
+       retval = security_task_setscheduler(p);
         if (retval)
                 goto out_unlock;
  
@@ -5473,7 +5473,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
         idle->se.exec_start = sched_clock();
  
         cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
+       /*
+        * We're having a chicken and egg problem, even though we are
+        * holding rq->lock, the cpu isn't yet set to this cpu so the
+        * lockdep check in task_group() will fail.
+        *
+        * Similar case to sched_fork(). / Alternatively we could
+        * use task_rq_lock() here and obtain the other rq->lock.
+        *
+        * Silence PROVE_RCU
+        */
+       rcu_read_lock();
         __set_task_cpu(idle, cpu);
+       rcu_read_unlock();
  
         rq->curr = rq->idle = idle;
  #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c

index 74cccfae87a8e9a6e6d0845374dab82cd367500c..933f3d1b62ea0affb63767f87152545f7659dc6f 100644 (file)
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3793,8 +3793,11 @@ static void task_fork_fair(struct task_struct *p)
  
         update_rq_clock(rq);
  
-       if (unlikely(task_cpu(p) != this_cpu))
+       if (unlikely(task_cpu(p) != this_cpu)) {
+               rcu_read_lock();
                 __set_task_cpu(p, this_cpu);
+               rcu_read_unlock();
+       }
  
         update_curr(cfs_rq);
  
diff --git a/kernel/srcu.c b/kernel/srcu.c

index 2980da3fd50925f7902a5ba42e64934f6c4b0650..c71e075005368eceff3aab4340f94beca4aee249 100644 (file)
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -46,11 +46,9 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
  int __init_srcu_struct(struct srcu_struct *sp, const char *name,
                        struct lock_class_key *key)
  {
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
         /* Don't re-initialize a lock while it is held. */
         debug_check_no_locks_freed((void *)sp, sizeof(*sp));
         lockdep_init_map(&sp->dep_map, name, key, 0);
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
         return init_srcu_struct_fields(sp);
  }
  EXPORT_SYMBOL_GPL(__init_srcu_struct);
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c

index 04cdcf72c827e7601cdca63ab4c54a3f16473c50..10b90d8a03c48678258c6aaf3de353af7b06ed36 100644 (file)
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -143,15 +143,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
                                 if (!table->maxlen)
                                         set_fail(&fail, table, "No maxlen");
                         }
-                       if ((table->proc_handler == proc_doulongvec_minmax) ||
-                           (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
-                               if (table->maxlen > sizeof (unsigned long)) {
-                                       if (!table->extra1)
-                                               set_fail(&fail, table, "No min");
-                                       if (!table->extra2)
-                                               set_fail(&fail, table, "No max");
-                               }
-                       }
  #ifdef CONFIG_PROC_SYSCTL
                         if (table->procname && !table->proc_handler)
                                 set_fail(&fail, table, "No proc_handler");
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c

index 4f104515a19bcb18ca73226c745786a41d41b4ae..f8b11a283171b65849c5ed3bc0e162397f76fbea 100644 (file)
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -115,7 +115,9 @@ static int test_kprobes(void)
         int ret;
         struct kprobe *kps[2] = {&kp, &kp2};
  
-       kp.addr = 0; /* addr should be cleard for reusing kprobe. */
+       /* addr and flags should be cleard for reusing kprobe. */
+       kp.addr = NULL;
+       kp.flags = 0;
         ret = register_kprobes(kps, 2);
         if (ret < 0) {
                 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -210,7 +212,9 @@ static int test_jprobes(void)
         int ret;
         struct jprobe *jps[2] = {&jp, &jp2};
  
-       jp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */
+       /* addr and flags should be cleard for reusing kprobe. */
+       jp.kp.addr = NULL;
+       jp.kp.flags = 0;
         ret = register_jprobes(jps, 2);
         if (ret < 0) {
                 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -323,7 +327,9 @@ static int test_kretprobes(void)
         int ret;
         struct kretprobe *rps[2] = {&rp, &rp2};
  
-       rp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */
+       /* addr and flags should be cleard for reusing kprobe. */
+       rp.kp.addr = NULL;
+       rp.kp.flags = 0;
         ret = register_kretprobes(rps, 2);
         if (ret < 0) {
                 printk(KERN_ERR "Kprobe smoke test failed: "
diff --git a/kernel/timer.c b/kernel/timer.c

index 97bf05baade7cb4b9db4a5b76cf26255b6a67753..68a9ae7679b717f6eb4782ce5114ad405dec5a4c 100644 (file)
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,7 +37,7 @@
  #include <linux/delay.h>
  #include <linux/tick.h>
  #include <linux/kallsyms.h>
-#include <linux/perf_event.h>
+#include <linux/irq_work.h>
  #include <linux/sched.h>
  #include <linux/slab.h>
  
@@ -1279,7 +1279,10 @@ void update_process_times(int user_tick)
         run_local_timers();
         rcu_check_callbacks(cpu, user_tick);
         printk_tick();
-       perf_event_do_pending();
+#ifdef CONFIG_IRQ_WORK
+       if (in_irq())
+               irq_work_run();
+#endif
         scheduler_tick();
         run_posix_cpu_timers(p);
  }
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig

index 538501c6ea5058cf703eaa2608307f03f3aee89a..e550d2eda1dfdbf843e60c170d1fff3ec4e4eaf9 100644 (file)
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS
         help
           See Documentation/trace/ftrace-design.txt
  
+config HAVE_C_RECORDMCOUNT
+       bool
+       help
+         C version of recordmcount available?
+
  config TRACER_MAX_TRACE
         bool
  
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c

index fa7ece649fe1bcad7b0db621fa8579e91860fb04..ebd80d50c474e60fd84fd4a577da69b052fe14eb 100644 (file)
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -884,10 +884,8 @@ enum {
         FTRACE_ENABLE_CALLS             = (1 << 0),
         FTRACE_DISABLE_CALLS            = (1 << 1),
         FTRACE_UPDATE_TRACE_FUNC        = (1 << 2),
-       FTRACE_ENABLE_MCOUNT            = (1 << 3),
-       FTRACE_DISABLE_MCOUNT           = (1 << 4),
-       FTRACE_START_FUNC_RET           = (1 << 5),
-       FTRACE_STOP_FUNC_RET            = (1 << 6),
+       FTRACE_START_FUNC_RET           = (1 << 3),
+       FTRACE_STOP_FUNC_RET            = (1 << 4),
  };
  
  static int ftrace_filtered;
@@ -1226,8 +1224,6 @@ static void ftrace_shutdown(int command)
  
  static void ftrace_startup_sysctl(void)
  {
-       int command = FTRACE_ENABLE_MCOUNT;
-
         if (unlikely(ftrace_disabled))
                 return;
  
@@ -1235,23 +1231,17 @@ static void ftrace_startup_sysctl(void)
         saved_ftrace_func = NULL;
         /* ftrace_start_up is true if we want ftrace running */
         if (ftrace_start_up)
-               command |= FTRACE_ENABLE_CALLS;
-
-       ftrace_run_update_code(command);
+               ftrace_run_update_code(FTRACE_ENABLE_CALLS);
  }
  
  static void ftrace_shutdown_sysctl(void)
  {
-       int command = FTRACE_DISABLE_MCOUNT;
-
         if (unlikely(ftrace_disabled))
                 return;
  
         /* ftrace_start_up is true if ftrace is running */
         if (ftrace_start_up)
-               command |= FTRACE_DISABLE_CALLS;
-
-       ftrace_run_update_code(command);
+               ftrace_run_update_code(FTRACE_DISABLE_CALLS);
  }
  
  static cycle_t         ftrace_update_time;
@@ -1368,24 +1358,29 @@ enum {
  #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
  
  struct ftrace_iterator {
-       struct ftrace_page      *pg;
-       int                     hidx;
-       int                     idx;
-       unsigned                flags;
-       struct trace_parser     parser;
+       loff_t                          pos;
+       loff_t                          func_pos;
+       struct ftrace_page              *pg;
+       struct dyn_ftrace               *func;
+       struct ftrace_func_probe        *probe;
+       struct trace_parser             parser;
+       int                             hidx;
+       int                             idx;
+       unsigned                        flags;
  };
  
  static void *
-t_hash_next(struct seq_file *m, void *v, loff_t *pos)
+t_hash_next(struct seq_file *m, loff_t *pos)
  {
         struct ftrace_iterator *iter = m->private;
-       struct hlist_node *hnd = v;
+       struct hlist_node *hnd = NULL;
         struct hlist_head *hhd;
  
-       WARN_ON(!(iter->flags & FTRACE_ITER_HASH));
-
         (*pos)++;
+       iter->pos = *pos;
  
+       if (iter->probe)
+               hnd = &iter->probe->node;
   retry:
         if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
                 return NULL;
@@ -1408,7 +1403,12 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos)
                 }
         }
  
-       return hnd;
+       if (WARN_ON_ONCE(!hnd))
+               return NULL;
+
+       iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node);
+
+       return iter;
  }
  
  static void *t_hash_start(struct seq_file *m, loff_t *pos)
@@ -1417,26 +1417,32 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
         void *p = NULL;
         loff_t l;
  
-       if (!(iter->flags & FTRACE_ITER_HASH))
-               *pos = 0;
-
-       iter->flags |= FTRACE_ITER_HASH;
+       if (iter->func_pos > *pos)
+               return NULL;
  
         iter->hidx = 0;
-       for (l = 0; l <= *pos; ) {
-               p = t_hash_next(m, p, &l);
+       for (l = 0; l <= (*pos - iter->func_pos); ) {
+               p = t_hash_next(m, &l);
                 if (!p)
                         break;
         }
-       return p;
+       if (!p)
+               return NULL;
+
+       /* Only set this if we have an item */
+       iter->flags |= FTRACE_ITER_HASH;
+
+       return iter;
  }
  
-static int t_hash_show(struct seq_file *m, void *v)
+static int
+t_hash_show(struct seq_file *m, struct ftrace_iterator *iter)
  {
         struct ftrace_func_probe *rec;
-       struct hlist_node *hnd = v;
  
-       rec = hlist_entry(hnd, struct ftrace_func_probe, node);
+       rec = iter->probe;
+       if (WARN_ON_ONCE(!rec))
+               return -EIO;
  
         if (rec->ops->print)
                 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
@@ -1457,12 +1463,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
         struct dyn_ftrace *rec = NULL;
  
         if (iter->flags & FTRACE_ITER_HASH)
-               return t_hash_next(m, v, pos);
+               return t_hash_next(m, pos);
  
         (*pos)++;
+       iter->pos = *pos;
  
         if (iter->flags & FTRACE_ITER_PRINTALL)
-               return NULL;
+               return t_hash_start(m, pos);
  
   retry:
         if (iter->idx >= iter->pg->index) {
@@ -1491,7 +1498,20 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
                 }
         }
  
-       return rec;
+       if (!rec)
+               return t_hash_start(m, pos);
+
+       iter->func_pos = *pos;
+       iter->func = rec;
+
+       return iter;
+}
+
+static void reset_iter_read(struct ftrace_iterator *iter)
+{
+       iter->pos = 0;
+       iter->func_pos = 0;
+       iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH);
  }
  
  static void *t_start(struct seq_file *m, loff_t *pos)
@@ -1501,6 +1521,12 @@ static void *t_start(struct seq_file *m, loff_t *pos)
         loff_t l;
  
         mutex_lock(&ftrace_lock);
+       /*
+        * If an lseek was done, then reset and start from beginning.
+        */
+       if (*pos < iter->pos)
+               reset_iter_read(iter);
+
         /*
          * For set_ftrace_filter reading, if we have the filter
          * off, we can short cut and just print out that all
@@ -1518,6 +1544,11 @@ static void *t_start(struct seq_file *m, loff_t *pos)
         if (iter->flags & FTRACE_ITER_HASH)
                 return t_hash_start(m, pos);
  
+       /*
+        * Unfortunately, we need to restart at ftrace_pages_start
+        * every time we let go of the ftrace_mutex. This is because
+        * those pointers can change without the lock.
+        */
         iter->pg = ftrace_pages_start;
         iter->idx = 0;
         for (l = 0; l <= *pos; ) {
@@ -1526,10 +1557,14 @@ static void *t_start(struct seq_file *m, loff_t *pos)
                         break;
         }
  
-       if (!p && iter->flags & FTRACE_ITER_FILTER)
-               return t_hash_start(m, pos);
+       if (!p) {
+               if (iter->flags & FTRACE_ITER_FILTER)
+                       return t_hash_start(m, pos);
  
-       return p;
+               return NULL;
+       }
+
+       return iter;
  }
  
  static void t_stop(struct seq_file *m, void *p)
@@ -1540,16 +1575,18 @@ static void t_stop(struct seq_file *m, void *p)
  static int t_show(struct seq_file *m, void *v)
  {
         struct ftrace_iterator *iter = m->private;
-       struct dyn_ftrace *rec = v;
+       struct dyn_ftrace *rec;
  
         if (iter->flags & FTRACE_ITER_HASH)
-               return t_hash_show(m, v);
+               return t_hash_show(m, iter);
  
         if (iter->flags & FTRACE_ITER_PRINTALL) {
                 seq_printf(m, "#### all functions enabled ####\n");
                 return 0;
         }
  
+       rec = iter->func;
+
         if (!rec)
                 return 0;
  
@@ -1601,8 +1638,8 @@ ftrace_failures_open(struct inode *inode, struct file *file)
  
         ret = ftrace_avail_open(inode, file);
         if (!ret) {
-               m = (struct seq_file *)file->private_data;
-               iter = (struct ftrace_iterator *)m->private;
+               m = file->private_data;
+               iter = m->private;
                 iter->flags = FTRACE_ITER_FAILURES;
         }
  
@@ -2418,7 +2455,7 @@ static const struct file_operations ftrace_filter_fops = {
         .open = ftrace_filter_open,
         .read = seq_read,
         .write = ftrace_filter_write,
-       .llseek = no_llseek,
+       .llseek = ftrace_regex_lseek,
         .release = ftrace_filter_release,
  };
  
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c

index bca96377fd4e8667df513dfd91ea56931e35578f..c5a632a669e1f7a4ebc6a07889ef593265b3d76a 100644 (file)
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2606,6 +2606,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
  }
  EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
  
+/*
+ * The total entries in the ring buffer is the running counter
+ * of entries entered into the ring buffer, minus the sum of
+ * the entries read from the ring buffer and the number of
+ * entries that were overwritten.
+ */
+static inline unsigned long
+rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
+{
+       return local_read(&cpu_buffer->entries) -
+               (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
+}
+
  /**
   * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
   * @buffer: The ring buffer
@@ -2614,16 +2627,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
  unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
  {
         struct ring_buffer_per_cpu *cpu_buffer;
-       unsigned long ret;
  
         if (!cpumask_test_cpu(cpu, buffer->cpumask))
                 return 0;
  
         cpu_buffer = buffer->buffers[cpu];
-       ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
-               - cpu_buffer->read;
  
-       return ret;
+       return rb_num_of_entries(cpu_buffer);
  }
  EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
  
@@ -2684,8 +2694,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
         /* if you care about this being correct, lock the buffer */
         for_each_buffer_cpu(buffer, cpu) {
                 cpu_buffer = buffer->buffers[cpu];
-               entries += (local_read(&cpu_buffer->entries) -
-                           local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
+               entries += rb_num_of_entries(cpu_buffer);
         }
  
         return entries;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c

index 9ec59f541156625b5c4b0aea9267086c928ae07a..001bcd2ccf4afb5170cb06d500bcff90844bf90e 100644 (file)
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2196,7 +2196,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
  
  static int tracing_release(struct inode *inode, struct file *file)
  {
-       struct seq_file *m = (struct seq_file *)file->private_data;
+       struct seq_file *m = file->private_data;
         struct trace_iterator *iter;
         int cpu;
  
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h

index d39b3c5454a5e684b8c720e0791894b36529f1cf..9021f8c0c0c3e379edbd8f39770bd9345794e266 100644 (file)
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -343,6 +343,10 @@ void trace_function(struct trace_array *tr,
                     unsigned long ip,
                     unsigned long parent_ip,
                     unsigned long flags, int pc);
+void trace_graph_function(struct trace_array *tr,
+                   unsigned long ip,
+                   unsigned long parent_ip,
+                   unsigned long flags, int pc);
  void trace_default_header(struct seq_file *m);
  void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
  int trace_empty(struct trace_iterator *iter);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c

index 31cc4cb0dbf2afaa49d5f7428f5cce31d535211d..39c059ca670e64156e6681782ffa708c6b8d720f 100644 (file)
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,7 +9,7 @@
  #include <linux/kprobes.h>
  #include "trace.h"
  
-static char *perf_trace_buf[4];
+static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
  
  /*
   * Force it to be aligned to unsigned long to avoid misaligned accesses
@@ -24,7 +24,7 @@ static int    total_ref_count;
  static int perf_trace_event_init(struct ftrace_event_call *tp_event,
                                  struct perf_event *p_event)
  {
-       struct hlist_head *list;
+       struct hlist_head __percpu *list;
         int ret = -ENOMEM;
         int cpu;
  
@@ -42,11 +42,11 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
         tp_event->perf_events = list;
  
         if (!total_ref_count) {
-               char *buf;
+               char __percpu *buf;
                 int i;
  
-               for (i = 0; i < 4; i++) {
-                       buf = (char *)alloc_percpu(perf_trace_t);
+               for (i = 0; i < PERF_NR_CONTEXTS; i++) {
+                       buf = (char __percpu *)alloc_percpu(perf_trace_t);
                         if (!buf)
                                 goto fail;
  
@@ -65,7 +65,7 @@ fail:
         if (!total_ref_count) {
                 int i;
  
-               for (i = 0; i < 4; i++) {
+               for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                         free_percpu(perf_trace_buf[i]);
                         perf_trace_buf[i] = NULL;
                 }
@@ -101,22 +101,26 @@ int perf_trace_init(struct perf_event *p_event)
         return ret;
  }
  
-int perf_trace_enable(struct perf_event *p_event)
+int perf_trace_add(struct perf_event *p_event, int flags)
  {
         struct ftrace_event_call *tp_event = p_event->tp_event;
+       struct hlist_head __percpu *pcpu_list;
         struct hlist_head *list;
  
-       list = tp_event->perf_events;
-       if (WARN_ON_ONCE(!list))
+       pcpu_list = tp_event->perf_events;
+       if (WARN_ON_ONCE(!pcpu_list))
                 return -EINVAL;
  
-       list = this_cpu_ptr(list);
+       if (!(flags & PERF_EF_START))
+               p_event->hw.state = PERF_HES_STOPPED;
+
+       list = this_cpu_ptr(pcpu_list);
         hlist_add_head_rcu(&p_event->hlist_entry, list);
  
         return 0;
  }
  
-void perf_trace_disable(struct perf_event *p_event)
+void perf_trace_del(struct perf_event *p_event, int flags)
  {
         hlist_del_rcu(&p_event->hlist_entry);
  }
@@ -142,7 +146,7 @@ void perf_trace_destroy(struct perf_event *p_event)
         tp_event->perf_events = NULL;
  
         if (!--total_ref_count) {
-               for (i = 0; i < 4; i++) {
+               for (i = 0; i < PERF_NR_CONTEXTS; i++) {
                         free_percpu(perf_trace_buf[i]);
                         perf_trace_buf[i] = NULL;
                 }
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c

index 4c758f146328f18ce82a318fb60a0413006aca8f..398c0e8b332c1840e16bc0599c1230f9594d29a2 100644 (file)
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -600,21 +600,29 @@ out:
  
  enum {
         FORMAT_HEADER           = 1,
-       FORMAT_PRINTFMT         = 2,
+       FORMAT_FIELD_SEPERATOR  = 2,
+       FORMAT_PRINTFMT         = 3,
  };
  
  static void *f_next(struct seq_file *m, void *v, loff_t *pos)
  {
         struct ftrace_event_call *call = m->private;
         struct ftrace_event_field *field;
-       struct list_head *head;
+       struct list_head *common_head = &ftrace_common_fields;
+       struct list_head *head = trace_get_fields(call);
  
         (*pos)++;
  
         switch ((unsigned long)v) {
         case FORMAT_HEADER:
-               head = &ftrace_common_fields;
+               if (unlikely(list_empty(common_head)))
+                       return NULL;
+
+               field = list_entry(common_head->prev,
+                                  struct ftrace_event_field, link);
+               return field;
  
+       case FORMAT_FIELD_SEPERATOR:
                 if (unlikely(list_empty(head)))
                         return NULL;
  
@@ -626,31 +634,10 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos)
                 return NULL;
         }
  
-       head = trace_get_fields(call);
-
-       /*
-        * To separate common fields from event fields, the
-        * LSB is set on the first event field. Clear it in case.
-        */
-       v = (void *)((unsigned long)v & ~1L);
-
         field = v;
-       /*
-        * If this is a common field, and at the end of the list, then
-        * continue with main list.
-        */
-       if (field->link.prev == &ftrace_common_fields) {
-               if (unlikely(list_empty(head)))
-                       return NULL;
-               field = list_entry(head->prev, struct ftrace_event_field, link);
-               /* Set the LSB to notify f_show to print an extra newline */
-               field = (struct ftrace_event_field *)
-                       ((unsigned long)field | 1);
-               return field;
-       }
-
-       /* If we are done tell f_show to print the format */
-       if (field->link.prev == head)
+       if (field->link.prev == common_head)
+               return (void *)FORMAT_FIELD_SEPERATOR;
+       else if (field->link.prev == head)
                 return (void *)FORMAT_PRINTFMT;
  
         field = list_entry(field->link.prev, struct ftrace_event_field, link);
@@ -688,22 +675,16 @@ static int f_show(struct seq_file *m, void *v)
                 seq_printf(m, "format:\n");
                 return 0;
  
+       case FORMAT_FIELD_SEPERATOR:
+               seq_putc(m, '\n');
+               return 0;
+
         case FORMAT_PRINTFMT:
                 seq_printf(m, "\nprint fmt: %s\n",
                            call->print_fmt);
                 return 0;
         }
  
-       /*
-        * To separate common fields from event fields, the
-        * LSB is set on the first event field. Clear it and
-        * print a newline if it is set.
-        */
-       if ((unsigned long)v & 1) {
-               seq_putc(m, '\n');
-               v = (void *)((unsigned long)v & ~1L);
-       }
-
         field = v;
  
         /*
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c

index 6f233698518ede15cc9302e889de9f108aa0f1cb..76b05980225cb79df8bfad7bb35cd64bd52bcaf2 100644 (file)
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -15,15 +15,19 @@
  #include "trace.h"
  #include "trace_output.h"
  
+/* When set, irq functions will be ignored */
+static int ftrace_graph_skip_irqs;
+
  struct fgraph_cpu_data {
         pid_t           last_pid;
         int             depth;
+       int             depth_irq;
         int             ignore;
         unsigned long   enter_funcs[FTRACE_RETFUNC_DEPTH];
  };
  
  struct fgraph_data {
-       struct fgraph_cpu_data          *cpu_data;
+       struct fgraph_cpu_data __percpu *cpu_data;
  
         /* Place to preserve last processed entry. */
         struct ftrace_graph_ent_entry   ent;
@@ -41,6 +45,7 @@ struct fgraph_data {
  #define TRACE_GRAPH_PRINT_PROC         0x8
  #define TRACE_GRAPH_PRINT_DURATION     0x10
  #define TRACE_GRAPH_PRINT_ABS_TIME     0x20
+#define TRACE_GRAPH_PRINT_IRQS         0x40
  
  static struct tracer_opt trace_opts[] = {
         /* Display overruns? (for self-debug purpose) */
@@ -55,13 +60,15 @@ static struct tracer_opt trace_opts[] = {
         { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) },
         /* Display absolute time of an entry */
         { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
+       /* Display interrupts */
+       { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
         { } /* Empty entry */
  };
  
  static struct tracer_flags tracer_flags = {
         /* Don't display overruns and proc by default */
         .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
-              TRACE_GRAPH_PRINT_DURATION,
+              TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
         .opts = trace_opts
  };
  
@@ -204,6 +211,14 @@ int __trace_graph_entry(struct trace_array *tr,
         return 1;
  }
  
+static inline int ftrace_graph_ignore_irqs(void)
+{
+       if (!ftrace_graph_skip_irqs)
+               return 0;
+
+       return in_irq();
+}
+
  int trace_graph_entry(struct ftrace_graph_ent *trace)
  {
         struct trace_array *tr = graph_array;
@@ -218,7 +233,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
                 return 0;
  
         /* trace it when it is-nested-in or is a function enabled. */
-       if (!(trace->depth || ftrace_graph_addr(trace->func)))
+       if (!(trace->depth || ftrace_graph_addr(trace->func)) ||
+             ftrace_graph_ignore_irqs())
                 return 0;
  
         local_irq_save(flags);
@@ -246,6 +262,34 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
                 return trace_graph_entry(trace);
  }
  
+static void
+__trace_graph_function(struct trace_array *tr,
+               unsigned long ip, unsigned long flags, int pc)
+{
+       u64 time = trace_clock_local();
+       struct ftrace_graph_ent ent = {
+               .func  = ip,
+               .depth = 0,
+       };
+       struct ftrace_graph_ret ret = {
+               .func     = ip,
+               .depth    = 0,
+               .calltime = time,
+               .rettime  = time,
+       };
+
+       __trace_graph_entry(tr, &ent, flags, pc);
+       __trace_graph_return(tr, &ret, flags, pc);
+}
+
+void
+trace_graph_function(struct trace_array *tr,
+               unsigned long ip, unsigned long parent_ip,
+               unsigned long flags, int pc)
+{
+       __trace_graph_function(tr, ip, flags, pc);
+}
+
  void __trace_graph_return(struct trace_array *tr,
                                 struct ftrace_graph_ret *trace,
                                 unsigned long flags,
@@ -649,8 +693,9 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
  
         /* Print nsecs (we don't want to exceed 7 numbers) */
         if (len < 7) {
-               snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu",
-                        nsecs_rem);
+               size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len);
+
+               snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
                 ret = trace_seq_printf(s, ".%s", nsecs_str);
                 if (!ret)
                         return TRACE_TYPE_PARTIAL_LINE;
@@ -855,6 +900,108 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
         return 0;
  }
  
+/*
+ * Entry check for irq code
+ *
+ * returns 1 if
+ *  - we are inside irq code
+ *  - we just extered irq code
+ *
+ * retunns 0 if
+ *  - funcgraph-interrupts option is set
+ *  - we are not inside irq code
+ */
+static int
+check_irq_entry(struct trace_iterator *iter, u32 flags,
+               unsigned long addr, int depth)
+{
+       int cpu = iter->cpu;
+       int *depth_irq;
+       struct fgraph_data *data = iter->private;
+
+       /*
+        * If we are either displaying irqs, or we got called as
+        * a graph event and private data does not exist,
+        * then we bypass the irq check.
+        */
+       if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
+           (!data))
+               return 0;
+
+       depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+
+       /*
+        * We are inside the irq code
+        */
+       if (*depth_irq >= 0)
+               return 1;
+
+       if ((addr < (unsigned long)__irqentry_text_start) ||
+           (addr >= (unsigned long)__irqentry_text_end))
+               return 0;
+
+       /*
+        * We are entering irq code.
+        */
+       *depth_irq = depth;
+       return 1;
+}
+
+/*
+ * Return check for irq code
+ *
+ * returns 1 if
+ *  - we are inside irq code
+ *  - we just left irq code
+ *
+ * returns 0 if
+ *  - funcgraph-interrupts option is set
+ *  - we are not inside irq code
+ */
+static int
+check_irq_return(struct trace_iterator *iter, u32 flags, int depth)
+{
+       int cpu = iter->cpu;
+       int *depth_irq;
+       struct fgraph_data *data = iter->private;
+
+       /*
+        * If we are either displaying irqs, or we got called as
+        * a graph event and private data does not exist,
+        * then we bypass the irq check.
+        */
+       if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
+           (!data))
+               return 0;
+
+       depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+
+       /*
+        * We are not inside the irq code.
+        */
+       if (*depth_irq == -1)
+               return 0;
+
+       /*
+        * We are inside the irq code, and this is returning entry.
+        * Let's not trace it and clear the entry depth, since
+        * we are out of irq code.
+        *
+        * This condition ensures that we 'leave the irq code' once
+        * we are out of the entry depth. Thus protecting us from
+        * the RETURN entry loss.
+        */
+       if (*depth_irq >= depth) {
+               *depth_irq = -1;
+               return 1;
+       }
+
+       /*
+        * We are inside the irq code, and this is not the entry.
+        */
+       return 1;
+}
+
  static enum print_line_t
  print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
                         struct trace_iterator *iter, u32 flags)
@@ -865,6 +1012,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
         static enum print_line_t ret;
         int cpu = iter->cpu;
  
+       if (check_irq_entry(iter, flags, call->func, call->depth))
+               return TRACE_TYPE_HANDLED;
+
         if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
                 return TRACE_TYPE_PARTIAL_LINE;
  
@@ -902,6 +1052,9 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
         int ret;
         int i;
  
+       if (check_irq_return(iter, flags, trace->depth))
+               return TRACE_TYPE_HANDLED;
+
         if (data) {
                 struct fgraph_cpu_data *cpu_data;
                 int cpu = iter->cpu;
@@ -1054,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
  
  
  enum print_line_t
-print_graph_function_flags(struct trace_iterator *iter, u32 flags)
+__print_graph_function_flags(struct trace_iterator *iter, u32 flags)
  {
         struct ftrace_graph_ent_entry *field;
         struct fgraph_data *data = iter->private;
@@ -1117,7 +1270,18 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
  static enum print_line_t
  print_graph_function(struct trace_iterator *iter)
  {
-       return print_graph_function_flags(iter, tracer_flags.val);
+       return __print_graph_function_flags(iter, tracer_flags.val);
+}
+
+enum print_line_t print_graph_function_flags(struct trace_iterator *iter,
+                                            u32 flags)
+{
+       if (trace_flags & TRACE_ITER_LATENCY_FMT)
+               flags |= TRACE_GRAPH_PRINT_DURATION;
+       else
+               flags |= TRACE_GRAPH_PRINT_ABS_TIME;
+
+       return __print_graph_function_flags(iter, flags);
  }
  
  static enum print_line_t
@@ -1149,7 +1313,7 @@ static void print_lat_header(struct seq_file *s, u32 flags)
         seq_printf(s, "#%.*s|||| /                     \n", size, spaces);
  }
  
-void print_graph_headers_flags(struct seq_file *s, u32 flags)
+static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
  {
         int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
  
@@ -1190,6 +1354,23 @@ void print_graph_headers(struct seq_file *s)
         print_graph_headers_flags(s, tracer_flags.val);
  }
  
+void print_graph_headers_flags(struct seq_file *s, u32 flags)
+{
+       struct trace_iterator *iter = s->private;
+
+       if (trace_flags & TRACE_ITER_LATENCY_FMT) {
+               /* print nothing if the buffers are empty */
+               if (trace_empty(iter))
+                       return;
+
+               print_trace_header(s, iter);
+               flags |= TRACE_GRAPH_PRINT_DURATION;
+       } else
+               flags |= TRACE_GRAPH_PRINT_ABS_TIME;
+
+       __print_graph_headers_flags(s, flags);
+}
+
  void graph_trace_open(struct trace_iterator *iter)
  {
         /* pid and depth on the last trace processed */
@@ -1210,9 +1391,12 @@ void graph_trace_open(struct trace_iterator *iter)
                 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
                 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
                 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
+               int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+
                 *pid = -1;
                 *depth = 0;
                 *ignore = 0;
+               *depth_irq = -1;
         }
  
         iter->private = data;
@@ -1235,6 +1419,14 @@ void graph_trace_close(struct trace_iterator *iter)
         }
  }
  
+static int func_graph_set_flag(u32 old_flags, u32 bit, int set)
+{
+       if (bit == TRACE_GRAPH_PRINT_IRQS)
+               ftrace_graph_skip_irqs = !set;
+
+       return 0;
+}
+
  static struct trace_event_functions graph_functions = {
         .trace          = print_graph_function_event,
  };
@@ -1261,6 +1453,7 @@ static struct tracer graph_trace __read_mostly = {
         .print_line     = print_graph_function,
         .print_header   = print_graph_headers,
         .flags          = &tracer_flags,
+       .set_flag       = func_graph_set_flag,
  #ifdef CONFIG_FTRACE_SELFTEST
         .selftest       = trace_selftest_startup_function_graph,
  #endif
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c

index 73a6b0601f2e301c0cd5575cc96f7f0b788853d9..5cf8c602b8804c054bbed65475b4815fd29d5b82 100644 (file)
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -87,14 +87,22 @@ static __cacheline_aligned_in_smp   unsigned long max_sequence;
  
  #ifdef CONFIG_FUNCTION_TRACER
  /*
- * irqsoff uses its own tracer function to keep the overhead down:
+ * Prologue for the preempt and irqs off function tracers.
+ *
+ * Returns 1 if it is OK to continue, and data->disabled is
+ *            incremented.
+ *         0 if the trace is to be ignored, and data->disabled
+ *            is kept the same.
+ *
+ * Note, this function is also used outside this ifdef but
+ *  inside the #ifdef of the function graph tracer below.
+ *  This is OK, since the function graph tracer is
+ *  dependent on the function tracer.
   */
-static void
-irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+static int func_prolog_dec(struct trace_array *tr,
+                          struct trace_array_cpu **data,
+                          unsigned long *flags)
  {
-       struct trace_array *tr = irqsoff_trace;
-       struct trace_array_cpu *data;
-       unsigned long flags;
         long disabled;
         int cpu;
  
@@ -106,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
          */
         cpu = raw_smp_processor_id();
         if (likely(!per_cpu(tracing_cpu, cpu)))
-               return;
+               return 0;
  
-       local_save_flags(flags);
+       local_save_flags(*flags);
         /* slight chance to get a false positive on tracing_cpu */
-       if (!irqs_disabled_flags(flags))
-               return;
+       if (!irqs_disabled_flags(*flags))
+               return 0;
  
-       data = tr->data[cpu];
-       disabled = atomic_inc_return(&data->disabled);
+       *data = tr->data[cpu];
+       disabled = atomic_inc_return(&(*data)->disabled);
  
         if (likely(disabled == 1))
-               trace_function(tr, ip, parent_ip, flags, preempt_count());
+               return 1;
+
+       atomic_dec(&(*data)->disabled);
+
+       return 0;
+}
+
+/*
+ * irqsoff uses its own tracer function to keep the overhead down:
+ */
+static void
+irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+       struct trace_array *tr = irqsoff_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+
+       if (!func_prolog_dec(tr, &data, &flags))
+               return;
+
+       trace_function(tr, ip, parent_ip, flags, preempt_count());
  
         atomic_dec(&data->disabled);
  }
@@ -155,30 +183,16 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
         struct trace_array *tr = irqsoff_trace;
         struct trace_array_cpu *data;
         unsigned long flags;
-       long disabled;
         int ret;
-       int cpu;
         int pc;
  
-       cpu = raw_smp_processor_id();
-       if (likely(!per_cpu(tracing_cpu, cpu)))
+       if (!func_prolog_dec(tr, &data, &flags))
                 return 0;
  
-       local_save_flags(flags);
-       /* slight chance to get a false positive on tracing_cpu */
-       if (!irqs_disabled_flags(flags))
-               return 0;
-
-       data = tr->data[cpu];
-       disabled = atomic_inc_return(&data->disabled);
-
-       if (likely(disabled == 1)) {
-               pc = preempt_count();
-               ret = __trace_graph_entry(tr, trace, flags, pc);
-       } else
-               ret = 0;
-
+       pc = preempt_count();
+       ret = __trace_graph_entry(tr, trace, flags, pc);
         atomic_dec(&data->disabled);
+
         return ret;
  }
  
@@ -187,27 +201,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
         struct trace_array *tr = irqsoff_trace;
         struct trace_array_cpu *data;
         unsigned long flags;
-       long disabled;
-       int cpu;
         int pc;
  
-       cpu = raw_smp_processor_id();
-       if (likely(!per_cpu(tracing_cpu, cpu)))
+       if (!func_prolog_dec(tr, &data, &flags))
                 return;
  
-       local_save_flags(flags);
-       /* slight chance to get a false positive on tracing_cpu */
-       if (!irqs_disabled_flags(flags))
-               return;
-
-       data = tr->data[cpu];
-       disabled = atomic_inc_return(&data->disabled);
-
-       if (likely(disabled == 1)) {
-               pc = preempt_count();
-               __trace_graph_return(tr, trace, flags, pc);
-       }
-
+       pc = preempt_count();
+       __trace_graph_return(tr, trace, flags, pc);
         atomic_dec(&data->disabled);
  }
  
@@ -229,75 +229,33 @@ static void irqsoff_trace_close(struct trace_iterator *iter)
  
  static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
  {
-       u32 flags = GRAPH_TRACER_FLAGS;
-
-       if (trace_flags & TRACE_ITER_LATENCY_FMT)
-               flags |= TRACE_GRAPH_PRINT_DURATION;
-       else
-               flags |= TRACE_GRAPH_PRINT_ABS_TIME;
-
         /*
          * In graph mode call the graph tracer output function,
          * otherwise go with the TRACE_FN event handler
          */
         if (is_graph())
-               return print_graph_function_flags(iter, flags);
+               return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
  
         return TRACE_TYPE_UNHANDLED;
  }
  
  static void irqsoff_print_header(struct seq_file *s)
  {
-       if (is_graph()) {
-               struct trace_iterator *iter = s->private;
-               u32 flags = GRAPH_TRACER_FLAGS;
-
-               if (trace_flags & TRACE_ITER_LATENCY_FMT) {
-                       /* print nothing if the buffers are empty */
-                       if (trace_empty(iter))
-                               return;
-
-                       print_trace_header(s, iter);
-                       flags |= TRACE_GRAPH_PRINT_DURATION;
-               } else
-                       flags |= TRACE_GRAPH_PRINT_ABS_TIME;
-
-               print_graph_headers_flags(s, flags);
-       } else
+       if (is_graph())
+               print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
+       else
                 trace_default_header(s);
  }
  
-static void
-trace_graph_function(struct trace_array *tr,
-                unsigned long ip, unsigned long flags, int pc)
-{
-       u64 time = trace_clock_local();
-       struct ftrace_graph_ent ent = {
-               .func  = ip,
-               .depth = 0,
-       };
-       struct ftrace_graph_ret ret = {
-               .func     = ip,
-               .depth    = 0,
-               .calltime = time,
-               .rettime  = time,
-       };
-
-       __trace_graph_entry(tr, &ent, flags, pc);
-       __trace_graph_return(tr, &ret, flags, pc);
-}
-
  static void
  __trace_function(struct trace_array *tr,
                  unsigned long ip, unsigned long parent_ip,
                  unsigned long flags, int pc)
  {
-       if (!is_graph())
+       if (is_graph())
+               trace_graph_function(tr, ip, parent_ip, flags, pc);
+       else
                 trace_function(tr, ip, parent_ip, flags, pc);
-       else {
-               trace_graph_function(tr, parent_ip, flags, pc);
-               trace_graph_function(tr, ip, flags, pc);
-       }
  }
  
  #else
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c

index 4086eae6e81b1c1b9762edc01600d3294c6bc130..7319559ed59f0a10197a15dc9d62a99a1fa7aeb7 100644 (file)
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -31,48 +31,98 @@ static int                  wakeup_rt;
  static arch_spinlock_t wakeup_lock =
         (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
  
+static void wakeup_reset(struct trace_array *tr);
  static void __wakeup_reset(struct trace_array *tr);
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
+static void wakeup_graph_return(struct ftrace_graph_ret *trace);
  
  static int save_lat_flag;
  
+#define TRACE_DISPLAY_GRAPH     1
+
+static struct tracer_opt trace_opts[] = {
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       /* display latency trace as call graph */
+       { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
+#endif
+       { } /* Empty entry */
+};
+
+static struct tracer_flags tracer_flags = {
+       .val  = 0,
+       .opts = trace_opts,
+};
+
+#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
+
  #ifdef CONFIG_FUNCTION_TRACER
+
  /*
- * irqsoff uses its own tracer function to keep the overhead down:
+ * Prologue for the wakeup function tracers.
+ *
+ * Returns 1 if it is OK to continue, and preemption
+ *            is disabled and data->disabled is incremented.
+ *         0 if the trace is to be ignored, and preemption
+ *            is not disabled and data->disabled is
+ *            kept the same.
+ *
+ * Note, this function is also used outside this ifdef but
+ *  inside the #ifdef of the function graph tracer below.
+ *  This is OK, since the function graph tracer is
+ *  dependent on the function tracer.
   */
-static void
-wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
+static int
+func_prolog_preempt_disable(struct trace_array *tr,
+                           struct trace_array_cpu **data,
+                           int *pc)
  {
-       struct trace_array *tr = wakeup_trace;
-       struct trace_array_cpu *data;
-       unsigned long flags;
         long disabled;
         int cpu;
-       int pc;
  
         if (likely(!wakeup_task))
-               return;
+               return 0;
  
-       pc = preempt_count();
+       *pc = preempt_count();
         preempt_disable_notrace();
  
         cpu = raw_smp_processor_id();
         if (cpu != wakeup_current_cpu)
                 goto out_enable;
  
-       data = tr->data[cpu];
-       disabled = atomic_inc_return(&data->disabled);
+       *data = tr->data[cpu];
+       disabled = atomic_inc_return(&(*data)->disabled);
         if (unlikely(disabled != 1))
                 goto out;
  
-       local_irq_save(flags);
+       return 1;
  
-       trace_function(tr, ip, parent_ip, flags, pc);
+out:
+       atomic_dec(&(*data)->disabled);
+
+out_enable:
+       preempt_enable_notrace();
+       return 0;
+}
  
+/*
+ * wakeup uses its own tracer function to keep the overhead down:
+ */
+static void
+wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+       struct trace_array *tr = wakeup_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       int pc;
+
+       if (!func_prolog_preempt_disable(tr, &data, &pc))
+               return;
+
+       local_irq_save(flags);
+       trace_function(tr, ip, parent_ip, flags, pc);
         local_irq_restore(flags);
  
- out:
         atomic_dec(&data->disabled);
- out_enable:
         preempt_enable_notrace();
  }
  
@@ -82,6 +132,156 @@ static struct ftrace_ops trace_ops __read_mostly =
  };
  #endif /* CONFIG_FUNCTION_TRACER */
  
+static int start_func_tracer(int graph)
+{
+       int ret;
+
+       if (!graph)
+               ret = register_ftrace_function(&trace_ops);
+       else
+               ret = register_ftrace_graph(&wakeup_graph_return,
+                                           &wakeup_graph_entry);
+
+       if (!ret && tracing_is_enabled())
+               tracer_enabled = 1;
+       else
+               tracer_enabled = 0;
+
+       return ret;
+}
+
+static void stop_func_tracer(int graph)
+{
+       tracer_enabled = 0;
+
+       if (!graph)
+               unregister_ftrace_function(&trace_ops);
+       else
+               unregister_ftrace_graph();
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
+{
+
+       if (!(bit & TRACE_DISPLAY_GRAPH))
+               return -EINVAL;
+
+       if (!(is_graph() ^ set))
+               return 0;
+
+       stop_func_tracer(!set);
+
+       wakeup_reset(wakeup_trace);
+       tracing_max_latency = 0;
+
+       return start_func_tracer(set);
+}
+
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
+{
+       struct trace_array *tr = wakeup_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       int pc, ret = 0;
+
+       if (!func_prolog_preempt_disable(tr, &data, &pc))
+               return 0;
+
+       local_save_flags(flags);
+       ret = __trace_graph_entry(tr, trace, flags, pc);
+       atomic_dec(&data->disabled);
+       preempt_enable_notrace();
+
+       return ret;
+}
+
+static void wakeup_graph_return(struct ftrace_graph_ret *trace)
+{
+       struct trace_array *tr = wakeup_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       int pc;
+
+       if (!func_prolog_preempt_disable(tr, &data, &pc))
+               return;
+
+       local_save_flags(flags);
+       __trace_graph_return(tr, trace, flags, pc);
+       atomic_dec(&data->disabled);
+
+       preempt_enable_notrace();
+       return;
+}
+
+static void wakeup_trace_open(struct trace_iterator *iter)
+{
+       if (is_graph())
+               graph_trace_open(iter);
+}
+
+static void wakeup_trace_close(struct trace_iterator *iter)
+{
+       if (iter->private)
+               graph_trace_close(iter);
+}
+
+#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC)
+
+static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
+{
+       /*
+        * In graph mode call the graph tracer output function,
+        * otherwise go with the TRACE_FN event handler
+        */
+       if (is_graph())
+               return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
+
+       return TRACE_TYPE_UNHANDLED;
+}
+
+static void wakeup_print_header(struct seq_file *s)
+{
+       if (is_graph())
+               print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
+       else
+               trace_default_header(s);
+}
+
+static void
+__trace_function(struct trace_array *tr,
+                unsigned long ip, unsigned long parent_ip,
+                unsigned long flags, int pc)
+{
+       if (is_graph())
+               trace_graph_function(tr, ip, parent_ip, flags, pc);
+       else
+               trace_function(tr, ip, parent_ip, flags, pc);
+}
+#else
+#define __trace_function trace_function
+
+static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
+{
+       return -EINVAL;
+}
+
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
+{
+       return -1;
+}
+
+static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
+{
+       return TRACE_TYPE_UNHANDLED;
+}
+
+static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
+static void wakeup_print_header(struct seq_file *s) { }
+static void wakeup_trace_open(struct trace_iterator *iter) { }
+static void wakeup_trace_close(struct trace_iterator *iter) { }
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
  /*
   * Should this new latency be reported/recorded?
   */
@@ -152,7 +352,7 @@ probe_wakeup_sched_switch(void *ignore,
         /* The task we are waiting for is waking up */
         data = wakeup_trace->data[wakeup_cpu];
  
-       trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
+       __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
         tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
  
         T0 = data->preempt_timestamp;
@@ -252,7 +452,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
          * is not called by an assembly function  (where as schedule is)
          * it should be safe to use it here.
          */
-       trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+       __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
  
  out_locked:
         arch_spin_unlock(&wakeup_lock);
@@ -303,12 +503,8 @@ static void start_wakeup_tracer(struct trace_array *tr)
          */
         smp_wmb();
  
-       register_ftrace_function(&trace_ops);
-
-       if (tracing_is_enabled())
-               tracer_enabled = 1;
-       else
-               tracer_enabled = 0;
+       if (start_func_tracer(is_graph()))
+               printk(KERN_ERR "failed to start wakeup tracer\n");
  
         return;
  fail_deprobe_wake_new:
@@ -320,7 +516,7 @@ fail_deprobe:
  static void stop_wakeup_tracer(struct trace_array *tr)
  {
         tracer_enabled = 0;
-       unregister_ftrace_function(&trace_ops);
+       stop_func_tracer(is_graph());
         unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
         unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
         unregister_trace_sched_wakeup(probe_wakeup, NULL);
@@ -379,9 +575,15 @@ static struct tracer wakeup_tracer __read_mostly =
         .start          = wakeup_tracer_start,
         .stop           = wakeup_tracer_stop,
         .print_max      = 1,
+       .print_header   = wakeup_print_header,
+       .print_line     = wakeup_print_line,
+       .flags          = &tracer_flags,
+       .set_flag       = wakeup_set_flag,
  #ifdef CONFIG_FTRACE_SELFTEST
         .selftest    = trace_selftest_startup_wakeup,
  #endif
+       .open           = wakeup_trace_open,
+       .close          = wakeup_trace_close,
         .use_max_tr     = 1,
  };
  
@@ -394,9 +596,15 @@ static struct tracer wakeup_rt_tracer __read_mostly =
         .stop           = wakeup_tracer_stop,
         .wait_pipe      = poll_wait_pipe,
         .print_max      = 1,
+       .print_header   = wakeup_print_header,
+       .print_line     = wakeup_print_line,
+       .flags          = &tracer_flags,
+       .set_flag       = wakeup_set_flag,
  #ifdef CONFIG_FTRACE_SELFTEST
         .selftest    = trace_selftest_startup_wakeup,
  #endif
+       .open           = wakeup_trace_open,
+       .close          = wakeup_trace_close,
         .use_max_tr     = 1,
  };
  
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c

index a7cc3793baf6897d2535737a52322552040d2737..209b379a47210dad4170e5914e5ecc68207f8784 100644 (file)
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -263,6 +263,11 @@ int __init trace_workqueue_early_init(void)
  {
         int ret, cpu;
  
+       for_each_possible_cpu(cpu) {
+               spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
+               INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
+       }
+
         ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
         if (ret)
                 goto out;
@@ -279,11 +284,6 @@ int __init trace_workqueue_early_init(void)
         if (ret)
                 goto no_creation;
  
-       for_each_possible_cpu(cpu) {
-               spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
-               INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
-       }
-
         return 0;
  
  no_creation:
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c

index c77f3eceea250e49b0ff001959f8df9eeb8e0716..e95ee7f31d43309949893d9e63d1703e619a6ca9 100644 (file)
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -25,6 +25,7 @@
  #include <linux/err.h>
  #include <linux/slab.h>
  #include <linux/sched.h>
+#include <linux/jump_label.h>
  
  extern struct tracepoint __start___tracepoints[];
  extern struct tracepoint __stop___tracepoints[];
@@ -263,7 +264,13 @@ static void set_tracepoint(struct tracepoint_entry **entry,
          * is used.
          */
         rcu_assign_pointer(elem->funcs, (*entry)->funcs);
-       elem->state = active;
+       if (!elem->state && active) {
+               jump_label_enable(&elem->state);
+               elem->state = active;
+       } else if (elem->state && !active) {
+               jump_label_disable(&elem->state);
+               elem->state = active;
+       }
  }
  
  /*
@@ -277,7 +284,10 @@ static void disable_tracepoint(struct tracepoint *elem)
         if (elem->unregfunc && elem->state)
                 elem->unregfunc();
  
-       elem->state = 0;
+       if (elem->state) {
+               jump_label_disable(&elem->state);
+               elem->state = 0;
+       }
         rcu_assign_pointer(elem->funcs, NULL);
  }
  
diff --git a/kernel/watchdog.c b/kernel/watchdog.c

index 7f9c3c52ecc12ef5d0de1728839218ff87c2dc7b..bafba687a6d849a11f0201acd3d7ae5e63e9301e 100644 (file)
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -43,7 +43,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
  static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
  #endif
  
-static int __read_mostly did_panic;
  static int __initdata no_watchdog;
  
  
@@ -187,18 +186,6 @@ static int is_softlockup(unsigned long touch_ts)
         return 0;
  }
  
-static int
-watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr)
-{
-       did_panic = 1;
-
-       return NOTIFY_DONE;
-}
-
-static struct notifier_block panic_block = {
-       .notifier_call = watchdog_panic,
-};
-
  #ifdef CONFIG_HARDLOCKUP_DETECTOR
  static struct perf_event_attr wd_hw_attr = {
         .type           = PERF_TYPE_HARDWARE,
@@ -209,7 +196,7 @@ static struct perf_event_attr wd_hw_attr = {
  };
  
  /* Callback function for perf event subsystem */
-void watchdog_overflow_callback(struct perf_event *event, int nmi,
+static void watchdog_overflow_callback(struct perf_event *event, int nmi,
                  struct perf_sample_data *data,
                  struct pt_regs *regs)
  {
@@ -371,14 +358,14 @@ static int watchdog_nmi_enable(int cpu)
         /* Try to register using hardware perf events */
         wd_attr = &wd_hw_attr;
         wd_attr->sample_period = hw_nmi_get_sample_period();
-       event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback);
+       event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
         if (!IS_ERR(event)) {
                 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
                 goto out_save;
         }
  
         printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event);
-       return -1;
+       return PTR_ERR(event);
  
         /* success path */
  out_save:
@@ -422,17 +409,19 @@ static int watchdog_prepare_cpu(int cpu)
  static int watchdog_enable(int cpu)
  {
         struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
+       int err;
  
         /* enable the perf event */
-       if (watchdog_nmi_enable(cpu) != 0)
-               return -1;
+       err = watchdog_nmi_enable(cpu);
+       if (err)
+               return err;
  
         /* create the watchdog thread */
         if (!p) {
                 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
                 if (IS_ERR(p)) {
                         printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
-                       return -1;
+                       return PTR_ERR(p);
                 }
                 kthread_bind(p, cpu);
                 per_cpu(watchdog_touch_ts, cpu) = 0;
@@ -484,6 +473,9 @@ static void watchdog_disable_all_cpus(void)
  {
         int cpu;
  
+       if (no_watchdog)
+               return;
+
         for_each_online_cpu(cpu)
                 watchdog_disable(cpu);
  
@@ -526,17 +518,16 @@ static int __cpuinit
  cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
  {
         int hotcpu = (unsigned long)hcpu;
+       int err = 0;
  
         switch (action) {
         case CPU_UP_PREPARE:
         case CPU_UP_PREPARE_FROZEN:
-               if (watchdog_prepare_cpu(hotcpu))
-                       return NOTIFY_BAD;
+               err = watchdog_prepare_cpu(hotcpu);
                 break;
         case CPU_ONLINE:
         case CPU_ONLINE_FROZEN:
-               if (watchdog_enable(hotcpu))
-                       return NOTIFY_BAD;
+               err = watchdog_enable(hotcpu);
                 break;
  #ifdef CONFIG_HOTPLUG_CPU
         case CPU_UP_CANCELED:
@@ -549,7 +540,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 break;
  #endif /* CONFIG_HOTPLUG_CPU */
         }
-       return NOTIFY_OK;
+       return notifier_from_errno(err);
  }
  
  static struct notifier_block __cpuinitdata cpu_nfb = {
@@ -565,13 +556,11 @@ static int __init spawn_watchdog_task(void)
                 return 0;
  
         err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
-       WARN_ON(err == NOTIFY_BAD);
+       WARN_ON(notifier_to_errno(err));
  
         cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
         register_cpu_notifier(&cpu_nfb);
  
-       atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
-
         return 0;
  }
  early_initcall(spawn_watchdog_task);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug

index 1b4afd2e6ca089de0babdacc5781426ef118da5c..21ac83070a805030150bab1a229be48c3f79485a 100644 (file)
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -482,6 +482,7 @@ config PROVE_LOCKING
         select DEBUG_SPINLOCK
         select DEBUG_MUTEXES
         select DEBUG_LOCK_ALLOC
+       select TRACE_IRQFLAGS
         default n
         help
          This feature enables the kernel to prove that all locking
@@ -539,6 +540,23 @@ config PROVE_RCU_REPEATEDLY
          disabling, allowing multiple RCU-lockdep warnings to be printed
          on a single reboot.
  
+        Say Y to allow multiple RCU-lockdep warnings per boot.
+
+        Say N if you are unsure.
+
+config SPARSE_RCU_POINTER
+       bool "RCU debugging: sparse-based checks for pointer usage"
+       default n
+       help
+        This feature enables the __rcu sparse annotation for
+        RCU-protected pointers.  This annotation will cause sparse
+        to flag any non-RCU used of annotated pointers.  This can be
+        helpful when debugging RCU usage.  Please note that this feature
+        is not intended to enforce code cleanliness; it is instead merely
+        a debugging aid.
+
+        Say Y to make sparse flag questionable use of RCU-protected pointers
+
          Say N if you are unsure.
  
  config LOCKDEP
@@ -579,11 +597,10 @@ config DEBUG_LOCKDEP
           of more runtime overhead.
  
  config TRACE_IRQFLAGS
-       depends on DEBUG_KERNEL
         bool
-       default y
-       depends on TRACE_IRQFLAGS_SUPPORT
-       depends on PROVE_LOCKING
+       help
+         Enables hooks to interrupt enabling and disabling for
+         either tracing or lock debugging.
  
  config DEBUG_SPINLOCK_SLEEP
         bool "Spinlock debugging: sleep-inside-spinlock checking"
@@ -832,6 +849,30 @@ config RCU_CPU_STALL_DETECTOR
  
           Say Y if you are unsure.
  
+config RCU_CPU_STALL_TIMEOUT
+       int "RCU CPU stall timeout in seconds"
+       depends on RCU_CPU_STALL_DETECTOR
+       range 3 300
+       default 60
+       help
+         If a given RCU grace period extends more than the specified
+         number of seconds, a CPU stall warning is printed.  If the
+         RCU grace period persists, additional CPU stall warnings are
+         printed at more widely spaced intervals.
+
+config RCU_CPU_STALL_DETECTOR_RUNNABLE
+       bool "RCU CPU stall checking starts automatically at boot"
+       depends on RCU_CPU_STALL_DETECTOR
+       default y
+       help
+         If set, start checking for RCU CPU stalls immediately on
+         boot.  Otherwise, RCU CPU stall checking must be manually
+         enabled.
+
+         Say Y if you are unsure.
+
+         Say N if you wish to suppress RCU CPU stall checking during boot.
+
  config RCU_CPU_STALL_VERBOSE
         bool "Print additional per-task information for RCU_CPU_STALL_DETECTOR"
         depends on RCU_CPU_STALL_DETECTOR && TREE_PREEMPT_RCU
diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c

index 02afc25337284329d23a998e636240d5a41d337d..7bd6df781ce50198689fae375eef696751c0a47a 100644 (file)
--- a/lib/dynamic_debug.c
+++ b/lib/dynamic_debug.c
@@ -26,19 +26,11 @@
  #include <linux/dynamic_debug.h>
  #include <linux/debugfs.h>
  #include <linux/slab.h>
+#include <linux/jump_label.h>
  
  extern struct _ddebug __start___verbose[];
  extern struct _ddebug __stop___verbose[];
  
-/* dynamic_debug_enabled, and dynamic_debug_enabled2 are bitmasks in which
- * bit n is set to 1 if any modname hashes into the bucket n, 0 otherwise. They
- * use independent hash functions, to reduce the chance of false positives.
- */
-long long dynamic_debug_enabled;
-EXPORT_SYMBOL_GPL(dynamic_debug_enabled);
-long long dynamic_debug_enabled2;
-EXPORT_SYMBOL_GPL(dynamic_debug_enabled2);
-
  struct ddebug_table {
         struct list_head link;
         char *mod_name;
@@ -87,26 +79,6 @@ static char *ddebug_describe_flags(struct _ddebug *dp, char *buf,
         return buf;
  }
  
-/*
- * must be called with ddebug_lock held
- */
-
-static int disabled_hash(char hash, bool first_table)
-{
-       struct ddebug_table *dt;
-       char table_hash_value;
-
-       list_for_each_entry(dt, &ddebug_tables, link) {
-               if (first_table)
-                       table_hash_value = dt->ddebugs->primary_hash;
-               else
-                       table_hash_value = dt->ddebugs->secondary_hash;
-               if (dt->num_enabled && (hash == table_hash_value))
-                       return 0;
-       }
-       return 1;
-}
-
  /*
   * Search the tables for _ddebug's which match the given
   * `query' and apply the `flags' and `mask' to them.  Tells
@@ -170,17 +142,9 @@ static void ddebug_change(const struct ddebug_query *query,
                                 dt->num_enabled++;
                         dp->flags = newflags;
                         if (newflags) {
-                               dynamic_debug_enabled |=
-                                               (1LL << dp->primary_hash);
-                               dynamic_debug_enabled2 |=
-                                               (1LL << dp->secondary_hash);
+                               jump_label_enable(&dp->enabled);
                         } else {
-                               if (disabled_hash(dp->primary_hash, true))
-                                       dynamic_debug_enabled &=
-                                               ~(1LL << dp->primary_hash);
-                               if (disabled_hash(dp->secondary_hash, false))
-                                       dynamic_debug_enabled2 &=
-                                               ~(1LL << dp->secondary_hash);
+                               jump_label_disable(&dp->enabled);
                         }
                         if (verbose)
                                 printk(KERN_INFO
diff --git a/lib/radix-tree.c b/lib/radix-tree.c

index efd16fa80b1cfd55f2e1f1295f1cd45765925abf..6f412ab4c24f812fc8b7290c4a1e06cc0250ea62 100644 (file)
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -49,7 +49,7 @@ struct radix_tree_node {
         unsigned int    height;         /* Height from the bottom */
         unsigned int    count;
         struct rcu_head rcu_head;
-       void            *slots[RADIX_TREE_MAP_SIZE];
+       void __rcu      *slots[RADIX_TREE_MAP_SIZE];
         unsigned long   tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
  };
  
diff --git a/net/Kconfig b/net/Kconfig

index e926884c1675c04c3d150ddc83eaa89f91a0cb4a..55fd82e9ffd91e9fd48878147f3068923373ce16 100644 (file)
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -293,6 +293,7 @@ source "net/wimax/Kconfig"
  source "net/rfkill/Kconfig"
  source "net/9p/Kconfig"
  source "net/caif/Kconfig"
+source "net/ceph/Kconfig"
  
  
  endif   # if NET
diff --git a/net/Makefile b/net/Makefile

index ea60fbce9b1ba3e623ee9f1ec9ce622169a3596d..6b7bfd7f1416d9950e90cb3ddd065d998c0e78ee 100644 (file)
--- a/net/Makefile
+++ b/net/Makefile
@@ -68,3 +68,4 @@ obj-$(CONFIG_SYSCTL)          += sysctl_net.o
  endif
  obj-$(CONFIG_WIMAX)            += wimax/
  obj-$(CONFIG_DNS_RESOLVER)     += dns_resolver/
+obj-$(CONFIG_CEPH_LIB)         += ceph/
diff --git a/net/atm/mpc.c b/net/atm/mpc.c

index 622b471e14e03dbc3752697851022a59aebffbe0..74bcc662c3dd8c5e7ec33a393560cc9bbf313b57 100644 (file)
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -778,7 +778,7 @@ static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb)
         eg->packets_rcvd++;
         mpc->eg_ops->put(eg);
  
-       memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data));
+       memset(ATM_SKB(new_skb), 0, sizeof(struct atm_skb_data));
         netif_rx(new_skb);
  }
  
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig

new file mode 100644 (file)

index 0000000..ad42404
--- /dev/null
+++ b/net/ceph/Kconfig
@@ -0,0 +1,28 @@
+config CEPH_LIB
+        tristate "Ceph core library (EXPERIMENTAL)"
+       depends on INET && EXPERIMENTAL
+       select LIBCRC32C
+       select CRYPTO_AES
+       select CRYPTO
+       default n
+       help
+         Choose Y or M here to include cephlib, which provides the
+         common functionality to both the Ceph filesystem and
+         to the rados block device (rbd).
+
+         More information at http://ceph.newdream.net/.
+
+         If unsure, say N.
+
+config CEPH_LIB_PRETTYDEBUG
+       bool "Include file:line in ceph debug output"
+       depends on CEPH_LIB
+       default n
+       help
+         If you say Y here, debug output will include a filename and
+         line to aid debugging.  This increases kernel size and slows
+         execution slightly when debug call sites are enabled (e.g.,
+         via CONFIG_DYNAMIC_DEBUG).
+
+         If unsure, say N.
+
diff --git a/net/ceph/Makefile b/net/ceph/Makefile

new file mode 100644 (file)

index 0000000..aab1cab
--- /dev/null
+++ b/net/ceph/Makefile
@@ -0,0 +1,37 @@
+#
+# Makefile for CEPH filesystem.
+#
+
+ifneq ($(KERNELRELEASE),)
+
+obj-$(CONFIG_CEPH_LIB) += libceph.o
+
+libceph-objs := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
+       mon_client.o \
+       osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
+       debugfs.o \
+       auth.o auth_none.o \
+       crypto.o armor.o \
+       auth_x.o \
+       ceph_fs.o ceph_strings.o ceph_hash.o \
+       pagevec.o
+
+else
+#Otherwise we were called directly from the command
+# line; invoke the kernel build system.
+
+KERNELDIR ?= /lib/modules/$(shell uname -r)/build
+PWD := $(shell pwd)
+
+default: all
+
+all:
+       $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules
+
+modules_install:
+       $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules_install
+
+clean:
+       $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
+
+endif
diff --git a/fs/ceph/armor.c b/net/ceph/armor.c

similarity index 100%

rename from fs/ceph/armor.c

rename to net/ceph/armor.c
diff --git a/fs/ceph/auth.c b/net/ceph/auth.c

similarity index 97%

rename from fs/ceph/auth.c

rename to net/ceph/auth.c

index 6d2e30600627e71d25c2f93d5862952a078472b8..549c1f43e1d53b134c045954e103260fee9033b7 100644 (file)
--- a/fs/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -1,16 +1,16 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/module.h>
  #include <linux/err.h>
  #include <linux/slab.h>
  
-#include "types.h"
+#include <linux/ceph/types.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
  #include "auth_none.h"
  #include "auth_x.h"
-#include "decode.h"
-#include "super.h"
  
-#include "messenger.h"
  
  /*
   * get protocol handler
diff --git a/fs/ceph/auth_none.c b/net/ceph/auth_none.c

similarity index 96%

rename from fs/ceph/auth_none.c

rename to net/ceph/auth_none.c

index ad1dc21286c7f3ca9c84d75d7a2630bc0069dae4..214c2bb43d6252056a7a77dcd2d8eb5877c719d1 100644 (file)
--- a/fs/ceph/auth_none.c
+++ b/net/ceph/auth_none.c
@@ -1,14 +1,15 @@
  
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/err.h>
  #include <linux/module.h>
  #include <linux/random.h>
  #include <linux/slab.h>
  
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+
  #include "auth_none.h"
-#include "auth.h"
-#include "decode.h"
  
  static void reset(struct ceph_auth_client *ac)
  {
diff --git a/fs/ceph/auth_none.h b/net/ceph/auth_none.h

similarity index 94%

rename from fs/ceph/auth_none.h

rename to net/ceph/auth_none.h

index 8164df1a08be5060aa065106bdbeb539f9caabec..ed7d088b1bc92e14372e2f24c8f5016750e07014 100644 (file)
--- a/fs/ceph/auth_none.h
+++ b/net/ceph/auth_none.h
@@ -2,8 +2,7 @@
  #define _FS_CEPH_AUTH_NONE_H
  
  #include <linux/slab.h>
-
-#include "auth.h"
+#include <linux/ceph/auth.h>
  
  /*
   * null security mode.
diff --git a/fs/ceph/auth_x.c b/net/ceph/auth_x.c

similarity index 99%

rename from fs/ceph/auth_x.c

rename to net/ceph/auth_x.c

index a2d002cbdec23d15d30624a345ae6090ea452a53..7fd5dfcf6e188551f9a25638129239d79df9d874 100644 (file)
--- a/fs/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -1,16 +1,17 @@
  
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/err.h>
  #include <linux/module.h>
  #include <linux/random.h>
  #include <linux/slab.h>
  
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+
+#include "crypto.h"
  #include "auth_x.h"
  #include "auth_x_protocol.h"
-#include "crypto.h"
-#include "auth.h"
-#include "decode.h"
  
  #define TEMP_TICKET_BUF_LEN    256
  
diff --git a/fs/ceph/auth_x.h b/net/ceph/auth_x.h

similarity index 96%

rename from fs/ceph/auth_x.h

rename to net/ceph/auth_x.h

index ff6f8180e6816a352ca5184d8db1b47e110b64f4..e02da7a5c5a1052b881f383e7fca318da0675421 100644 (file)
--- a/fs/ceph/auth_x.h
+++ b/net/ceph/auth_x.h
@@ -3,8 +3,9 @@
  
  #include <linux/rbtree.h>
  
+#include <linux/ceph/auth.h>
+
  #include "crypto.h"
-#include "auth.h"
  #include "auth_x_protocol.h"
  
  /*
diff --git a/fs/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h

similarity index 100%

rename from fs/ceph/auth_x_protocol.h

rename to net/ceph/auth_x_protocol.h
diff --git a/fs/ceph/buffer.c b/net/ceph/buffer.c

similarity index 86%

rename from fs/ceph/buffer.c

rename to net/ceph/buffer.c

index cd39f17021de9ab6cde35ce444a727970f4f8fab..53d8abfa25d5ede4b487c88a5341b376c4921c26 100644 (file)
--- a/fs/ceph/buffer.c
+++ b/net/ceph/buffer.c
@@ -1,10 +1,11 @@
  
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
+#include <linux/module.h>
  #include <linux/slab.h>
  
-#include "buffer.h"
-#include "decode.h"
+#include <linux/ceph/buffer.h>
+#include <linux/ceph/decode.h>
  
  struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
  {
@@ -32,6 +33,7 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
         dout("buffer_new %p\n", b);
         return b;
  }
+EXPORT_SYMBOL(ceph_buffer_new);
  
  void ceph_buffer_release(struct kref *kref)
  {
@@ -46,6 +48,7 @@ void ceph_buffer_release(struct kref *kref)
         }
         kfree(b);
  }
+EXPORT_SYMBOL(ceph_buffer_release);
  
  int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
  {
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c

new file mode 100644 (file)

index 0000000..f3e4a13
--- /dev/null
+++ b/net/ceph/ceph_common.c
@@ -0,0 +1,529 @@
+
+#include <linux/ceph/ceph_debug.h>
+#include <linux/backing-dev.h>
+#include <linux/ctype.h>
+#include <linux/fs.h>
+#include <linux/inet.h>
+#include <linux/in6.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/parser.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/statfs.h>
+#include <linux/string.h>
+
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/debugfs.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+
+
+
+/*
+ * find filename portion of a path (/foo/bar/baz -> baz)
+ */
+const char *ceph_file_part(const char *s, int len)
+{
+       const char *e = s + len;
+
+       while (e != s && *(e-1) != '/')
+               e--;
+       return e;
+}
+EXPORT_SYMBOL(ceph_file_part);
+
+const char *ceph_msg_type_name(int type)
+{
+       switch (type) {
+       case CEPH_MSG_SHUTDOWN: return "shutdown";
+       case CEPH_MSG_PING: return "ping";
+       case CEPH_MSG_AUTH: return "auth";
+       case CEPH_MSG_AUTH_REPLY: return "auth_reply";
+       case CEPH_MSG_MON_MAP: return "mon_map";
+       case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
+       case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
+       case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
+       case CEPH_MSG_STATFS: return "statfs";
+       case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
+       case CEPH_MSG_MDS_MAP: return "mds_map";
+       case CEPH_MSG_CLIENT_SESSION: return "client_session";
+       case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
+       case CEPH_MSG_CLIENT_REQUEST: return "client_request";
+       case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
+       case CEPH_MSG_CLIENT_REPLY: return "client_reply";
+       case CEPH_MSG_CLIENT_CAPS: return "client_caps";
+       case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
+       case CEPH_MSG_CLIENT_SNAP: return "client_snap";
+       case CEPH_MSG_CLIENT_LEASE: return "client_lease";
+       case CEPH_MSG_OSD_MAP: return "osd_map";
+       case CEPH_MSG_OSD_OP: return "osd_op";
+       case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
+       default: return "unknown";
+       }
+}
+EXPORT_SYMBOL(ceph_msg_type_name);
+
+/*
+ * Initially learn our fsid, or verify an fsid matches.
+ */
+int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
+{
+       if (client->have_fsid) {
+               if (ceph_fsid_compare(&client->fsid, fsid)) {
+                       pr_err("bad fsid, had %pU got %pU",
+                              &client->fsid, fsid);
+                       return -1;
+               }
+       } else {
+               pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid);
+               memcpy(&client->fsid, fsid, sizeof(*fsid));
+               ceph_debugfs_client_init(client);
+               client->have_fsid = true;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(ceph_check_fsid);
+
+static int strcmp_null(const char *s1, const char *s2)
+{
+       if (!s1 && !s2)
+               return 0;
+       if (s1 && !s2)
+               return -1;
+       if (!s1 && s2)
+               return 1;
+       return strcmp(s1, s2);
+}
+
+int ceph_compare_options(struct ceph_options *new_opt,
+                        struct ceph_client *client)
+{
+       struct ceph_options *opt1 = new_opt;
+       struct ceph_options *opt2 = client->options;
+       int ofs = offsetof(struct ceph_options, mon_addr);
+       int i;
+       int ret;
+
+       ret = memcmp(opt1, opt2, ofs);
+       if (ret)
+               return ret;
+
+       ret = strcmp_null(opt1->name, opt2->name);
+       if (ret)
+               return ret;
+
+       ret = strcmp_null(opt1->secret, opt2->secret);
+       if (ret)
+               return ret;
+
+       /* any matching mon ip implies a match */
+       for (i = 0; i < opt1->num_mon; i++) {
+               if (ceph_monmap_contains(client->monc.monmap,
+                                &opt1->mon_addr[i]))
+                       return 0;
+       }
+       return -1;
+}
+EXPORT_SYMBOL(ceph_compare_options);
+
+
+static int parse_fsid(const char *str, struct ceph_fsid *fsid)
+{
+       int i = 0;
+       char tmp[3];
+       int err = -EINVAL;
+       int d;
+
+       dout("parse_fsid '%s'\n", str);
+       tmp[2] = 0;
+       while (*str && i < 16) {
+               if (ispunct(*str)) {
+                       str++;
+                       continue;
+               }
+               if (!isxdigit(str[0]) || !isxdigit(str[1]))
+                       break;
+               tmp[0] = str[0];
+               tmp[1] = str[1];
+               if (sscanf(tmp, "%x", &d) < 1)
+                       break;
+               fsid->fsid[i] = d & 0xff;
+               i++;
+               str += 2;
+       }
+
+       if (i == 16)
+               err = 0;
+       dout("parse_fsid ret %d got fsid %pU", err, fsid);
+       return err;
+}
+
+/*
+ * ceph options
+ */
+enum {
+       Opt_osdtimeout,
+       Opt_osdkeepalivetimeout,
+       Opt_mount_timeout,
+       Opt_osd_idle_ttl,
+       Opt_last_int,
+       /* int args above */
+       Opt_fsid,
+       Opt_name,
+       Opt_secret,
+       Opt_ip,
+       Opt_last_string,
+       /* string args above */
+       Opt_noshare,
+       Opt_nocrc,
+};
+
+static match_table_t opt_tokens = {
+       {Opt_osdtimeout, "osdtimeout=%d"},
+       {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
+       {Opt_mount_timeout, "mount_timeout=%d"},
+       {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
+       /* int args above */
+       {Opt_fsid, "fsid=%s"},
+       {Opt_name, "name=%s"},
+       {Opt_secret, "secret=%s"},
+       {Opt_ip, "ip=%s"},
+       /* string args above */
+       {Opt_noshare, "noshare"},
+       {Opt_nocrc, "nocrc"},
+       {-1, NULL}
+};
+
+void ceph_destroy_options(struct ceph_options *opt)
+{
+       dout("destroy_options %p\n", opt);
+       kfree(opt->name);
+       kfree(opt->secret);
+       kfree(opt);
+}
+EXPORT_SYMBOL(ceph_destroy_options);
+
+int ceph_parse_options(struct ceph_options **popt, char *options,
+                      const char *dev_name, const char *dev_name_end,
+                      int (*parse_extra_token)(char *c, void *private),
+                      void *private)
+{
+       struct ceph_options *opt;
+       const char *c;
+       int err = -ENOMEM;
+       substring_t argstr[MAX_OPT_ARGS];
+
+       opt = kzalloc(sizeof(*opt), GFP_KERNEL);
+       if (!opt)
+               return err;
+       opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
+                               GFP_KERNEL);
+       if (!opt->mon_addr)
+               goto out;
+
+       dout("parse_options %p options '%s' dev_name '%s'\n", opt, options,
+            dev_name);
+
+       /* start with defaults */
+       opt->flags = CEPH_OPT_DEFAULT;
+       opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
+       opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
+       opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
+       opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;   /* seconds */
+
+       /* get mon ip(s) */
+       /* ip1[:port1][,ip2[:port2]...] */
+       err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr,
+                            CEPH_MAX_MON, &opt->num_mon);
+       if (err < 0)
+               goto out;
+
+       /* parse mount options */
+       while ((c = strsep(&options, ",")) != NULL) {
+               int token, intval, ret;
+               if (!*c)
+                       continue;
+               err = -EINVAL;
+               token = match_token((char *)c, opt_tokens, argstr);
+               if (token < 0 && parse_extra_token) {
+                       /* extra? */
+                       err = parse_extra_token((char *)c, private);
+                       if (err < 0) {
+                               pr_err("bad option at '%s'\n", c);
+                               goto out;
+                       }
+                       continue;
+               }
+               if (token < Opt_last_int) {
+                       ret = match_int(&argstr[0], &intval);
+                       if (ret < 0) {
+                               pr_err("bad mount option arg (not int) "
+                                      "at '%s'\n", c);
+                               continue;
+                       }
+                       dout("got int token %d val %d\n", token, intval);
+               } else if (token > Opt_last_int && token < Opt_last_string) {
+                       dout("got string token %d val %s\n", token,
+                            argstr[0].from);
+               } else {
+                       dout("got token %d\n", token);
+               }
+               switch (token) {
+               case Opt_ip:
+                       err = ceph_parse_ips(argstr[0].from,
+                                            argstr[0].to,
+                                            &opt->my_addr,
+                                            1, NULL);
+                       if (err < 0)
+                               goto out;
+                       opt->flags |= CEPH_OPT_MYIP;
+                       break;
+
+               case Opt_fsid:
+                       err = parse_fsid(argstr[0].from, &opt->fsid);
+                       if (err == 0)
+                               opt->flags |= CEPH_OPT_FSID;
+                       break;
+               case Opt_name:
+                       opt->name = kstrndup(argstr[0].from,
+                                             argstr[0].to-argstr[0].from,
+                                             GFP_KERNEL);
+                       break;
+               case Opt_secret:
+                       opt->secret = kstrndup(argstr[0].from,
+                                               argstr[0].to-argstr[0].from,
+                                               GFP_KERNEL);
+                       break;
+
+                       /* misc */
+               case Opt_osdtimeout:
+                       opt->osd_timeout = intval;
+                       break;
+               case Opt_osdkeepalivetimeout:
+                       opt->osd_keepalive_timeout = intval;
+                       break;
+               case Opt_osd_idle_ttl:
+                       opt->osd_idle_ttl = intval;
+                       break;
+               case Opt_mount_timeout:
+                       opt->mount_timeout = intval;
+                       break;
+
+               case Opt_noshare:
+                       opt->flags |= CEPH_OPT_NOSHARE;
+                       break;
+
+               case Opt_nocrc:
+                       opt->flags |= CEPH_OPT_NOCRC;
+                       break;
+
+               default:
+                       BUG_ON(token);
+               }
+       }
+
+       /* success */
+       *popt = opt;
+       return 0;
+
+out:
+       ceph_destroy_options(opt);
+       return err;
+}
+EXPORT_SYMBOL(ceph_parse_options);
+
+u64 ceph_client_id(struct ceph_client *client)
+{
+       return client->monc.auth->global_id;
+}
+EXPORT_SYMBOL(ceph_client_id);
+
+/*
+ * create a fresh client instance
+ */
+struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private)
+{
+       struct ceph_client *client;
+       int err = -ENOMEM;
+
+       client = kzalloc(sizeof(*client), GFP_KERNEL);
+       if (client == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       client->private = private;
+       client->options = opt;
+
+       mutex_init(&client->mount_mutex);
+       init_waitqueue_head(&client->auth_wq);
+       client->auth_err = 0;
+
+       client->extra_mon_dispatch = NULL;
+       client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT;
+       client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT;
+
+       client->msgr = NULL;
+
+       /* subsystems */
+       err = ceph_monc_init(&client->monc, client);
+       if (err < 0)
+               goto fail;
+       err = ceph_osdc_init(&client->osdc, client);
+       if (err < 0)
+               goto fail_monc;
+
+       return client;
+
+fail_monc:
+       ceph_monc_stop(&client->monc);
+fail:
+       kfree(client);
+       return ERR_PTR(err);
+}
+EXPORT_SYMBOL(ceph_create_client);
+
+void ceph_destroy_client(struct ceph_client *client)
+{
+       dout("destroy_client %p\n", client);
+
+       /* unmount */
+       ceph_osdc_stop(&client->osdc);
+
+       /*
+        * make sure mds and osd connections close out before destroying
+        * the auth module, which is needed to free those connections'
+        * ceph_authorizers.
+        */
+       ceph_msgr_flush();
+
+       ceph_monc_stop(&client->monc);
+
+       ceph_debugfs_client_cleanup(client);
+
+       if (client->msgr)
+               ceph_messenger_destroy(client->msgr);
+
+       ceph_destroy_options(client->options);
+
+       kfree(client);
+       dout("destroy_client %p done\n", client);
+}
+EXPORT_SYMBOL(ceph_destroy_client);
+
+/*
+ * true if we have the mon map (and have thus joined the cluster)
+ */
+static int have_mon_and_osd_map(struct ceph_client *client)
+{
+       return client->monc.monmap && client->monc.monmap->epoch &&
+              client->osdc.osdmap && client->osdc.osdmap->epoch;
+}
+
+/*
+ * mount: join the ceph cluster, and open root directory.
+ */
+int __ceph_open_session(struct ceph_client *client, unsigned long started)
+{
+       struct ceph_entity_addr *myaddr = NULL;
+       int err;
+       unsigned long timeout = client->options->mount_timeout * HZ;
+
+       /* initialize the messenger */
+       if (client->msgr == NULL) {
+               if (ceph_test_opt(client, MYIP))
+                       myaddr = &client->options->my_addr;
+               client->msgr = ceph_messenger_create(myaddr,
+                                       client->supported_features,
+                                       client->required_features);
+               if (IS_ERR(client->msgr)) {
+                       client->msgr = NULL;
+                       return PTR_ERR(client->msgr);
+               }
+               client->msgr->nocrc = ceph_test_opt(client, NOCRC);
+       }
+
+       /* open session, and wait for mon and osd maps */
+       err = ceph_monc_open_session(&client->monc);
+       if (err < 0)
+               return err;
+
+       while (!have_mon_and_osd_map(client)) {
+               err = -EIO;
+               if (timeout && time_after_eq(jiffies, started + timeout))
+                       return err;
+
+               /* wait */
+               dout("mount waiting for mon_map\n");
+               err = wait_event_interruptible_timeout(client->auth_wq,
+                       have_mon_and_osd_map(client) || (client->auth_err < 0),
+                       timeout);
+               if (err == -EINTR || err == -ERESTARTSYS)
+                       return err;
+               if (client->auth_err < 0)
+                       return client->auth_err;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(__ceph_open_session);
+
+
+int ceph_open_session(struct ceph_client *client)
+{
+       int ret;
+       unsigned long started = jiffies;  /* note the start time */
+
+       dout("open_session start\n");
+       mutex_lock(&client->mount_mutex);
+
+       ret = __ceph_open_session(client, started);
+
+       mutex_unlock(&client->mount_mutex);
+       return ret;
+}
+EXPORT_SYMBOL(ceph_open_session);
+
+
+static int __init init_ceph_lib(void)
+{
+       int ret = 0;
+
+       ret = ceph_debugfs_init();
+       if (ret < 0)
+               goto out;
+
+       ret = ceph_msgr_init();
+       if (ret < 0)
+               goto out_debugfs;
+
+       pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n",
+               CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL,
+               CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
+               CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
+
+       return 0;
+
+out_debugfs:
+       ceph_debugfs_cleanup();
+out:
+       return ret;
+}
+
+static void __exit exit_ceph_lib(void)
+{
+       dout("exit_ceph_lib\n");
+       ceph_msgr_exit();
+       ceph_debugfs_cleanup();
+}
+
+module_init(init_ceph_lib);
+module_exit(exit_ceph_lib);
+
+MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
+MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
+MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
+MODULE_DESCRIPTION("Ceph filesystem for Linux");
+MODULE_LICENSE("GPL");
diff --git a/fs/ceph/ceph_fs.c b/net/ceph/ceph_fs.c

similarity index 92%

rename from fs/ceph/ceph_fs.c

rename to net/ceph/ceph_fs.c

index 3ac6cc7c1156dafd78262e7290edf914c3f01e31..a3a3a31d3c37b0006d0e9bfe26fd06a7ebe3b47d 100644 (file)
--- a/fs/ceph/ceph_fs.c
+++ b/net/ceph/ceph_fs.c
@@ -1,7 +1,8 @@
  /*
   * Some non-inline ceph helpers
   */
-#include "types.h"
+#include <linux/module.h>
+#include <linux/ceph/types.h>
  
  /*
   * return true if @layout appears to be valid
@@ -52,6 +53,7 @@ int ceph_flags_to_mode(int flags)
  
         return mode;
  }
+EXPORT_SYMBOL(ceph_flags_to_mode);
  
  int ceph_caps_for_mode(int mode)
  {
@@ -70,3 +72,4 @@ int ceph_caps_for_mode(int mode)
  
         return caps;
  }
+EXPORT_SYMBOL(ceph_caps_for_mode);
diff --git a/fs/ceph/ceph_hash.c b/net/ceph/ceph_hash.c

similarity index 98%

rename from fs/ceph/ceph_hash.c

rename to net/ceph/ceph_hash.c

index bd570015d147ca2b1ce52061d359a56bb936a4df..815ef8826796a82b6b80030830a1663e9421b417 100644 (file)
--- a/fs/ceph/ceph_hash.c
+++ b/net/ceph/ceph_hash.c
@@ -1,5 +1,5 @@
  
-#include "types.h"
+#include <linux/ceph/types.h>
  
  /*
   * Robert Jenkin's hash function.
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c

new file mode 100644 (file)

index 0000000..3fbda04
--- /dev/null
+++ b/net/ceph/ceph_strings.c
@@ -0,0 +1,84 @@
+/*
+ * Ceph string constants
+ */
+#include <linux/module.h>
+#include <linux/ceph/types.h>
+
+const char *ceph_entity_type_name(int type)
+{
+       switch (type) {
+       case CEPH_ENTITY_TYPE_MDS: return "mds";
+       case CEPH_ENTITY_TYPE_OSD: return "osd";
+       case CEPH_ENTITY_TYPE_MON: return "mon";
+       case CEPH_ENTITY_TYPE_CLIENT: return "client";
+       case CEPH_ENTITY_TYPE_AUTH: return "auth";
+       default: return "unknown";
+       }
+}
+
+const char *ceph_osd_op_name(int op)
+{
+       switch (op) {
+       case CEPH_OSD_OP_READ: return "read";
+       case CEPH_OSD_OP_STAT: return "stat";
+
+       case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
+
+       case CEPH_OSD_OP_WRITE: return "write";
+       case CEPH_OSD_OP_DELETE: return "delete";
+       case CEPH_OSD_OP_TRUNCATE: return "truncate";
+       case CEPH_OSD_OP_ZERO: return "zero";
+       case CEPH_OSD_OP_WRITEFULL: return "writefull";
+       case CEPH_OSD_OP_ROLLBACK: return "rollback";
+
+       case CEPH_OSD_OP_APPEND: return "append";
+       case CEPH_OSD_OP_STARTSYNC: return "startsync";
+       case CEPH_OSD_OP_SETTRUNC: return "settrunc";
+       case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
+
+       case CEPH_OSD_OP_TMAPUP: return "tmapup";
+       case CEPH_OSD_OP_TMAPGET: return "tmapget";
+       case CEPH_OSD_OP_TMAPPUT: return "tmapput";
+
+       case CEPH_OSD_OP_GETXATTR: return "getxattr";
+       case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
+       case CEPH_OSD_OP_SETXATTR: return "setxattr";
+       case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
+       case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
+       case CEPH_OSD_OP_RMXATTR: return "rmxattr";
+       case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
+
+       case CEPH_OSD_OP_PULL: return "pull";
+       case CEPH_OSD_OP_PUSH: return "push";
+       case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
+       case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
+       case CEPH_OSD_OP_SCRUB: return "scrub";
+
+       case CEPH_OSD_OP_WRLOCK: return "wrlock";
+       case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
+       case CEPH_OSD_OP_RDLOCK: return "rdlock";
+       case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
+       case CEPH_OSD_OP_UPLOCK: return "uplock";
+       case CEPH_OSD_OP_DNLOCK: return "dnlock";
+
+       case CEPH_OSD_OP_CALL: return "call";
+
+       case CEPH_OSD_OP_PGLS: return "pgls";
+       }
+       return "???";
+}
+
+
+const char *ceph_pool_op_name(int op)
+{
+       switch (op) {
+       case POOL_OP_CREATE: return "create";
+       case POOL_OP_DELETE: return "delete";
+       case POOL_OP_AUID_CHANGE: return "auid change";
+       case POOL_OP_CREATE_SNAP: return "create snap";
+       case POOL_OP_DELETE_SNAP: return "delete snap";
+       case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
+       case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
+       }
+       return "???";
+}
diff --git a/fs/ceph/crush/crush.c b/net/ceph/crush/crush.c

similarity index 99%

rename from fs/ceph/crush/crush.c

rename to net/ceph/crush/crush.c

index fabd302e5779c0188b55916be81d7913dce84105..d6ebb13a18a4bc787eb249ad3e4b62ec1c174f97 100644 (file)
--- a/fs/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -8,7 +8,7 @@
  # define BUG_ON(x) assert(!(x))
  #endif
  
-#include "crush.h"
+#include <linux/crush/crush.h>
  
  const char *crush_bucket_alg_name(int alg)
  {
diff --git a/fs/ceph/crush/hash.c b/net/ceph/crush/hash.c

similarity index 99%

rename from fs/ceph/crush/hash.c

rename to net/ceph/crush/hash.c

index 5873aed694bf5a06a092102a2209c5ad71a9aff8..5bb63e37a8a10f3419a399b32339cf5baf279762 100644 (file)
--- a/fs/ceph/crush/hash.c
+++ b/net/ceph/crush/hash.c
@@ -1,6 +1,6 @@
  
  #include <linux/types.h>
-#include "hash.h"
+#include <linux/crush/hash.h>
  
  /*
   * Robert Jenkins' function for mixing 32-bit values
diff --git a/fs/ceph/crush/mapper.c b/net/ceph/crush/mapper.c

similarity index 99%

rename from fs/ceph/crush/mapper.c

rename to net/ceph/crush/mapper.c

index a4eec133258e80a880a8cecca0282c54e11982ac..42599e31dcad8a6ceb1f3b29d9171f6a249e56a3 100644 (file)
--- a/fs/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -18,8 +18,8 @@
  # define kfree(x) free(x)
  #endif
  
-#include "crush.h"
-#include "hash.h"
+#include <linux/crush/crush.h>
+#include <linux/crush/hash.h>
  
  /*
   * Implement the core CRUSH mapping algorithm.
diff --git a/fs/ceph/crypto.c b/net/ceph/crypto.c

similarity index 99%

rename from fs/ceph/crypto.c

rename to net/ceph/crypto.c

index a3e627f63293b2d329f620525328280a9ae15291..7b505b0c983f74eaffa51e306e38f2cd90aad3fb 100644 (file)
--- a/fs/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -1,13 +1,13 @@
  
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/err.h>
  #include <linux/scatterlist.h>
  #include <linux/slab.h>
  #include <crypto/hash.h>
  
+#include <linux/ceph/decode.h>
  #include "crypto.h"
-#include "decode.h"
  
  int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
  {
diff --git a/fs/ceph/crypto.h b/net/ceph/crypto.h

similarity index 95%

rename from fs/ceph/crypto.h

rename to net/ceph/crypto.h

index bdf38607323c2da2eb87d2c3ebf7b3d158bd7d0d..f9eccace592b63b10a68c8f7d5d99abe2b2eba55 100644 (file)
--- a/fs/ceph/crypto.h
+++ b/net/ceph/crypto.h
@@ -1,8 +1,8 @@
  #ifndef _FS_CEPH_CRYPTO_H
  #define _FS_CEPH_CRYPTO_H
  
-#include "types.h"
-#include "buffer.h"
+#include <linux/ceph/types.h>
+#include <linux/ceph/buffer.h>
  
  /*
   * cryptographic secret
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c

new file mode 100644 (file)

index 0000000..27d4ea3
--- /dev/null
+++ b/net/ceph/debugfs.c
@@ -0,0 +1,267 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
+
+#ifdef CONFIG_DEBUG_FS
+
+/*
+ * Implement /sys/kernel/debug/ceph fun
+ *
+ * /sys/kernel/debug/ceph/client*  - an instance of the ceph client
+ *      .../osdmap      - current osdmap
+ *      .../monmap      - current monmap
+ *      .../osdc        - active osd requests
+ *      .../monc        - mon client state
+ *      .../dentry_lru  - dump contents of dentry lru
+ *      .../caps        - expose cap (reservation) stats
+ *      .../bdi         - symlink to ../../bdi/something
+ */
+
+static struct dentry *ceph_debugfs_dir;
+
+static int monmap_show(struct seq_file *s, void *p)
+{
+       int i;
+       struct ceph_client *client = s->private;
+
+       if (client->monc.monmap == NULL)
+               return 0;
+
+       seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
+       for (i = 0; i < client->monc.monmap->num_mon; i++) {
+               struct ceph_entity_inst *inst =
+                       &client->monc.monmap->mon_inst[i];
+
+               seq_printf(s, "\t%s%lld\t%s\n",
+                          ENTITY_NAME(inst->name),
+                          ceph_pr_addr(&inst->addr.in_addr));
+       }
+       return 0;
+}
+
+static int osdmap_show(struct seq_file *s, void *p)
+{
+       int i;
+       struct ceph_client *client = s->private;
+       struct rb_node *n;
+
+       if (client->osdc.osdmap == NULL)
+               return 0;
+       seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
+       seq_printf(s, "flags%s%s\n",
+                  (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
+                  " NEARFULL" : "",
+                  (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
+                  " FULL" : "");
+       for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
+               struct ceph_pg_pool_info *pool =
+                       rb_entry(n, struct ceph_pg_pool_info, node);
+               seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
+                          pool->id, pool->v.pg_num, pool->pg_num_mask,
+                          pool->v.lpg_num, pool->lpg_num_mask);
+       }
+       for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
+               struct ceph_entity_addr *addr =
+                       &client->osdc.osdmap->osd_addr[i];
+               int state = client->osdc.osdmap->osd_state[i];
+               char sb[64];
+
+               seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
+                          i, ceph_pr_addr(&addr->in_addr),
+                          ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
+                          ceph_osdmap_state_str(sb, sizeof(sb), state));
+       }
+       return 0;
+}
+
+static int monc_show(struct seq_file *s, void *p)
+{
+       struct ceph_client *client = s->private;
+       struct ceph_mon_generic_request *req;
+       struct ceph_mon_client *monc = &client->monc;
+       struct rb_node *rp;
+
+       mutex_lock(&monc->mutex);
+
+       if (monc->have_mdsmap)
+               seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
+       if (monc->have_osdmap)
+               seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
+       if (monc->want_next_osdmap)
+               seq_printf(s, "want next osdmap\n");
+
+       for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
+               __u16 op;
+               req = rb_entry(rp, struct ceph_mon_generic_request, node);
+               op = le16_to_cpu(req->request->hdr.type);
+               if (op == CEPH_MSG_STATFS)
+                       seq_printf(s, "%lld statfs\n", req->tid);
+               else
+                       seq_printf(s, "%lld unknown\n", req->tid);
+       }
+
+       mutex_unlock(&monc->mutex);
+       return 0;
+}
+
+static int osdc_show(struct seq_file *s, void *pp)
+{
+       struct ceph_client *client = s->private;
+       struct ceph_osd_client *osdc = &client->osdc;
+       struct rb_node *p;
+
+       mutex_lock(&osdc->request_mutex);
+       for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+               struct ceph_osd_request *req;
+               struct ceph_osd_request_head *head;
+               struct ceph_osd_op *op;
+               int num_ops;
+               int opcode, olen;
+               int i;
+
+               req = rb_entry(p, struct ceph_osd_request, r_node);
+
+               seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
+                          req->r_osd ? req->r_osd->o_osd : -1,
+                          le32_to_cpu(req->r_pgid.pool),
+                          le16_to_cpu(req->r_pgid.ps));
+
+               head = req->r_request->front.iov_base;
+               op = (void *)(head + 1);
+
+               num_ops = le16_to_cpu(head->num_ops);
+               olen = le32_to_cpu(head->object_len);
+               seq_printf(s, "%.*s", olen,
+                          (const char *)(head->ops + num_ops));
+
+               if (req->r_reassert_version.epoch)
+                       seq_printf(s, "\t%u'%llu",
+                          (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
+                          le64_to_cpu(req->r_reassert_version.version));
+               else
+                       seq_printf(s, "\t");
+
+               for (i = 0; i < num_ops; i++) {
+                       opcode = le16_to_cpu(op->op);
+                       seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
+                       op++;
+               }
+
+               seq_printf(s, "\n");
+       }
+       mutex_unlock(&osdc->request_mutex);
+       return 0;
+}
+
+CEPH_DEFINE_SHOW_FUNC(monmap_show)
+CEPH_DEFINE_SHOW_FUNC(osdmap_show)
+CEPH_DEFINE_SHOW_FUNC(monc_show)
+CEPH_DEFINE_SHOW_FUNC(osdc_show)
+
+int ceph_debugfs_init(void)
+{
+       ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
+       if (!ceph_debugfs_dir)
+               return -ENOMEM;
+       return 0;
+}
+
+void ceph_debugfs_cleanup(void)
+{
+       debugfs_remove(ceph_debugfs_dir);
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+       int ret = -ENOMEM;
+       char name[80];
+
+       snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
+                client->monc.auth->global_id);
+
+       client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
+       if (!client->debugfs_dir)
+               goto out;
+
+       client->monc.debugfs_file = debugfs_create_file("monc",
+                                                     0600,
+                                                     client->debugfs_dir,
+                                                     client,
+                                                     &monc_show_fops);
+       if (!client->monc.debugfs_file)
+               goto out;
+
+       client->osdc.debugfs_file = debugfs_create_file("osdc",
+                                                     0600,
+                                                     client->debugfs_dir,
+                                                     client,
+                                                     &osdc_show_fops);
+       if (!client->osdc.debugfs_file)
+               goto out;
+
+       client->debugfs_monmap = debugfs_create_file("monmap",
+                                       0600,
+                                       client->debugfs_dir,
+                                       client,
+                                       &monmap_show_fops);
+       if (!client->debugfs_monmap)
+               goto out;
+
+       client->debugfs_osdmap = debugfs_create_file("osdmap",
+                                       0600,
+                                       client->debugfs_dir,
+                                       client,
+                                       &osdmap_show_fops);
+       if (!client->debugfs_osdmap)
+               goto out;
+
+       return 0;
+
+out:
+       ceph_debugfs_client_cleanup(client);
+       return ret;
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+       debugfs_remove(client->debugfs_osdmap);
+       debugfs_remove(client->debugfs_monmap);
+       debugfs_remove(client->osdc.debugfs_file);
+       debugfs_remove(client->monc.debugfs_file);
+       debugfs_remove(client->debugfs_dir);
+}
+
+#else  /* CONFIG_DEBUG_FS */
+
+int ceph_debugfs_init(void)
+{
+       return 0;
+}
+
+void ceph_debugfs_cleanup(void)
+{
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+       return 0;
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+}
+
+#endif  /* CONFIG_DEBUG_FS */
+
+EXPORT_SYMBOL(ceph_debugfs_init);
+EXPORT_SYMBOL(ceph_debugfs_cleanup);
diff --git a/fs/ceph/messenger.c b/net/ceph/messenger.c

similarity index 89%

rename from fs/ceph/messenger.c

rename to net/ceph/messenger.c

index 2502d76fcec175ddb8500bcd1863c52add1208e5..0e8157ee5d4382a32992f34c2b1cf0d78f2868b4 100644 (file)
--- a/fs/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/crc32c.h>
  #include <linux/ctype.h>
@@ -9,12 +9,14 @@
  #include <linux/slab.h>
  #include <linux/socket.h>
  #include <linux/string.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
  #include <net/tcp.h>
  
-#include "super.h"
-#include "messenger.h"
-#include "decode.h"
-#include "pagelist.h"
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/pagelist.h>
  
  /*
   * Ceph uses the messenger to exchange ceph_msg messages with other
@@ -48,7 +50,7 @@ static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
  static DEFINE_SPINLOCK(addr_str_lock);
  static int last_addr_str;
  
-const char *pr_addr(const struct sockaddr_storage *ss)
+const char *ceph_pr_addr(const struct sockaddr_storage *ss)
  {
         int i;
         char *s;
@@ -79,6 +81,7 @@ const char *pr_addr(const struct sockaddr_storage *ss)
  
         return s;
  }
+EXPORT_SYMBOL(ceph_pr_addr);
  
  static void encode_my_addr(struct ceph_messenger *msgr)
  {
@@ -91,7 +94,7 @@ static void encode_my_addr(struct ceph_messenger *msgr)
   */
  struct workqueue_struct *ceph_msgr_wq;
  
-int __init ceph_msgr_init(void)
+int ceph_msgr_init(void)
  {
         ceph_msgr_wq = create_workqueue("ceph-msgr");
         if (IS_ERR(ceph_msgr_wq)) {
@@ -102,16 +105,19 @@ int __init ceph_msgr_init(void)
         }
         return 0;
  }
+EXPORT_SYMBOL(ceph_msgr_init);
  
  void ceph_msgr_exit(void)
  {
         destroy_workqueue(ceph_msgr_wq);
  }
+EXPORT_SYMBOL(ceph_msgr_exit);
  
  void ceph_msgr_flush(void)
  {
         flush_workqueue(ceph_msgr_wq);
  }
+EXPORT_SYMBOL(ceph_msgr_flush);
  
  
  /*
@@ -221,19 +227,19 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)
  
         set_sock_callbacks(sock, con);
  
-       dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
+       dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr));
  
         ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
                                  O_NONBLOCK);
         if (ret == -EINPROGRESS) {
                 dout("connect %s EINPROGRESS sk_state = %u\n",
-                    pr_addr(&con->peer_addr.in_addr),
+                    ceph_pr_addr(&con->peer_addr.in_addr),
                      sock->sk->sk_state);
                 ret = 0;
         }
         if (ret < 0) {
                 pr_err("connect %s error %d\n",
-                      pr_addr(&con->peer_addr.in_addr), ret);
+                      ceph_pr_addr(&con->peer_addr.in_addr), ret);
                 sock_release(sock);
                 con->sock = NULL;
                 con->error_msg = "connect error";
@@ -334,7 +340,8 @@ static void reset_connection(struct ceph_connection *con)
   */
  void ceph_con_close(struct ceph_connection *con)
  {
-       dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
+       dout("con_close %p peer %s\n", con,
+            ceph_pr_addr(&con->peer_addr.in_addr));
         set_bit(CLOSED, &con->state);  /* in case there's queued work */
         clear_bit(STANDBY, &con->state);  /* avoid connect_seq bump */
         clear_bit(LOSSYTX, &con->state);  /* so we retry next connect */
@@ -347,19 +354,21 @@ void ceph_con_close(struct ceph_connection *con)
         mutex_unlock(&con->mutex);
         queue_con(con);
  }
+EXPORT_SYMBOL(ceph_con_close);
  
  /*
   * Reopen a closed connection, with a new peer address.
   */
  void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
  {
-       dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
+       dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
         set_bit(OPENING, &con->state);
         clear_bit(CLOSED, &con->state);
         memcpy(&con->peer_addr, addr, sizeof(*addr));
         con->delay = 0;      /* reset backoff memory */
         queue_con(con);
  }
+EXPORT_SYMBOL(ceph_con_open);
  
  /*
   * return true if this connection ever successfully opened
@@ -406,6 +415,7 @@ void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
         INIT_LIST_HEAD(&con->out_sent);
         INIT_DELAYED_WORK(&con->work, con_work);
  }
+EXPORT_SYMBOL(ceph_con_init);
  
  
  /*
@@ -529,8 +539,11 @@ static void prepare_write_message(struct ceph_connection *con)
         if (le32_to_cpu(m->hdr.data_len) > 0) {
                 /* initialize page iterator */
                 con->out_msg_pos.page = 0;
-               con->out_msg_pos.page_pos =
-                       le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
+               if (m->pages)
+                       con->out_msg_pos.page_pos =
+                               le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
+               else
+                       con->out_msg_pos.page_pos = 0;
                 con->out_msg_pos.data_pos = 0;
                 con->out_msg_pos.did_page_crc = 0;
                 con->out_more = 1;  /* data + footer will follow */
@@ -647,7 +660,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
         dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
              con->connect_seq, global_seq, proto);
  
-       con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED);
+       con->out_connect.features = cpu_to_le64(msgr->supported_features);
         con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
         con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
         con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -712,6 +725,31 @@ out:
         return ret;  /* done! */
  }
  
+#ifdef CONFIG_BLOCK
+static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg)
+{
+       if (!bio) {
+               *iter = NULL;
+               *seg = 0;
+               return;
+       }
+       *iter = bio;
+       *seg = bio->bi_idx;
+}
+
+static void iter_bio_next(struct bio **bio_iter, int *seg)
+{
+       if (*bio_iter == NULL)
+               return;
+
+       BUG_ON(*seg >= (*bio_iter)->bi_vcnt);
+
+       (*seg)++;
+       if (*seg == (*bio_iter)->bi_vcnt)
+               init_bio_iter((*bio_iter)->bi_next, bio_iter, seg);
+}
+#endif
+
  /*
   * Write as much message data payload as we can.  If we finish, queue
   * up the footer.
@@ -726,21 +764,46 @@ static int write_partial_msg_pages(struct ceph_connection *con)
         size_t len;
         int crc = con->msgr->nocrc;
         int ret;
+       int total_max_write;
+       int in_trail = 0;
+       size_t trail_len = (msg->trail ? msg->trail->length : 0);
  
         dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
              con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
              con->out_msg_pos.page_pos);
  
-       while (con->out_msg_pos.page < con->out_msg->nr_pages) {
+#ifdef CONFIG_BLOCK
+       if (msg->bio && !msg->bio_iter)
+               init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg);
+#endif
+
+       while (data_len > con->out_msg_pos.data_pos) {
                 struct page *page = NULL;
                 void *kaddr = NULL;
+               int max_write = PAGE_SIZE;
+               int page_shift = 0;
+
+               total_max_write = data_len - trail_len -
+                       con->out_msg_pos.data_pos;
  
                 /*
                  * if we are calculating the data crc (the default), we need
                  * to map the page.  if our pages[] has been revoked, use the
                  * zero page.
                  */
-               if (msg->pages) {
+
+               /* have we reached the trail part of the data? */
+               if (con->out_msg_pos.data_pos >= data_len - trail_len) {
+                       in_trail = 1;
+
+                       total_max_write = data_len - con->out_msg_pos.data_pos;
+
+                       page = list_first_entry(&msg->trail->head,
+                                               struct page, lru);
+                       if (crc)
+                               kaddr = kmap(page);
+                       max_write = PAGE_SIZE;
+               } else if (msg->pages) {
                         page = msg->pages[con->out_msg_pos.page];
                         if (crc)
                                 kaddr = kmap(page);
@@ -749,13 +812,25 @@ static int write_partial_msg_pages(struct ceph_connection *con)
                                                 struct page, lru);
                         if (crc)
                                 kaddr = kmap(page);
+#ifdef CONFIG_BLOCK
+               } else if (msg->bio) {
+                       struct bio_vec *bv;
+
+                       bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg);
+                       page = bv->bv_page;
+                       page_shift = bv->bv_offset;
+                       if (crc)
+                               kaddr = kmap(page) + page_shift;
+                       max_write = bv->bv_len;
+#endif
                 } else {
                         page = con->msgr->zero_page;
                         if (crc)
                                 kaddr = page_address(con->msgr->zero_page);
                 }
-               len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
-                         (int)(data_len - con->out_msg_pos.data_pos));
+               len = min_t(int, max_write - con->out_msg_pos.page_pos,
+                           total_max_write);
+
                 if (crc && !con->out_msg_pos.did_page_crc) {
                         void *base = kaddr + con->out_msg_pos.page_pos;
                         u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
@@ -765,13 +840,14 @@ static int write_partial_msg_pages(struct ceph_connection *con)
                                 cpu_to_le32(crc32c(tmpcrc, base, len));
                         con->out_msg_pos.did_page_crc = 1;
                 }
-
                 ret = kernel_sendpage(con->sock, page,
-                                     con->out_msg_pos.page_pos, len,
+                                     con->out_msg_pos.page_pos + page_shift,
+                                     len,
                                       MSG_DONTWAIT | MSG_NOSIGNAL |
                                       MSG_MORE);
  
-               if (crc && (msg->pages || msg->pagelist))
+               if (crc &&
+                   (msg->pages || msg->pagelist || msg->bio || in_trail))
                         kunmap(page);
  
                 if (ret <= 0)
@@ -783,9 +859,16 @@ static int write_partial_msg_pages(struct ceph_connection *con)
                         con->out_msg_pos.page_pos = 0;
                         con->out_msg_pos.page++;
                         con->out_msg_pos.did_page_crc = 0;
-                       if (msg->pagelist)
+                       if (in_trail)
+                               list_move_tail(&page->lru,
+                                              &msg->trail->head);
+                       else if (msg->pagelist)
                                 list_move_tail(&page->lru,
                                                &msg->pagelist->head);
+#ifdef CONFIG_BLOCK
+                       else if (msg->bio)
+                               iter_bio_next(&msg->bio_iter, &msg->bio_seg);
+#endif
                 }
         }
  
@@ -938,7 +1021,7 @@ static int verify_hello(struct ceph_connection *con)
  {
         if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
                 pr_err("connect to %s got bad banner\n",
-                      pr_addr(&con->peer_addr.in_addr));
+                      ceph_pr_addr(&con->peer_addr.in_addr));
                 con->error_msg = "protocol error, bad banner";
                 return -1;
         }
@@ -1041,7 +1124,7 @@ int ceph_parse_ips(const char *c, const char *end,
  
                 addr_set_port(ss, port);
  
-               dout("parse_ips got %s\n", pr_addr(ss));
+               dout("parse_ips got %s\n", ceph_pr_addr(ss));
  
                 if (p == end)
                         break;
@@ -1061,6 +1144,7 @@ bad:
         pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
         return -EINVAL;
  }
+EXPORT_SYMBOL(ceph_parse_ips);
  
  static int process_banner(struct ceph_connection *con)
  {
@@ -1082,9 +1166,9 @@ static int process_banner(struct ceph_connection *con)
             !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
               con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
                 pr_warning("wrong peer, want %s/%d, got %s/%d\n",
-                          pr_addr(&con->peer_addr.in_addr),
+                          ceph_pr_addr(&con->peer_addr.in_addr),
                            (int)le32_to_cpu(con->peer_addr.nonce),
-                          pr_addr(&con->actual_peer_addr.in_addr),
+                          ceph_pr_addr(&con->actual_peer_addr.in_addr),
                            (int)le32_to_cpu(con->actual_peer_addr.nonce));
                 con->error_msg = "wrong peer at address";
                 return -1;
@@ -1102,7 +1186,7 @@ static int process_banner(struct ceph_connection *con)
                 addr_set_port(&con->msgr->inst.addr.in_addr, port);
                 encode_my_addr(con->msgr);
                 dout("process_banner learned my addr is %s\n",
-                    pr_addr(&con->msgr->inst.addr.in_addr));
+                    ceph_pr_addr(&con->msgr->inst.addr.in_addr));
         }
  
         set_bit(NEGOTIATING, &con->state);
@@ -1123,8 +1207,8 @@ static void fail_protocol(struct ceph_connection *con)
  
  static int process_connect(struct ceph_connection *con)
  {
-       u64 sup_feat = CEPH_FEATURE_SUPPORTED;
-       u64 req_feat = CEPH_FEATURE_REQUIRED;
+       u64 sup_feat = con->msgr->supported_features;
+       u64 req_feat = con->msgr->required_features;
         u64 server_feat = le64_to_cpu(con->in_reply.features);
  
         dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
@@ -1134,7 +1218,7 @@ static int process_connect(struct ceph_connection *con)
                 pr_err("%s%lld %s feature set mismatch,"
                        " my %llx < server's %llx, missing %llx\n",
                        ENTITY_NAME(con->peer_name),
-                      pr_addr(&con->peer_addr.in_addr),
+                      ceph_pr_addr(&con->peer_addr.in_addr),
                        sup_feat, server_feat, server_feat & ~sup_feat);
                 con->error_msg = "missing required protocol features";
                 fail_protocol(con);
@@ -1144,7 +1228,7 @@ static int process_connect(struct ceph_connection *con)
                 pr_err("%s%lld %s protocol version mismatch,"
                        " my %d != server's %d\n",
                        ENTITY_NAME(con->peer_name),
-                      pr_addr(&con->peer_addr.in_addr),
+                      ceph_pr_addr(&con->peer_addr.in_addr),
                        le32_to_cpu(con->out_connect.protocol_version),
                        le32_to_cpu(con->in_reply.protocol_version));
                 con->error_msg = "protocol version mismatch";
@@ -1178,7 +1262,7 @@ static int process_connect(struct ceph_connection *con)
                      le32_to_cpu(con->in_connect.connect_seq));
                 pr_err("%s%lld %s connection reset\n",
                        ENTITY_NAME(con->peer_name),
-                      pr_addr(&con->peer_addr.in_addr));
+                      ceph_pr_addr(&con->peer_addr.in_addr));
                 reset_connection(con);
                 prepare_write_connect(con->msgr, con, 0);
                 prepare_read_connect(con);
@@ -1223,7 +1307,7 @@ static int process_connect(struct ceph_connection *con)
                         pr_err("%s%lld %s protocol feature mismatch,"
                                " my required %llx > server's %llx, need %llx\n",
                                ENTITY_NAME(con->peer_name),
-                              pr_addr(&con->peer_addr.in_addr),
+                              ceph_pr_addr(&con->peer_addr.in_addr),
                                req_feat, server_feat, req_feat & ~server_feat);
                         con->error_msg = "missing required protocol features";
                         fail_protocol(con);
@@ -1305,8 +1389,7 @@ static int read_partial_message_section(struct ceph_connection *con,
                                         struct kvec *section,
                                         unsigned int sec_len, u32 *crc)
  {
-       int left;
-       int ret;
+       int ret, left;
  
         BUG_ON(!section);
  
@@ -1329,13 +1412,83 @@ static int read_partial_message_section(struct ceph_connection *con,
  static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
                                 struct ceph_msg_header *hdr,
                                 int *skip);
+
+
+static int read_partial_message_pages(struct ceph_connection *con,
+                                     struct page **pages,
+                                     unsigned data_len, int datacrc)
+{
+       void *p;
+       int ret;
+       int left;
+
+       left = min((int)(data_len - con->in_msg_pos.data_pos),
+                  (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
+       /* (page) data */
+       BUG_ON(pages == NULL);
+       p = kmap(pages[con->in_msg_pos.page]);
+       ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
+                              left);
+       if (ret > 0 && datacrc)
+               con->in_data_crc =
+                       crc32c(con->in_data_crc,
+                                 p + con->in_msg_pos.page_pos, ret);
+       kunmap(pages[con->in_msg_pos.page]);
+       if (ret <= 0)
+               return ret;
+       con->in_msg_pos.data_pos += ret;
+       con->in_msg_pos.page_pos += ret;
+       if (con->in_msg_pos.page_pos == PAGE_SIZE) {
+               con->in_msg_pos.page_pos = 0;
+               con->in_msg_pos.page++;
+       }
+
+       return ret;
+}
+
+#ifdef CONFIG_BLOCK
+static int read_partial_message_bio(struct ceph_connection *con,
+                                   struct bio **bio_iter, int *bio_seg,
+                                   unsigned data_len, int datacrc)
+{
+       struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg);
+       void *p;
+       int ret, left;
+
+       if (IS_ERR(bv))
+               return PTR_ERR(bv);
+
+       left = min((int)(data_len - con->in_msg_pos.data_pos),
+                  (int)(bv->bv_len - con->in_msg_pos.page_pos));
+
+       p = kmap(bv->bv_page) + bv->bv_offset;
+
+       ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
+                              left);
+       if (ret > 0 && datacrc)
+               con->in_data_crc =
+                       crc32c(con->in_data_crc,
+                                 p + con->in_msg_pos.page_pos, ret);
+       kunmap(bv->bv_page);
+       if (ret <= 0)
+               return ret;
+       con->in_msg_pos.data_pos += ret;
+       con->in_msg_pos.page_pos += ret;
+       if (con->in_msg_pos.page_pos == bv->bv_len) {
+               con->in_msg_pos.page_pos = 0;
+               iter_bio_next(bio_iter, bio_seg);
+       }
+
+       return ret;
+}
+#endif
+
  /*
   * read (part of) a message.
   */
  static int read_partial_message(struct ceph_connection *con)
  {
         struct ceph_msg *m = con->in_msg;
-       void *p;
         int ret;
         int to, left;
         unsigned front_len, middle_len, data_len, data_off;
@@ -1381,7 +1534,7 @@ static int read_partial_message(struct ceph_connection *con)
         if ((s64)seq - (s64)con->in_seq < 1) {
                 pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
                         ENTITY_NAME(con->peer_name),
-                       pr_addr(&con->peer_addr.in_addr),
+                       ceph_pr_addr(&con->peer_addr.in_addr),
                         seq, con->in_seq + 1);
                 con->in_base_pos = -front_len - middle_len - data_len -
                         sizeof(m->footer);
@@ -1422,7 +1575,10 @@ static int read_partial_message(struct ceph_connection *con)
                         m->middle->vec.iov_len = 0;
  
                 con->in_msg_pos.page = 0;
-               con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
+               if (m->pages)
+                       con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
+               else
+                       con->in_msg_pos.page_pos = 0;
                 con->in_msg_pos.data_pos = 0;
         }
  
@@ -1440,27 +1596,29 @@ static int read_partial_message(struct ceph_connection *con)
                 if (ret <= 0)
                         return ret;
         }
+#ifdef CONFIG_BLOCK
+       if (m->bio && !m->bio_iter)
+               init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
+#endif
  
         /* (page) data */
         while (con->in_msg_pos.data_pos < data_len) {
-               left = min((int)(data_len - con->in_msg_pos.data_pos),
-                          (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
-               BUG_ON(m->pages == NULL);
-               p = kmap(m->pages[con->in_msg_pos.page]);
-               ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
-                                      left);
-               if (ret > 0 && datacrc)
-                       con->in_data_crc =
-                               crc32c(con->in_data_crc,
-                                         p + con->in_msg_pos.page_pos, ret);
-               kunmap(m->pages[con->in_msg_pos.page]);
-               if (ret <= 0)
-                       return ret;
-               con->in_msg_pos.data_pos += ret;
-               con->in_msg_pos.page_pos += ret;
-               if (con->in_msg_pos.page_pos == PAGE_SIZE) {
-                       con->in_msg_pos.page_pos = 0;
-                       con->in_msg_pos.page++;
+               if (m->pages) {
+                       ret = read_partial_message_pages(con, m->pages,
+                                                data_len, datacrc);
+                       if (ret <= 0)
+                               return ret;
+#ifdef CONFIG_BLOCK
+               } else if (m->bio) {
+
+                       ret = read_partial_message_bio(con,
+                                                &m->bio_iter, &m->bio_seg,
+                                                data_len, datacrc);
+                       if (ret <= 0)
+                               return ret;
+#endif
+               } else {
+                       BUG_ON(1);
                 }
         }
  
@@ -1874,9 +2032,9 @@ out:
  static void ceph_fault(struct ceph_connection *con)
  {
         pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
-              pr_addr(&con->peer_addr.in_addr), con->error_msg);
+              ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
         dout("fault %p state %lu to peer %s\n",
-            con, con->state, pr_addr(&con->peer_addr.in_addr));
+            con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
  
         if (test_bit(LOSSYTX, &con->state)) {
                 dout("fault on LOSSYTX channel\n");
@@ -1936,7 +2094,9 @@ out:
  /*
   * create a new messenger instance
   */
-struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
+struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr,
+                                            u32 supported_features,
+                                            u32 required_features)
  {
         struct ceph_messenger *msgr;
  
@@ -1944,6 +2104,9 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
         if (msgr == NULL)
                 return ERR_PTR(-ENOMEM);
  
+       msgr->supported_features = supported_features;
+       msgr->required_features = required_features;
+
         spin_lock_init(&msgr->global_seq_lock);
  
         /* the zero page is needed if a request is "canceled" while the message
@@ -1966,6 +2129,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
         dout("messenger_create %p\n", msgr);
         return msgr;
  }
+EXPORT_SYMBOL(ceph_messenger_create);
  
  void ceph_messenger_destroy(struct ceph_messenger *msgr)
  {
@@ -1975,6 +2139,7 @@ void ceph_messenger_destroy(struct ceph_messenger *msgr)
         kfree(msgr);
         dout("destroyed messenger %p\n", msgr);
  }
+EXPORT_SYMBOL(ceph_messenger_destroy);
  
  /*
   * Queue up an outgoing message on the given connection.
@@ -2011,6 +2176,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
         if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
                 queue_con(con);
  }
+EXPORT_SYMBOL(ceph_con_send);
  
  /*
   * Revoke a message that was previously queued for send
@@ -2076,6 +2242,7 @@ void ceph_con_keepalive(struct ceph_connection *con)
             test_and_set_bit(WRITE_PENDING, &con->state) == 0)
                 queue_con(con);
  }
+EXPORT_SYMBOL(ceph_con_keepalive);
  
  
  /*
@@ -2136,6 +2303,10 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
         m->nr_pages = 0;
         m->pages = NULL;
         m->pagelist = NULL;
+       m->bio = NULL;
+       m->bio_iter = NULL;
+       m->bio_seg = 0;
+       m->trail = NULL;
  
         dout("ceph_msg_new %p front %d\n", m, front_len);
         return m;
@@ -2146,6 +2317,7 @@ out:
         pr_err("msg_new can't create type %d front %d\n", type, front_len);
         return NULL;
  }
+EXPORT_SYMBOL(ceph_msg_new);
  
  /*
   * Allocate "middle" portion of a message, if it is needed and wasn't
@@ -2250,11 +2422,14 @@ void ceph_msg_last_put(struct kref *kref)
                 m->pagelist = NULL;
         }
  
+       m->trail = NULL;
+
         if (m->pool)
                 ceph_msgpool_put(m->pool, m);
         else
                 ceph_msg_kfree(m);
  }
+EXPORT_SYMBOL(ceph_msg_last_put);
  
  void ceph_msg_dump(struct ceph_msg *msg)
  {
@@ -2275,3 +2450,4 @@ void ceph_msg_dump(struct ceph_msg *msg)
                        DUMP_PREFIX_OFFSET, 16, 1,
                        &msg->footer, sizeof(msg->footer), true);
  }
+EXPORT_SYMBOL(ceph_msg_dump);
diff --git a/fs/ceph/mon_client.c b/net/ceph/mon_client.c

similarity index 94%

rename from fs/ceph/mon_client.c

rename to net/ceph/mon_client.c

index b2a5a3e4a671c336fc7495d473792ddc25f71110..8a079399174a27fc0c86e50e0891318cbb33cf97 100644 (file)
--- a/fs/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -1,14 +1,16 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
+#include <linux/module.h>
  #include <linux/types.h>
  #include <linux/slab.h>
  #include <linux/random.h>
  #include <linux/sched.h>
  
-#include "mon_client.h"
-#include "super.h"
-#include "auth.h"
-#include "decode.h"
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/decode.h>
+
+#include <linux/ceph/auth.h>
  
  /*
   * Interact with Ceph monitor cluster.  Handle requests for new map
@@ -74,7 +76,7 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
              m->num_mon);
         for (i = 0; i < m->num_mon; i++)
                 dout("monmap_decode  mon%d is %s\n", i,
-                    pr_addr(&m->mon_inst[i].addr.in_addr));
+                    ceph_pr_addr(&m->mon_inst[i].addr.in_addr));
         return m;
  
  bad:
@@ -191,30 +193,33 @@ static void __send_subscribe(struct ceph_mon_client *monc)
                 struct ceph_msg *msg = monc->m_subscribe;
                 struct ceph_mon_subscribe_item *i;
                 void *p, *end;
+               int num;
  
                 p = msg->front.iov_base;
                 end = p + msg->front_max;
  
-               dout("__send_subscribe to 'mdsmap' %u+\n",
-                    (unsigned)monc->have_mdsmap);
+               num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
+               ceph_encode_32(&p, num);
+
                 if (monc->want_next_osdmap) {
                         dout("__send_subscribe to 'osdmap' %u\n",
                              (unsigned)monc->have_osdmap);
-                       ceph_encode_32(&p, 3);
                         ceph_encode_string(&p, end, "osdmap", 6);
                         i = p;
                         i->have = cpu_to_le64(monc->have_osdmap);
                         i->onetime = 1;
                         p += sizeof(*i);
                         monc->want_next_osdmap = 2;  /* requested */
-               } else {
-                       ceph_encode_32(&p, 2);
                 }
-               ceph_encode_string(&p, end, "mdsmap", 6);
-               i = p;
-               i->have = cpu_to_le64(monc->have_mdsmap);
-               i->onetime = 0;
-               p += sizeof(*i);
+               if (monc->want_mdsmap) {
+                       dout("__send_subscribe to 'mdsmap' %u+\n",
+                            (unsigned)monc->have_mdsmap);
+                       ceph_encode_string(&p, end, "mdsmap", 6);
+                       i = p;
+                       i->have = cpu_to_le64(monc->have_mdsmap);
+                       i->onetime = 0;
+                       p += sizeof(*i);
+               }
                 ceph_encode_string(&p, end, "monmap", 6);
                 i = p;
                 i->have = 0;
@@ -243,7 +248,8 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc,
         mutex_lock(&monc->mutex);
         if (monc->hunting) {
                 pr_info("mon%d %s session established\n",
-                       monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
+                       monc->cur_mon,
+                       ceph_pr_addr(&monc->con->peer_addr.in_addr));
                 monc->hunting = false;
         }
         dout("handle_subscribe_ack after %d seconds\n", seconds);
@@ -266,6 +272,7 @@ int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
         mutex_unlock(&monc->mutex);
         return 0;
  }
+EXPORT_SYMBOL(ceph_monc_got_mdsmap);
  
  int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
  {
@@ -310,6 +317,7 @@ int ceph_monc_open_session(struct ceph_mon_client *monc)
         mutex_unlock(&monc->mutex);
         return 0;
  }
+EXPORT_SYMBOL(ceph_monc_open_session);
  
  /*
   * The monitor responds with mount ack indicate mount success.  The
@@ -540,6 +548,7 @@ out:
         kref_put(&req->kref, release_generic_request);
         return err;
  }
+EXPORT_SYMBOL(ceph_monc_do_statfs);
  
  /*
   * pool ops
@@ -651,6 +660,7 @@ int ceph_monc_create_snapid(struct ceph_mon_client *monc,
                                    pool, 0, (char *)snapid, sizeof(*snapid));
  
  }
+EXPORT_SYMBOL(ceph_monc_create_snapid);
  
  int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
                             u32 pool, u64 snapid)
@@ -708,9 +718,9 @@ static void delayed_work(struct work_struct *work)
   */
  static int build_initial_monmap(struct ceph_mon_client *monc)
  {
-       struct ceph_mount_args *args = monc->client->mount_args;
-       struct ceph_entity_addr *mon_addr = args->mon_addr;
-       int num_mon = args->num_mon;
+       struct ceph_options *opt = monc->client->options;
+       struct ceph_entity_addr *mon_addr = opt->mon_addr;
+       int num_mon = opt->num_mon;
         int i;
  
         /* build initial monmap */
@@ -728,11 +738,6 @@ static int build_initial_monmap(struct ceph_mon_client *monc)
         }
         monc->monmap->num_mon = num_mon;
         monc->have_fsid = false;
-
-       /* release addr memory */
-       kfree(args->mon_addr);
-       args->mon_addr = NULL;
-       args->num_mon = 0;
         return 0;
  }
  
@@ -753,8 +758,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
         monc->con = NULL;
  
         /* authentication */
-       monc->auth = ceph_auth_init(cl->mount_args->name,
-                                   cl->mount_args->secret);
+       monc->auth = ceph_auth_init(cl->options->name,
+                                   cl->options->secret);
         if (IS_ERR(monc->auth))
                 return PTR_ERR(monc->auth);
         monc->auth->want_keys =
@@ -808,6 +813,7 @@ out_monmap:
  out:
         return err;
  }
+EXPORT_SYMBOL(ceph_monc_init);
  
  void ceph_monc_stop(struct ceph_mon_client *monc)
  {
@@ -832,6 +838,7 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
  
         kfree(monc->monmap);
  }
+EXPORT_SYMBOL(ceph_monc_stop);
  
  static void handle_auth_reply(struct ceph_mon_client *monc,
                               struct ceph_msg *msg)
@@ -889,6 +896,7 @@ int ceph_monc_validate_auth(struct ceph_mon_client *monc)
         mutex_unlock(&monc->mutex);
         return ret;
  }
+EXPORT_SYMBOL(ceph_monc_validate_auth);
  
  /*
   * handle incoming message
@@ -922,15 +930,16 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
                 ceph_monc_handle_map(monc, msg);
                 break;
  
-       case CEPH_MSG_MDS_MAP:
-               ceph_mdsc_handle_map(&monc->client->mdsc, msg);
-               break;
-
         case CEPH_MSG_OSD_MAP:
                 ceph_osdc_handle_map(&monc->client->osdc, msg);
                 break;
  
         default:
+               /* can the chained handler handle it? */
+               if (monc->client->extra_mon_dispatch &&
+                   monc->client->extra_mon_dispatch(monc->client, msg) == 0)
+                       break;
+                       
                 pr_err("received unknown message type %d %s\n", type,
                        ceph_msg_type_name(type));
         }
@@ -994,7 +1003,7 @@ static void mon_fault(struct ceph_connection *con)
         if (monc->con && !monc->hunting)
                 pr_info("mon%d %s session lost, "
                         "hunting for new mon\n", monc->cur_mon,
-                       pr_addr(&monc->con->peer_addr.in_addr));
+                       ceph_pr_addr(&monc->con->peer_addr.in_addr));
  
         __close_session(monc);
         if (!monc->hunting) {
diff --git a/fs/ceph/msgpool.c b/net/ceph/msgpool.c

similarity index 95%

rename from fs/ceph/msgpool.c

rename to net/ceph/msgpool.c

index dd65a6438131c5f08cbe97085b6b1ba3f671076d..d5f2d97ac05caf40124fe4f628efe77a514d8e55 100644 (file)
--- a/fs/ceph/msgpool.c
+++ b/net/ceph/msgpool.c
@@ -1,11 +1,11 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/err.h>
  #include <linux/sched.h>
  #include <linux/types.h>
  #include <linux/vmalloc.h>
  
-#include "msgpool.h"
+#include <linux/ceph/msgpool.h>
  
  static void *alloc_fn(gfp_t gfp_mask, void *arg)
  {
diff --git a/fs/ceph/osd_client.c b/net/ceph/osd_client.c

similarity index 84%

rename from fs/ceph/osd_client.c

rename to net/ceph/osd_client.c

index 3b5571b8ce22e2ef8abdcf3d2dd4da1598bf6233..79391994b3edee7d531b823c0720262090db6618 100644 (file)
--- a/fs/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1,17 +1,22 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
+#include <linux/module.h>
  #include <linux/err.h>
  #include <linux/highmem.h>
  #include <linux/mm.h>
  #include <linux/pagemap.h>
  #include <linux/slab.h>
  #include <linux/uaccess.h>
+#ifdef CONFIG_BLOCK
+#include <linux/bio.h>
+#endif
  
-#include "super.h"
-#include "osd_client.h"
-#include "messenger.h"
-#include "decode.h"
-#include "auth.h"
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osd_client.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/pagelist.h>
  
  #define OSD_OP_FRONT_LEN       4096
  #define OSD_OPREPLY_FRONT_LEN  512
@@ -22,6 +27,59 @@ static int __kick_requests(struct ceph_osd_client *osdc,
  
  static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
  
+static int op_needs_trail(int op)
+{
+       switch (op) {
+       case CEPH_OSD_OP_GETXATTR:
+       case CEPH_OSD_OP_SETXATTR:
+       case CEPH_OSD_OP_CMPXATTR:
+       case CEPH_OSD_OP_CALL:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+static int op_has_extent(int op)
+{
+       return (op == CEPH_OSD_OP_READ ||
+               op == CEPH_OSD_OP_WRITE);
+}
+
+void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
+                       struct ceph_file_layout *layout,
+                       u64 snapid,
+                       u64 off, u64 *plen, u64 *bno,
+                       struct ceph_osd_request *req,
+                       struct ceph_osd_req_op *op)
+{
+       struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
+       u64 orig_len = *plen;
+       u64 objoff, objlen;    /* extent in object */
+
+       reqhead->snapid = cpu_to_le64(snapid);
+
+       /* object extent? */
+       ceph_calc_file_object_mapping(layout, off, plen, bno,
+                                     &objoff, &objlen);
+       if (*plen < orig_len)
+               dout(" skipping last %llu, final file extent %llu~%llu\n",
+                    orig_len - *plen, off, *plen);
+
+       if (op_has_extent(op->op)) {
+               op->extent.offset = objoff;
+               op->extent.length = objlen;
+       }
+       req->r_num_pages = calc_pages_for(off, *plen);
+       if (op->op == CEPH_OSD_OP_WRITE)
+               op->payload_len = *plen;
+
+       dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
+            *bno, objoff, objlen, req->r_num_pages);
+
+}
+EXPORT_SYMBOL(ceph_calc_raw_layout);
+
  /*
   * Implement client access to distributed object storage cluster.
   *
@@ -48,34 +106,19 @@ static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
   * fill osd op in request message.
   */
  static void calc_layout(struct ceph_osd_client *osdc,
-                       struct ceph_vino vino, struct ceph_file_layout *layout,
+                       struct ceph_vino vino,
+                       struct ceph_file_layout *layout,
                         u64 off, u64 *plen,
-                       struct ceph_osd_request *req)
+                       struct ceph_osd_request *req,
+                       struct ceph_osd_req_op *op)
  {
-       struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
-       struct ceph_osd_op *op = (void *)(reqhead + 1);
-       u64 orig_len = *plen;
-       u64 objoff, objlen;    /* extent in object */
         u64 bno;
  
-       reqhead->snapid = cpu_to_le64(vino.snap);
-
-       /* object extent? */
-       ceph_calc_file_object_mapping(layout, off, plen, &bno,
-                                     &objoff, &objlen);
-       if (*plen < orig_len)
-               dout(" skipping last %llu, final file extent %llu~%llu\n",
-                    orig_len - *plen, off, *plen);
+       ceph_calc_raw_layout(osdc, layout, vino.snap, off,
+                            plen, &bno, req, op);
  
         sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
         req->r_oid_len = strlen(req->r_oid);
-
-       op->extent.offset = cpu_to_le64(objoff);
-       op->extent.length = cpu_to_le64(objlen);
-       req->r_num_pages = calc_pages_for(off, *plen);
-
-       dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
-            req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
  }
  
  /*
@@ -101,56 +144,66 @@ void ceph_osdc_release_request(struct kref *kref)
         if (req->r_own_pages)
                 ceph_release_page_vector(req->r_pages,
                                          req->r_num_pages);
+#ifdef CONFIG_BLOCK
+       if (req->r_bio)
+               bio_put(req->r_bio);
+#endif
         ceph_put_snap_context(req->r_snapc);
+       if (req->r_trail) {
+               ceph_pagelist_release(req->r_trail);
+               kfree(req->r_trail);
+       }
         if (req->r_mempool)
                 mempool_free(req, req->r_osdc->req_mempool);
         else
                 kfree(req);
  }
+EXPORT_SYMBOL(ceph_osdc_release_request);
  
-/*
- * build new request AND message, calculate layout, and adjust file
- * extent as needed.
- *
- * if the file was recently truncated, we include information about its
- * old and new size so that the object can be updated appropriately.  (we
- * avoid synchronously deleting truncated objects because it's slow.)
- *
- * if @do_sync, include a 'startsync' command so that the osd will flush
- * data quickly.
- */
-struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
-                                              struct ceph_file_layout *layout,
-                                              struct ceph_vino vino,
-                                              u64 off, u64 *plen,
-                                              int opcode, int flags,
+static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail)
+{
+       int i = 0;
+
+       if (needs_trail)
+               *needs_trail = 0;
+       while (ops[i].op) {
+               if (needs_trail && op_needs_trail(ops[i].op))
+                       *needs_trail = 1;
+               i++;
+       }
+
+       return i;
+}
+
+struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
+                                              int flags,
                                                struct ceph_snap_context *snapc,
-                                              int do_sync,
-                                              u32 truncate_seq,
-                                              u64 truncate_size,
-                                              struct timespec *mtime,
-                                              bool use_mempool, int num_reply)
+                                              struct ceph_osd_req_op *ops,
+                                              bool use_mempool,
+                                              gfp_t gfp_flags,
+                                              struct page **pages,
+                                              struct bio *bio)
  {
         struct ceph_osd_request *req;
         struct ceph_msg *msg;
-       struct ceph_osd_request_head *head;
-       struct ceph_osd_op *op;
-       void *p;
-       int num_op = 1 + do_sync;
-       size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
-       int i;
+       int needs_trail;
+       int num_op = get_num_ops(ops, &needs_trail);
+       size_t msg_size = sizeof(struct ceph_osd_request_head);
+
+       msg_size += num_op*sizeof(struct ceph_osd_op);
  
         if (use_mempool) {
-               req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
+               req = mempool_alloc(osdc->req_mempool, gfp_flags);
                 memset(req, 0, sizeof(*req));
         } else {
-               req = kzalloc(sizeof(*req), GFP_NOFS);
+               req = kzalloc(sizeof(*req), gfp_flags);
         }
         if (req == NULL)
                 return NULL;
  
         req->r_osdc = osdc;
         req->r_mempool = use_mempool;
+
         kref_init(&req->r_kref);
         init_completion(&req->r_completion);
         init_completion(&req->r_safe_completion);
@@ -164,13 +217,22 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
                 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
         else
                 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
-                                  OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
+                                  OSD_OPREPLY_FRONT_LEN, gfp_flags);
         if (!msg) {
                 ceph_osdc_put_request(req);
                 return NULL;
         }
         req->r_reply = msg;
  
+       /* allocate space for the trailing data */
+       if (needs_trail) {
+               req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags);
+               if (!req->r_trail) {
+                       ceph_osdc_put_request(req);
+                       return NULL;
+               }
+               ceph_pagelist_init(req->r_trail);
+       }
         /* create request message; allow space for oid */
         msg_size += 40;
         if (snapc)
@@ -178,18 +240,115 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
         if (use_mempool)
                 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
         else
-               msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
+               msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags);
         if (!msg) {
                 ceph_osdc_put_request(req);
                 return NULL;
         }
+
         msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
         memset(msg->front.iov_base, 0, msg->front.iov_len);
+
+       req->r_request = msg;
+       req->r_pages = pages;
+#ifdef CONFIG_BLOCK
+       if (bio) {
+               req->r_bio = bio;
+               bio_get(req->r_bio);
+       }
+#endif
+
+       return req;
+}
+EXPORT_SYMBOL(ceph_osdc_alloc_request);
+
+static void osd_req_encode_op(struct ceph_osd_request *req,
+                             struct ceph_osd_op *dst,
+                             struct ceph_osd_req_op *src)
+{
+       dst->op = cpu_to_le16(src->op);
+
+       switch (dst->op) {
+       case CEPH_OSD_OP_READ:
+       case CEPH_OSD_OP_WRITE:
+               dst->extent.offset =
+                       cpu_to_le64(src->extent.offset);
+               dst->extent.length =
+                       cpu_to_le64(src->extent.length);
+               dst->extent.truncate_size =
+                       cpu_to_le64(src->extent.truncate_size);
+               dst->extent.truncate_seq =
+                       cpu_to_le32(src->extent.truncate_seq);
+               break;
+
+       case CEPH_OSD_OP_GETXATTR:
+       case CEPH_OSD_OP_SETXATTR:
+       case CEPH_OSD_OP_CMPXATTR:
+               BUG_ON(!req->r_trail);
+
+               dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
+               dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
+               dst->xattr.cmp_op = src->xattr.cmp_op;
+               dst->xattr.cmp_mode = src->xattr.cmp_mode;
+               ceph_pagelist_append(req->r_trail, src->xattr.name,
+                                    src->xattr.name_len);
+               ceph_pagelist_append(req->r_trail, src->xattr.val,
+                                    src->xattr.value_len);
+               break;
+       case CEPH_OSD_OP_CALL:
+               BUG_ON(!req->r_trail);
+
+               dst->cls.class_len = src->cls.class_len;
+               dst->cls.method_len = src->cls.method_len;
+               dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
+
+               ceph_pagelist_append(req->r_trail, src->cls.class_name,
+                                    src->cls.class_len);
+               ceph_pagelist_append(req->r_trail, src->cls.method_name,
+                                    src->cls.method_len);
+               ceph_pagelist_append(req->r_trail, src->cls.indata,
+                                    src->cls.indata_len);
+               break;
+       case CEPH_OSD_OP_ROLLBACK:
+               dst->snap.snapid = cpu_to_le64(src->snap.snapid);
+               break;
+       case CEPH_OSD_OP_STARTSYNC:
+               break;
+       default:
+               pr_err("unrecognized osd opcode %d\n", dst->op);
+               WARN_ON(1);
+               break;
+       }
+       dst->payload_len = cpu_to_le32(src->payload_len);
+}
+
+/*
+ * build new request AND message
+ *
+ */
+void ceph_osdc_build_request(struct ceph_osd_request *req,
+                            u64 off, u64 *plen,
+                            struct ceph_osd_req_op *src_ops,
+                            struct ceph_snap_context *snapc,
+                            struct timespec *mtime,
+                            const char *oid,
+                            int oid_len)
+{
+       struct ceph_msg *msg = req->r_request;
+       struct ceph_osd_request_head *head;
+       struct ceph_osd_req_op *src_op;
+       struct ceph_osd_op *op;
+       void *p;
+       int num_op = get_num_ops(src_ops, NULL);
+       size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
+       int flags = req->r_flags;
+       u64 data_len = 0;
+       int i;
+
         head = msg->front.iov_base;
         op = (void *)(head + 1);
         p = (void *)(op + num_op);
  
-       req->r_request = msg;
         req->r_snapc = ceph_get_snap_context(snapc);
  
         head->client_inc = cpu_to_le32(1); /* always, for now. */
@@ -197,29 +356,23 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
         if (flags & CEPH_OSD_FLAG_WRITE)
                 ceph_encode_timespec(&head->mtime, mtime);
         head->num_ops = cpu_to_le16(num_op);
-       op->op = cpu_to_le16(opcode);
  
-       /* calculate max write size */
-       calc_layout(osdc, vino, layout, off, plen, req);
-       req->r_file_layout = *layout;  /* keep a copy */
-
-       if (flags & CEPH_OSD_FLAG_WRITE) {
-               req->r_request->hdr.data_off = cpu_to_le16(off);
-               req->r_request->hdr.data_len = cpu_to_le32(*plen);
-               op->payload_len = cpu_to_le32(*plen);
-       }
-       op->extent.truncate_size = cpu_to_le64(truncate_size);
-       op->extent.truncate_seq = cpu_to_le32(truncate_seq);
  
         /* fill in oid */
-       head->object_len = cpu_to_le32(req->r_oid_len);
-       memcpy(p, req->r_oid, req->r_oid_len);
-       p += req->r_oid_len;
-
-       if (do_sync) {
+       head->object_len = cpu_to_le32(oid_len);
+       memcpy(p, oid, oid_len);
+       p += oid_len;
+
+       src_op = src_ops;
+       while (src_op->op) {
+               osd_req_encode_op(req, op, src_op);
+               src_op++;
                 op++;
-               op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
         }
+
+       if (req->r_trail)
+               data_len += req->r_trail->length;
+
         if (snapc) {
                 head->snap_seq = cpu_to_le64(snapc->seq);
                 head->num_snaps = cpu_to_le32(snapc->num_snaps);
@@ -229,12 +382,79 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
                 }
         }
  
+       if (flags & CEPH_OSD_FLAG_WRITE) {
+               req->r_request->hdr.data_off = cpu_to_le16(off);
+               req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len);
+       } else if (data_len) {
+               req->r_request->hdr.data_off = 0;
+               req->r_request->hdr.data_len = cpu_to_le32(data_len);
+       }
+
         BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
         msg_size = p - msg->front.iov_base;
         msg->front.iov_len = msg_size;
         msg->hdr.front_len = cpu_to_le32(msg_size);
+       return;
+}
+EXPORT_SYMBOL(ceph_osdc_build_request);
+
+/*
+ * build new request AND message, calculate layout, and adjust file
+ * extent as needed.
+ *
+ * if the file was recently truncated, we include information about its
+ * old and new size so that the object can be updated appropriately.  (we
+ * avoid synchronously deleting truncated objects because it's slow.)
+ *
+ * if @do_sync, include a 'startsync' command so that the osd will flush
+ * data quickly.
+ */
+struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
+                                              struct ceph_file_layout *layout,
+                                              struct ceph_vino vino,
+                                              u64 off, u64 *plen,
+                                              int opcode, int flags,
+                                              struct ceph_snap_context *snapc,
+                                              int do_sync,
+                                              u32 truncate_seq,
+                                              u64 truncate_size,
+                                              struct timespec *mtime,
+                                              bool use_mempool, int num_reply)
+{
+       struct ceph_osd_req_op ops[3];
+       struct ceph_osd_request *req;
+
+       ops[0].op = opcode;
+       ops[0].extent.truncate_seq = truncate_seq;
+       ops[0].extent.truncate_size = truncate_size;
+       ops[0].payload_len = 0;
+
+       if (do_sync) {
+               ops[1].op = CEPH_OSD_OP_STARTSYNC;
+               ops[1].payload_len = 0;
+               ops[2].op = 0;
+       } else
+               ops[1].op = 0;
+
+       req = ceph_osdc_alloc_request(osdc, flags,
+                                        snapc, ops,
+                                        use_mempool,
+                                        GFP_NOFS, NULL, NULL);
+       if (IS_ERR(req))
+               return req;
+
+       /* calculate max write size */
+       calc_layout(osdc, vino, layout, off, plen, req, ops);
+       req->r_file_layout = *layout;  /* keep a copy */
+
+       ceph_osdc_build_request(req, off, plen, ops,
+                               snapc,
+                               mtime,
+                               req->r_oid, req->r_oid_len);
+
         return req;
  }
+EXPORT_SYMBOL(ceph_osdc_new_request);
  
  /*
   * We keep osd requests in an rbtree, sorted by ->r_tid.
@@ -389,7 +609,7 @@ static void __move_osd_to_lru(struct ceph_osd_client *osdc,
         dout("__move_osd_to_lru %p\n", osd);
         BUG_ON(!list_empty(&osd->o_osd_lru));
         list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
-       osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
+       osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ;
  }
  
  static void __remove_osd_from_lru(struct ceph_osd *osd)
@@ -483,7 +703,7 @@ static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
  static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
  {
         schedule_delayed_work(&osdc->timeout_work,
-                       osdc->client->mount_args->osd_keepalive_timeout * HZ);
+                       osdc->client->options->osd_keepalive_timeout * HZ);
  }
  
  static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
@@ -684,9 +904,9 @@ static void handle_timeout(struct work_struct *work)
                 container_of(work, struct ceph_osd_client, timeout_work.work);
         struct ceph_osd_request *req, *last_req = NULL;
         struct ceph_osd *osd;
-       unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
+       unsigned long timeout = osdc->client->options->osd_timeout * HZ;
         unsigned long keepalive =
-               osdc->client->mount_args->osd_keepalive_timeout * HZ;
+               osdc->client->options->osd_keepalive_timeout * HZ;
         unsigned long last_stamp = 0;
         struct rb_node *p;
         struct list_head slow_osds;
@@ -773,7 +993,7 @@ static void handle_osds_timeout(struct work_struct *work)
                 container_of(work, struct ceph_osd_client,
                              osds_timeout_work.work);
         unsigned long delay =
-               osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
+               osdc->client->options->osd_idle_ttl * HZ >> 2;
  
         dout("osds timeout\n");
         down_read(&osdc->map_sem);
@@ -1104,6 +1324,10 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
  
         req->r_request->pages = req->r_pages;
         req->r_request->nr_pages = req->r_num_pages;
+#ifdef CONFIG_BLOCK
+       req->r_request->bio = req->r_bio;
+#endif
+       req->r_request->trail = req->r_trail;
  
         register_request(osdc, req);
  
@@ -1131,6 +1355,7 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
         up_read(&osdc->map_sem);
         return rc;
  }
+EXPORT_SYMBOL(ceph_osdc_start_request);
  
  /*
   * wait for a request to complete
@@ -1153,6 +1378,7 @@ int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
         dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
         return req->r_result;
  }
+EXPORT_SYMBOL(ceph_osdc_wait_request);
  
  /*
   * sync - wait for all in-flight requests to flush.  avoid starvation.
@@ -1186,6 +1412,7 @@ void ceph_osdc_sync(struct ceph_osd_client *osdc)
         mutex_unlock(&osdc->request_mutex);
         dout("sync done (thru tid %llu)\n", last_tid);
  }
+EXPORT_SYMBOL(ceph_osdc_sync);
  
  /*
   * init, shutdown
@@ -1211,7 +1438,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
         INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
  
         schedule_delayed_work(&osdc->osds_timeout_work,
-          round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
+          round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ));
  
         err = -ENOMEM;
         osdc->req_mempool = mempool_create_kmalloc_pool(10,
@@ -1237,6 +1464,7 @@ out_mempool:
  out:
         return err;
  }
+EXPORT_SYMBOL(ceph_osdc_init);
  
  void ceph_osdc_stop(struct ceph_osd_client *osdc)
  {
@@ -1251,6 +1479,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
         ceph_msgpool_destroy(&osdc->msgpool_op);
         ceph_msgpool_destroy(&osdc->msgpool_op_reply);
  }
+EXPORT_SYMBOL(ceph_osdc_stop);
  
  /*
   * Read some contiguous pages.  If we cross a stripe boundary, shorten
@@ -1288,6 +1517,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
         dout("readpages result %d\n", rc);
         return rc;
  }
+EXPORT_SYMBOL(ceph_osdc_readpages);
  
  /*
   * do a synchronous write on N pages
@@ -1330,6 +1560,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
         dout("writepages result %d\n", rc);
         return rc;
  }
+EXPORT_SYMBOL(ceph_osdc_writepages);
  
  /*
   * handle incoming message
@@ -1420,6 +1651,9 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
                 }
                 m->pages = req->r_pages;
                 m->nr_pages = req->r_num_pages;
+#ifdef CONFIG_BLOCK
+               m->bio = req->r_bio;
+#endif
         }
         *skip = 0;
         req->r_con_filling_msg = ceph_con_get(con);
diff --git a/fs/ceph/osdmap.c b/net/ceph/osdmap.c

similarity index 97%

rename from fs/ceph/osdmap.c

rename to net/ceph/osdmap.c

index e31f118f1392988351257d2385863c37228bc7df..d73f3f6efa36ff6cf9efc33cada1853afa8073e7 100644 (file)
--- a/fs/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1,14 +1,15 @@
  
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
+#include <linux/module.h>
  #include <linux/slab.h>
  #include <asm/div64.h>
  
-#include "super.h"
-#include "osdmap.h"
-#include "crush/hash.h"
-#include "crush/mapper.h"
-#include "decode.h"
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osdmap.h>
+#include <linux/ceph/decode.h>
+#include <linux/crush/hash.h>
+#include <linux/crush/mapper.h>
  
  char *ceph_osdmap_state_str(char *str, int len, int state)
  {
@@ -417,6 +418,20 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
         return NULL;
  }
  
+int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
+{
+       struct rb_node *rbp;
+
+       for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
+               struct ceph_pg_pool_info *pi =
+                       rb_entry(rbp, struct ceph_pg_pool_info, node);
+               if (pi->name && strcmp(pi->name, name) == 0)
+                       return pi->id;
+       }
+       return -ENOENT;
+}
+EXPORT_SYMBOL(ceph_pg_poolid_by_name);
+
  static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
  {
         rb_erase(&pi->node, root);
@@ -966,6 +981,7 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
  
         dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
  }
+EXPORT_SYMBOL(ceph_calc_file_object_mapping);
  
  /*
   * calculate an object layout (i.e. pgid) from an oid,
@@ -1011,6 +1027,7 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
         ol->ol_stripe_unit = fl->fl_object_stripe_unit;
         return 0;
  }
+EXPORT_SYMBOL(ceph_calc_object_layout);
  
  /*
   * Calculate raw osd vector for the given pgid.  Return pointer to osd
@@ -1108,3 +1125,4 @@ int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
                         return osds[i];
         return -1;
  }
+EXPORT_SYMBOL(ceph_calc_pg_primary);
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c

new file mode 100644 (file)

index 0000000..13cb409
--- /dev/null
+++ b/net/ceph/pagelist.c
@@ -0,0 +1,154 @@
+
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/ceph/pagelist.h>
+
+static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
+{
+       if (pl->mapped_tail) {
+               struct page *page = list_entry(pl->head.prev, struct page, lru);
+               kunmap(page);
+               pl->mapped_tail = NULL;
+       }
+}
+
+int ceph_pagelist_release(struct ceph_pagelist *pl)
+{
+       ceph_pagelist_unmap_tail(pl);
+       while (!list_empty(&pl->head)) {
+               struct page *page = list_first_entry(&pl->head, struct page,
+                                                    lru);
+               list_del(&page->lru);
+               __free_page(page);
+       }
+       ceph_pagelist_free_reserve(pl);
+       return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_release);
+
+static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
+{
+       struct page *page;
+
+       if (!pl->num_pages_free) {
+               page = __page_cache_alloc(GFP_NOFS);
+       } else {
+               page = list_first_entry(&pl->free_list, struct page, lru);
+               list_del(&page->lru);
+               --pl->num_pages_free;
+       }
+       if (!page)
+               return -ENOMEM;
+       pl->room += PAGE_SIZE;
+       ceph_pagelist_unmap_tail(pl);
+       list_add_tail(&page->lru, &pl->head);
+       pl->mapped_tail = kmap(page);
+       return 0;
+}
+
+int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len)
+{
+       while (pl->room < len) {
+               size_t bit = pl->room;
+               int ret;
+
+               memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
+                      buf, bit);
+               pl->length += bit;
+               pl->room -= bit;
+               buf += bit;
+               len -= bit;
+               ret = ceph_pagelist_addpage(pl);
+               if (ret)
+                       return ret;
+       }
+
+       memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
+       pl->length += len;
+       pl->room -= len;
+       return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_append);
+
+/**
+ * Allocate enough pages for a pagelist to append the given amount
+ * of data without without allocating.
+ * Returns: 0 on success, -ENOMEM on error.
+ */
+int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space)
+{
+       if (space <= pl->room)
+               return 0;
+       space -= pl->room;
+       space = (space + PAGE_SIZE - 1) >> PAGE_SHIFT;   /* conv to num pages */
+
+       while (space > pl->num_pages_free) {
+               struct page *page = __page_cache_alloc(GFP_NOFS);
+               if (!page)
+                       return -ENOMEM;
+               list_add_tail(&page->lru, &pl->free_list);
+               ++pl->num_pages_free;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_reserve);
+
+/**
+ * Free any pages that have been preallocated.
+ */
+int ceph_pagelist_free_reserve(struct ceph_pagelist *pl)
+{
+       while (!list_empty(&pl->free_list)) {
+               struct page *page = list_first_entry(&pl->free_list,
+                                                    struct page, lru);
+               list_del(&page->lru);
+               __free_page(page);
+               --pl->num_pages_free;
+       }
+       BUG_ON(pl->num_pages_free);
+       return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_free_reserve);
+
+/**
+ * Create a truncation point.
+ */
+void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
+                             struct ceph_pagelist_cursor *c)
+{
+       c->pl = pl;
+       c->page_lru = pl->head.prev;
+       c->room = pl->room;
+}
+EXPORT_SYMBOL(ceph_pagelist_set_cursor);
+
+/**
+ * Truncate a pagelist to the given point. Move extra pages to reserve.
+ * This won't sleep.
+ * Returns: 0 on success,
+ *          -EINVAL if the pagelist doesn't match the trunc point pagelist
+ */
+int ceph_pagelist_truncate(struct ceph_pagelist *pl,
+                          struct ceph_pagelist_cursor *c)
+{
+       struct page *page;
+
+       if (pl != c->pl)
+               return -EINVAL;
+       ceph_pagelist_unmap_tail(pl);
+       while (pl->head.prev != c->page_lru) {
+               page = list_entry(pl->head.prev, struct page, lru);
+               list_del(&page->lru);                /* remove from pagelist */
+               list_add_tail(&page->lru, &pl->free_list); /* add to reserve */
+               ++pl->num_pages_free;
+       }
+       pl->room = c->room;
+       if (!list_empty(&pl->head)) {
+               page = list_entry(pl->head.prev, struct page, lru);
+               pl->mapped_tail = kmap(page);
+       }
+       return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_truncate);
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c

new file mode 100644 (file)

index 0000000..54caf06
--- /dev/null
+++ b/net/ceph/pagevec.c
@@ -0,0 +1,223 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/writeback.h>
+
+#include <linux/ceph/libceph.h>
+
+/*
+ * build a vector of user pages
+ */
+struct page **ceph_get_direct_page_vector(const char __user *data,
+                                                int num_pages,
+                                                loff_t off, size_t len)
+{
+       struct page **pages;
+       int rc;
+
+       pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
+       if (!pages)
+               return ERR_PTR(-ENOMEM);
+
+       down_read(&current->mm->mmap_sem);
+       rc = get_user_pages(current, current->mm, (unsigned long)data,
+                           num_pages, 0, 0, pages, NULL);
+       up_read(&current->mm->mmap_sem);
+       if (rc < 0)
+               goto fail;
+       return pages;
+
+fail:
+       kfree(pages);
+       return ERR_PTR(rc);
+}
+EXPORT_SYMBOL(ceph_get_direct_page_vector);
+
+void ceph_put_page_vector(struct page **pages, int num_pages)
+{
+       int i;
+
+       for (i = 0; i < num_pages; i++)
+               put_page(pages[i]);
+       kfree(pages);
+}
+EXPORT_SYMBOL(ceph_put_page_vector);
+
+void ceph_release_page_vector(struct page **pages, int num_pages)
+{
+       int i;
+
+       for (i = 0; i < num_pages; i++)
+               __free_pages(pages[i], 0);
+       kfree(pages);
+}
+EXPORT_SYMBOL(ceph_release_page_vector);
+
+/*
+ * allocate a vector new pages
+ */
+struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
+{
+       struct page **pages;
+       int i;
+
+       pages = kmalloc(sizeof(*pages) * num_pages, flags);
+       if (!pages)
+               return ERR_PTR(-ENOMEM);
+       for (i = 0; i < num_pages; i++) {
+               pages[i] = __page_cache_alloc(flags);
+               if (pages[i] == NULL) {
+                       ceph_release_page_vector(pages, i);
+                       return ERR_PTR(-ENOMEM);
+               }
+       }
+       return pages;
+}
+EXPORT_SYMBOL(ceph_alloc_page_vector);
+
+/*
+ * copy user data into a page vector
+ */
+int ceph_copy_user_to_page_vector(struct page **pages,
+                                        const char __user *data,
+                                        loff_t off, size_t len)
+{
+       int i = 0;
+       int po = off & ~PAGE_CACHE_MASK;
+       int left = len;
+       int l, bad;
+
+       while (left > 0) {
+               l = min_t(int, PAGE_CACHE_SIZE-po, left);
+               bad = copy_from_user(page_address(pages[i]) + po, data, l);
+               if (bad == l)
+                       return -EFAULT;
+               data += l - bad;
+               left -= l - bad;
+               po += l - bad;
+               if (po == PAGE_CACHE_SIZE) {
+                       po = 0;
+                       i++;
+               }
+       }
+       return len;
+}
+EXPORT_SYMBOL(ceph_copy_user_to_page_vector);
+
+int ceph_copy_to_page_vector(struct page **pages,
+                                   const char *data,
+                                   loff_t off, size_t len)
+{
+       int i = 0;
+       size_t po = off & ~PAGE_CACHE_MASK;
+       size_t left = len;
+       size_t l;
+
+       while (left > 0) {
+               l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
+               memcpy(page_address(pages[i]) + po, data, l);
+               data += l;
+               left -= l;
+               po += l;
+               if (po == PAGE_CACHE_SIZE) {
+                       po = 0;
+                       i++;
+               }
+       }
+       return len;
+}
+EXPORT_SYMBOL(ceph_copy_to_page_vector);
+
+int ceph_copy_from_page_vector(struct page **pages,
+                                   char *data,
+                                   loff_t off, size_t len)
+{
+       int i = 0;
+       size_t po = off & ~PAGE_CACHE_MASK;
+       size_t left = len;
+       size_t l;
+
+       while (left > 0) {
+               l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
+               memcpy(data, page_address(pages[i]) + po, l);
+               data += l;
+               left -= l;
+               po += l;
+               if (po == PAGE_CACHE_SIZE) {
+                       po = 0;
+                       i++;
+               }
+       }
+       return len;
+}
+EXPORT_SYMBOL(ceph_copy_from_page_vector);
+
+/*
+ * copy user data from a page vector into a user pointer
+ */
+int ceph_copy_page_vector_to_user(struct page **pages,
+                                        char __user *data,
+                                        loff_t off, size_t len)
+{
+       int i = 0;
+       int po = off & ~PAGE_CACHE_MASK;
+       int left = len;
+       int l, bad;
+
+       while (left > 0) {
+               l = min_t(int, left, PAGE_CACHE_SIZE-po);
+               bad = copy_to_user(data, page_address(pages[i]) + po, l);
+               if (bad == l)
+                       return -EFAULT;
+               data += l - bad;
+               left -= l - bad;
+               if (po) {
+                       po += l - bad;
+                       if (po == PAGE_CACHE_SIZE)
+                               po = 0;
+               }
+               i++;
+       }
+       return len;
+}
+EXPORT_SYMBOL(ceph_copy_page_vector_to_user);
+
+/*
+ * Zero an extent within a page vector.  Offset is relative to the
+ * start of the first page.
+ */
+void ceph_zero_page_vector_range(int off, int len, struct page **pages)
+{
+       int i = off >> PAGE_CACHE_SHIFT;
+
+       off &= ~PAGE_CACHE_MASK;
+
+       dout("zero_page_vector_page %u~%u\n", off, len);
+
+       /* leading partial page? */
+       if (off) {
+               int end = min((int)PAGE_CACHE_SIZE, off + len);
+               dout("zeroing %d %p head from %d\n", i, pages[i],
+                    (int)off);
+               zero_user_segment(pages[i], off, end);
+               len -= (end - off);
+               i++;
+       }
+       while (len >= PAGE_CACHE_SIZE) {
+               dout("zeroing %d %p len=%d\n", i, pages[i], len);
+               zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
+               len -= PAGE_CACHE_SIZE;
+               i++;
+       }
+       /* trailing partial page? */
+       if (len) {
+               dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
+               zero_user_segment(pages[i], 0, len);
+       }
+}
+EXPORT_SYMBOL(ceph_zero_page_vector_range);
+
diff --git a/net/core/datagram.c b/net/core/datagram.c

index 251997a9548362c5ebc8a0a34842971a0198539a..282806ba7a57e60991f2f7806bc3015d9b8596a5 100644 (file)
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -243,6 +243,7 @@ void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
         unlock_sock_fast(sk, slow);
  
         /* skb is now orphaned, can be freed outside of locked section */
+       trace_kfree_skb(skb, skb_free_datagram_locked);
         __kfree_skb(skb);
  }
  EXPORT_SYMBOL(skb_free_datagram_locked);
diff --git a/net/core/dev.c b/net/core/dev.c

index 660dd41aaaa6629c0d59547e04d23353ba935af8..7ec85e27beed840b8da5f9dae620bcd48453f3f7 100644 (file)
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -128,6 +128,8 @@
  #include <linux/jhash.h>
  #include <linux/random.h>
  #include <trace/events/napi.h>
+#include <trace/events/net.h>
+#include <trace/events/skb.h>
  #include <linux/pci.h>
  
  #include "net-sysfs.h"
@@ -1978,6 +1980,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
                 }
  
                 rc = ops->ndo_start_xmit(skb, dev);
+               trace_net_dev_xmit(skb, rc);
                 if (rc == NETDEV_TX_OK)
                         txq_trans_update(txq);
                 return rc;
@@ -1998,6 +2001,7 @@ gso:
                         skb_dst_drop(nskb);
  
                 rc = ops->ndo_start_xmit(nskb, dev);
+               trace_net_dev_xmit(nskb, rc);
                 if (unlikely(rc != NETDEV_TX_OK)) {
                         if (rc & ~NETDEV_TX_MASK)
                                 goto out_kfree_gso_skb;
@@ -2186,6 +2190,7 @@ int dev_queue_xmit(struct sk_buff *skb)
  #ifdef CONFIG_NET_CLS_ACT
         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
  #endif
+       trace_net_dev_queue(skb);
         if (q->enqueue) {
                 rc = __dev_xmit_skb(skb, q, dev, txq);
                 goto out;
@@ -2512,6 +2517,7 @@ int netif_rx(struct sk_buff *skb)
         if (netdev_tstamp_prequeue)
                 net_timestamp_check(skb);
  
+       trace_netif_rx(skb);
  #ifdef CONFIG_RPS
         {
                 struct rps_dev_flow voidflow, *rflow = &voidflow;
@@ -2571,6 +2577,7 @@ static void net_tx_action(struct softirq_action *h)
                         clist = clist->next;
  
                         WARN_ON(atomic_read(&skb->users));
+                       trace_kfree_skb(skb, net_tx_action);
                         __kfree_skb(skb);
                 }
         }
@@ -2828,6 +2835,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
         if (!netdev_tstamp_prequeue)
                 net_timestamp_check(skb);
  
+       trace_netif_receive_skb(skb);
         if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
                 return NET_RX_SUCCESS;
  
diff --git a/net/core/ethtool.c b/net/core/ethtool.c

index 4016ac6bdd5eb9c910192aab7cc3235835296c4f..8451ab481095fc523c47fa01ccb11392b15dbc30 100644 (file)
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -397,7 +397,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
             (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index))
                 return -ENOMEM;
         full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size;
-       indir = kmalloc(full_size, GFP_USER);
+       indir = kzalloc(full_size, GFP_USER);
         if (!indir)
                 return -ENOMEM;
  
@@ -538,7 +538,7 @@ static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr)
  
         gstrings.len = ret;
  
-       data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER);
+       data = kzalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER);
         if (!data)
                 return -ENOMEM;
  
@@ -775,7 +775,7 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
         if (regs.len > reglen)
                 regs.len = reglen;
  
-       regbuf = kmalloc(reglen, GFP_USER);
+       regbuf = kzalloc(reglen, GFP_USER);
         if (!regbuf)
                 return -ENOMEM;
  
diff --git a/net/core/net-traces.c b/net/core/net-traces.c

index afa6380ed88ac2ee0b5cd8c8a731dcb9f3dcb809..7f1bb2aba03bf0e501ee1625ba6f07163b528ab9 100644 (file)
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -26,6 +26,7 @@
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/skb.h>
+#include <trace/events/net.h>
  #include <trace/events/napi.h>
  
  EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c

index c83b421341c01b4a1de702e42d3380fac2f0aa2b..56ba3c4e4761c6584375a8acf45022bc5c40d6e8 100644 (file)
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -466,6 +466,7 @@ void consume_skb(struct sk_buff *skb)
                 smp_rmb();
         else if (likely(!atomic_dec_and_test(&skb->users)))
                 return;
+       trace_consume_skb(skb);
         __kfree_skb(skb);
  }
  EXPORT_SYMBOL(consume_skb);
diff --git a/net/core/sock.c b/net/core/sock.c

index ef30e9d286e703cccaffbd5ee10394cc167fe057..7d99e13148e6287f1cca0f5e3417ee5d535c366c 100644 (file)
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1078,8 +1078,11 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
  #ifdef CONFIG_CGROUPS
  void sock_update_classid(struct sock *sk)
  {
-       u32 classid = task_cls_classid(current);
+       u32 classid;
  
+       rcu_read_lock();  /* doing current task, which cannot vanish. */
+       classid = task_cls_classid(current);
+       rcu_read_unlock();
         if (classid && classid != sk->sk_classid)
                 sk->sk_classid = classid;
  }
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c

index 244f7cb08d681d35f9a08744ca449ce90f5b8972..37f8adb68c79e8619d1a45a496ca6e1ef37c7c8a 100644 (file)
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -11,6 +11,7 @@
  #include <linux/proc_fs.h>
  #include <linux/seq_file.h>
  #include <linux/percpu.h>
+#include <linux/security.h>
  #include <net/net_namespace.h>
  
  #include <linux/netfilter.h>
@@ -87,6 +88,29 @@ static void ct_seq_stop(struct seq_file *s, void *v)
         rcu_read_unlock();
  }
  
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+       int ret;
+       u32 len;
+       char *secctx;
+
+       ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
+       if (ret)
+               return ret;
+
+       ret = seq_printf(s, "secctx=%s ", secctx);
+
+       security_release_secctx(secctx, len);
+       return ret;
+}
+#else
+static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+       return 0;
+}
+#endif
+
  static int ct_seq_show(struct seq_file *s, void *v)
  {
         struct nf_conntrack_tuple_hash *hash = v;
@@ -148,10 +172,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
                 goto release;
  #endif
  
-#ifdef CONFIG_NF_CONNTRACK_SECMARK
-       if (seq_printf(s, "secmark=%u ", ct->secmark))
+       if (ct_show_secctx(s, ct))
                 goto release;
-#endif
  
         if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
                 goto release;
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c

index 8c8632d9b93cead0cd115945a9566d1e57829667..957c9241fb0ce1d91cde6c1b098f81b52337690a 100644 (file)
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -38,7 +38,7 @@ static DEFINE_SPINLOCK(nf_nat_lock);
  static struct nf_conntrack_l3proto *l3proto __read_mostly;
  
  #define MAX_IP_NAT_PROTO 256
-static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO]
+static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO]
                                                 __read_mostly;
  
  static inline const struct nf_nat_protocol *
diff --git a/net/netfilter/core.c b/net/netfilter/core.c

index 78b505d33bfb42cdf1033be323c2fdb1a359a833..fdaec7daff1d539038ef6ee6c0d0acfeae6e7cf6 100644 (file)
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -27,7 +27,7 @@
  
  static DEFINE_MUTEX(afinfo_mutex);
  
-const struct nf_afinfo *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
+const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
  EXPORT_SYMBOL(nf_afinfo);
  
  int nf_register_afinfo(const struct nf_afinfo *afinfo)
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c

index cdcc7649476b60e0e8584a7c2b2d31e7509a4bd4..5702de35e2bb327ea0e9bd346f57bf8ebf36c951 100644 (file)
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -26,10 +26,10 @@
  
  static DEFINE_MUTEX(nf_ct_ecache_mutex);
  
-struct nf_ct_event_notifier *nf_conntrack_event_cb __read_mostly;
+struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb __read_mostly;
  EXPORT_SYMBOL_GPL(nf_conntrack_event_cb);
  
-struct nf_exp_event_notifier *nf_expect_event_cb __read_mostly;
+struct nf_exp_event_notifier __rcu *nf_expect_event_cb __read_mostly;
  EXPORT_SYMBOL_GPL(nf_expect_event_cb);
  
  /* deliver cached events and clear cache entry - must be called with locally
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c

index 8d9e4c949b9618eafc4a7b301d305a15cab805b6..bd82450c193f5dbb4895f4fc7565040686fa740d 100644 (file)
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -16,7 +16,7 @@
  #include <linux/skbuff.h>
  #include <net/netfilter/nf_conntrack_extend.h>
  
-static struct nf_ct_ext_type *nf_ct_ext_types[NF_CT_EXT_NUM];
+static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM];
  static DEFINE_MUTEX(nf_ct_ext_type_mutex);
  
  void __nf_ct_ext_destroy(struct nf_conn *ct)
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c

index 5bae1cd15eea93ee3f74cb51dab972c10c96d33c..146476c6441a9ea8894d78bc5a00c558c84a0874 100644 (file)
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -22,6 +22,7 @@
  #include <linux/rculist_nulls.h>
  #include <linux/types.h>
  #include <linux/timer.h>
+#include <linux/security.h>
  #include <linux/skbuff.h>
  #include <linux/errno.h>
  #include <linux/netlink.h>
@@ -245,16 +246,31 @@ nla_put_failure:
  
  #ifdef CONFIG_NF_CONNTRACK_SECMARK
  static inline int
-ctnetlink_dump_secmark(struct sk_buff *skb, const struct nf_conn *ct)
+ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
  {
-       NLA_PUT_BE32(skb, CTA_SECMARK, htonl(ct->secmark));
-       return 0;
+       struct nlattr *nest_secctx;
+       int len, ret;
+       char *secctx;
+
+       ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
+       if (ret)
+               return ret;
+
+       ret = -1;
+       nest_secctx = nla_nest_start(skb, CTA_SECCTX | NLA_F_NESTED);
+       if (!nest_secctx)
+               goto nla_put_failure;
+
+       NLA_PUT_STRING(skb, CTA_SECCTX_NAME, secctx);
+       nla_nest_end(skb, nest_secctx);
  
+       ret = 0;
  nla_put_failure:
-       return -1;
+       security_release_secctx(secctx, len);
+       return ret;
  }
  #else
-#define ctnetlink_dump_secmark(a, b) (0)
+#define ctnetlink_dump_secctx(a, b) (0)
  #endif
  
  #define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
@@ -391,7 +407,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
             ctnetlink_dump_protoinfo(skb, ct) < 0 ||
             ctnetlink_dump_helpinfo(skb, ct) < 0 ||
             ctnetlink_dump_mark(skb, ct) < 0 ||
-           ctnetlink_dump_secmark(skb, ct) < 0 ||
+           ctnetlink_dump_secctx(skb, ct) < 0 ||
             ctnetlink_dump_id(skb, ct) < 0 ||
             ctnetlink_dump_use(skb, ct) < 0 ||
             ctnetlink_dump_master(skb, ct) < 0 ||
@@ -437,6 +453,17 @@ ctnetlink_counters_size(const struct nf_conn *ct)
                ;
  }
  
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+static int ctnetlink_nlmsg_secctx_size(const struct nf_conn *ct)
+{
+       int len;
+
+       security_secid_to_secctx(ct->secmark, NULL, &len);
+
+       return sizeof(char) * len;
+}
+#endif
+
  static inline size_t
  ctnetlink_nlmsg_size(const struct nf_conn *ct)
  {
@@ -453,7 +480,8 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
                + nla_total_size(0) /* CTA_HELP */
                + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
  #ifdef CONFIG_NF_CONNTRACK_SECMARK
-              + nla_total_size(sizeof(u_int32_t)) /* CTA_SECMARK */
+              + nla_total_size(0) /* CTA_SECCTX */
+              + nla_total_size(ctnetlink_nlmsg_secctx_size(ct)) /* CTA_SECCTX_NAME */
  #endif
  #ifdef CONFIG_NF_NAT_NEEDED
                + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
@@ -556,7 +584,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
  
  #ifdef CONFIG_NF_CONNTRACK_SECMARK
                 if ((events & (1 << IPCT_SECMARK) || ct->secmark)
-                   && ctnetlink_dump_secmark(skb, ct) < 0)
+                   && ctnetlink_dump_secctx(skb, ct) < 0)
                         goto nla_put_failure;
  #endif
  
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c

index 5886ba1d52a0c353a2538313c717328cacfdcac3..ed6d929580236c1b4aa77a42db959c9e522f2fc5 100644 (file)
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -28,8 +28,8 @@
  #include <net/netfilter/nf_conntrack_l4proto.h>
  #include <net/netfilter/nf_conntrack_core.h>
  
-static struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX] __read_mostly;
-struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX] __read_mostly;
+static struct nf_conntrack_l4proto __rcu **nf_ct_protos[PF_MAX] __read_mostly;
+struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX] __read_mostly;
  EXPORT_SYMBOL_GPL(nf_ct_l3protos);
  
  static DEFINE_MUTEX(nf_ct_proto_mutex);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c

index eb973fcd67ab4273a3cdee566a5f4a4fb44a24f2..0fb65705b44b522e3ba4de6c02d06f119f43f2d9 100644 (file)
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -15,6 +15,7 @@
  #include <linux/seq_file.h>
  #include <linux/percpu.h>
  #include <linux/netdevice.h>
+#include <linux/security.h>
  #include <net/net_namespace.h>
  #ifdef CONFIG_SYSCTL
  #include <linux/sysctl.h>
@@ -108,6 +109,29 @@ static void ct_seq_stop(struct seq_file *s, void *v)
         rcu_read_unlock();
  }
  
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+       int ret;
+       u32 len;
+       char *secctx;
+
+       ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
+       if (ret)
+               return ret;
+
+       ret = seq_printf(s, "secctx=%s ", secctx);
+
+       security_release_secctx(secctx, len);
+       return ret;
+}
+#else
+static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+       return 0;
+}
+#endif
+
  /* return 0 on success, 1 in case of error */
  static int ct_seq_show(struct seq_file *s, void *v)
  {
@@ -168,10 +192,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
                 goto release;
  #endif
  
-#ifdef CONFIG_NF_CONNTRACK_SECMARK
-       if (seq_printf(s, "secmark=%u ", ct->secmark))
+       if (ct_show_secctx(s, ct))
                 goto release;
-#endif
  
  #ifdef CONFIG_NF_CONNTRACK_ZONES
         if (seq_printf(s, "zone=%u ", nf_ct_zone(ct)))
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c

index 7df37fd786bc19406a5ed8a8558df95cd26d93be..b07393eab88e2fb86a21d7556f7ce532c807a172 100644 (file)
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -16,7 +16,7 @@
  #define NF_LOG_PREFIXLEN               128
  #define NFLOGGER_NAME_LEN              64
  
-static const struct nf_logger *nf_loggers[NFPROTO_NUMPROTO] __read_mostly;
+static const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO] __read_mostly;
  static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly;
  static DEFINE_MUTEX(nf_log_mutex);
  
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c

index 78b3cf9c519ca86e66b0ba4ae88c6797a7518c3f..74aebed5bd28bb5c0c924cec7d908615b82ffdd6 100644 (file)
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -18,7 +18,7 @@
   * long term mutex.  The handler must provide an an outfn() to accept packets
   * for queueing and must reinject all packets it receives, no matter what.
   */
-static const struct nf_queue_handler *queue_handler[NFPROTO_NUMPROTO] __read_mostly;
+static const struct nf_queue_handler __rcu *queue_handler[NFPROTO_NUMPROTO] __read_mostly;
  
  static DEFINE_MUTEX(queue_handler_mutex);
  
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c

index 0cb6053f02fdf04723254bfe90d8ca9bff29edfc..782e51986a6f670ce6c033c1a1d99e5a26d46885 100644 (file)
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -9,7 +9,6 @@
  #include <linux/module.h>
  #include <linux/gfp.h>
  #include <linux/skbuff.h>
-#include <linux/selinux.h>
  #include <linux/netfilter_ipv4/ip_tables.h>
  #include <linux/netfilter_ipv6/ip6_tables.h>
  #include <linux/netfilter/x_tables.h>
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c

index 23b2d6c486b573927dcefd00b575546b35376bfa..9faf5e050b796186b3204a02ece181726a26cb1a 100644 (file)
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -14,8 +14,8 @@
   */
  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  #include <linux/module.h>
+#include <linux/security.h>
  #include <linux/skbuff.h>
-#include <linux/selinux.h>
  #include <linux/netfilter/x_tables.h>
  #include <linux/netfilter/xt_SECMARK.h>
  
@@ -39,9 +39,8 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
  
         switch (mode) {
         case SECMARK_MODE_SEL:
-               secmark = info->u.sel.selsid;
+               secmark = info->secid;
                 break;
-
         default:
                 BUG();
         }
@@ -50,33 +49,33 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
         return XT_CONTINUE;
  }
  
-static int checkentry_selinux(struct xt_secmark_target_info *info)
+static int checkentry_lsm(struct xt_secmark_target_info *info)
  {
         int err;
-       struct xt_secmark_target_selinux_info *sel = &info->u.sel;
  
-       sel->selctx[SECMARK_SELCTX_MAX - 1] = '\0';
+       info->secctx[SECMARK_SECCTX_MAX - 1] = '\0';
+       info->secid = 0;
  
-       err = selinux_string_to_sid(sel->selctx, &sel->selsid);
+       err = security_secctx_to_secid(info->secctx, strlen(info->secctx),
+                                      &info->secid);
         if (err) {
                 if (err == -EINVAL)
-                       pr_info("invalid SELinux context \'%s\'\n",
-                               sel->selctx);
+                       pr_info("invalid security context \'%s\'\n", info->secctx);
                 return err;
         }
  
-       if (!sel->selsid) {
-               pr_info("unable to map SELinux context \'%s\'\n", sel->selctx);
+       if (!info->secid) {
+               pr_info("unable to map security context \'%s\'\n", info->secctx);
                 return -ENOENT;
         }
  
-       err = selinux_secmark_relabel_packet_permission(sel->selsid);
+       err = security_secmark_relabel_packet(info->secid);
         if (err) {
                 pr_info("unable to obtain relabeling permission\n");
                 return err;
         }
  
-       selinux_secmark_refcount_inc();
+       security_secmark_refcount_inc();
         return 0;
  }
  
@@ -100,16 +99,16 @@ static int secmark_tg_check(const struct xt_tgchk_param *par)
  
         switch (info->mode) {
         case SECMARK_MODE_SEL:
-               err = checkentry_selinux(info);
-               if (err <= 0)
-                       return err;
                 break;
-
         default:
                 pr_info("invalid mode: %hu\n", info->mode);
                 return -EINVAL;
         }
  
+       err = checkentry_lsm(info);
+       if (err)
+               return err;
+
         if (!mode)
                 mode = info->mode;
         return 0;
@@ -119,7 +118,7 @@ static void secmark_tg_destroy(const struct xt_tgdtor_param *par)
  {
         switch (mode) {
         case SECMARK_MODE_SEL:
-               selinux_secmark_refcount_dec();
+               security_secmark_refcount_dec();
         }
  }
  
diff --git a/net/rds/page.c b/net/rds/page.c

index 595a952d4b17f069c60a457701d6e207f68e621b..1dfbfea12e9bc82ba8482277b4331289effa275b 100644 (file)
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -57,30 +57,17 @@ int rds_page_copy_user(struct page *page, unsigned long offset,
         unsigned long ret;
         void *addr;
  
-       if (to_user)
+       addr = kmap(page);
+       if (to_user) {
                 rds_stats_add(s_copy_to_user, bytes);
-       else
+               ret = copy_to_user(ptr, addr + offset, bytes);
+       } else {
                 rds_stats_add(s_copy_from_user, bytes);
-
-       addr = kmap_atomic(page, KM_USER0);
-       if (to_user)
-               ret = __copy_to_user_inatomic(ptr, addr + offset, bytes);
-       else
-               ret = __copy_from_user_inatomic(addr + offset, ptr, bytes);
-       kunmap_atomic(addr, KM_USER0);
-
-       if (ret) {
-               addr = kmap(page);
-               if (to_user)
-                       ret = copy_to_user(ptr, addr + offset, bytes);
-               else
-                       ret = copy_from_user(addr + offset, ptr, bytes);
-               kunmap(page);
-               if (ret)
-                       return -EFAULT;
+               ret = copy_from_user(addr + offset, ptr, bytes);
         }
+       kunmap(page);
  
-       return 0;
+       return ret ? -EFAULT : 0;
  }
  EXPORT_SYMBOL_GPL(rds_page_copy_user);
  
diff --git a/scripts/Makefile b/scripts/Makefile

index 842dbc2d5aeda0af27ce18af9914deea370dc329..2e088109fbd5238f3e4f6d293848606d0ac95a58 100644 (file)
--- a/scripts/Makefile
+++ b/scripts/Makefile
@@ -11,6 +11,7 @@ hostprogs-$(CONFIG_KALLSYMS)     += kallsyms
  hostprogs-$(CONFIG_LOGO)         += pnmtologo
  hostprogs-$(CONFIG_VT)           += conmakehash
  hostprogs-$(CONFIG_IKCONFIG)     += bin2c
+hostprogs-$(BUILD_C_RECORDMCOUNT) += recordmcount
  
  always         := $(hostprogs-y) $(hostprogs-m)
  
diff --git a/scripts/Makefile.build b/scripts/Makefile.build

index a1a5cf95a68d73bd5179474e07b274f77eb5efaf..843bd4f4ffc931face60ece3907743ba766a1b08 100644 (file)
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -209,12 +209,22 @@ cmd_modversions =                                                         \
  endif
  
  ifdef CONFIG_FTRACE_MCOUNT_RECORD
+ifdef BUILD_C_RECORDMCOUNT
+# Due to recursion, we must skip empty.o.
+# The empty.o file is created in the make process in order to determine
+#  the target endianness and word size. It is made before all other C
+#  files, including recordmcount.
+cmd_record_mcount = if [ $(@) != "scripts/mod/empty.o" ]; then                 \
+                       $(objtree)/scripts/recordmcount "$(@)";                 \
+                   fi;
+else
  cmd_record_mcount = set -e ; perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \
         "$(if $(CONFIG_CPU_BIG_ENDIAN),big,little)" \
         "$(if $(CONFIG_64BIT),64,32)" \
         "$(OBJDUMP)" "$(OBJCOPY)" "$(CC)" "$(LD)" "$(NM)" "$(RM)" "$(MV)" \
         "$(if $(part-of-module),1,0)" "$(@)";
  endif
+endif
  
  define rule_cc_o_c
         $(call echo-cmd,checksrc) $(cmd_checksrc)                         \
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib

index 54fd1b700131e1e1fcb0ddd13d89ee3a06983ecb..7bfcf1a09ac599be43a125ddb5cabf17d3a37e65 100644 (file)
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -101,14 +101,6 @@ basename_flags = -D"KBUILD_BASENAME=KBUILD_STR($(call name-fix,$(basetarget)))"
  modname_flags  = $(if $(filter 1,$(words $(modname))),\
                   -D"KBUILD_MODNAME=KBUILD_STR($(call name-fix,$(modname)))")
  
-#hash values
-ifdef CONFIG_DYNAMIC_DEBUG
-debug_flags = -D"DEBUG_HASH=$(shell ./scripts/basic/hash djb2 $(@D)$(modname))"\
-              -D"DEBUG_HASH2=$(shell ./scripts/basic/hash r5 $(@D)$(modname))"
-else
-debug_flags =
-endif
-
  orig_c_flags   = $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(KBUILD_SUBDIR_CCFLAGS) \
                   $(ccflags-y) $(CFLAGS_$(basetarget).o)
  _c_flags       = $(filter-out $(CFLAGS_REMOVE_$(basetarget).o), $(orig_c_flags))
@@ -152,8 +144,7 @@ endif
  
  c_flags        = -Wp,-MD,$(depfile) $(NOSTDINC_FLAGS) $(LINUXINCLUDE)     \
                  $(__c_flags) $(modkern_cflags)                           \
-                -D"KBUILD_STR(s)=\#s" $(basename_flags) $(modname_flags) \
-                 $(debug_flags)
+                -D"KBUILD_STR(s)=\#s" $(basename_flags) $(modname_flags)
  
  a_flags        = -Wp,-MD,$(depfile) $(NOSTDINC_FLAGS) $(LINUXINCLUDE)     \
                  $(__a_flags) $(modkern_aflags)
diff --git a/scripts/basic/Makefile b/scripts/basic/Makefile

index 09559951df1208e00f5a51efa1208145a87efe14..4c324a1f1e0efb8668b4d64e05b56e8e0f64f25b 100644 (file)
--- a/scripts/basic/Makefile
+++ b/scripts/basic/Makefile
@@ -9,7 +9,7 @@
  # fixdep:       Used to generate dependency information during build process
  # docproc:      Used in Documentation/DocBook
  
-hostprogs-y    := fixdep docproc hash
+hostprogs-y    := fixdep docproc
  always         := $(hostprogs-y)
  
  # fixdep is needed to compile other host programs
diff --git a/scripts/basic/hash.c b/scripts/basic/hash.c

deleted file mode 100644 (file)

index 2ef5d3f..0000000
--- a/scripts/basic/hash.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (C) 2008 Red Hat, Inc., Jason Baron <jbaron@redhat.com>
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#define DYNAMIC_DEBUG_HASH_BITS 6
-
-static const char *program;
-
-static void usage(void)
-{
-       printf("Usage: %s <djb2|r5> <modname>\n", program);
-       exit(1);
-}
-
-/* djb2 hashing algorithm by Dan Bernstein. From:
- * http://www.cse.yorku.ca/~oz/hash.html
- */
-
-static unsigned int djb2_hash(char *str)
-{
-       unsigned long hash = 5381;
-       int c;
-
-       c = *str;
-       while (c) {
-               hash = ((hash << 5) + hash) + c;
-               c = *++str;
-       }
-       return (unsigned int)(hash & ((1 << DYNAMIC_DEBUG_HASH_BITS) - 1));
-}
-
-static unsigned int r5_hash(char *str)
-{
-       unsigned long hash = 0;
-       int c;
-
-       c = *str;
-       while (c) {
-               hash = (hash + (c << 4) + (c >> 4)) * 11;
-               c = *++str;
-       }
-       return (unsigned int)(hash & ((1 << DYNAMIC_DEBUG_HASH_BITS) - 1));
-}
-
-int main(int argc, char *argv[])
-{
-       program = argv[0];
-
-       if (argc != 3)
-               usage();
-       if (!strcmp(argv[1], "djb2"))
-               printf("%d\n", djb2_hash(argv[2]));
-       else if (!strcmp(argv[1], "r5"))
-               printf("%d\n", r5_hash(argv[2]));
-       else
-               usage();
-       exit(0);
-}
-
diff --git a/scripts/gcc-goto.sh b/scripts/gcc-goto.sh

new file mode 100644 (file)

index 0000000..520d16b
--- /dev/null
+++ b/scripts/gcc-goto.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+# Test for gcc 'asm goto' suport
+# Copyright (C) 2010, Jason Baron <jbaron@redhat.com>
+
+echo "int main(void) { entry: asm goto (\"\"::::entry); return 0; }" | $@ -x c - -c -o /dev/null >/dev/null 2>&1 && echo "y"
diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c

new file mode 100644 (file)

index 0000000..26e1271
--- /dev/null
+++ b/scripts/recordmcount.c
@@ -0,0 +1,363 @@
+/*
+ * recordmcount.c: construct a table of the locations of calls to 'mcount'
+ * so that ftrace can find them quickly.
+ * Copyright 2009 John F. Reiser <jreiser@BitWagon.com>.  All rights reserved.
+ * Licensed under the GNU General Public License, version 2 (GPLv2).
+ *
+ * Restructured to fit Linux format, as well as other updates:
+ *  Copyright 2010 Steven Rostedt <srostedt@redhat.com>, Red Hat Inc.
+ */
+
+/*
+ * Strategy: alter the .o file in-place.
+ *
+ * Append a new STRTAB that has the new section names, followed by a new array
+ * ElfXX_Shdr[] that has the new section headers, followed by the section
+ * contents for __mcount_loc and its relocations.  The old shstrtab strings,
+ * and the old ElfXX_Shdr[] array, remain as "garbage" (commonly, a couple
+ * kilobytes.)  Subsequent processing by /bin/ld (or the kernel module loader)
+ * will ignore the garbage regions, because they are not designated by the
+ * new .e_shoff nor the new ElfXX_Shdr[].  [In order to remove the garbage,
+ * then use "ld -r" to create a new file that omits the garbage.]
+ */
+
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <elf.h>
+#include <fcntl.h>
+#include <setjmp.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+static int fd_map;     /* File descriptor for file being modified. */
+static int mmap_failed; /* Boolean flag. */
+static void *ehdr_curr; /* current ElfXX_Ehdr *  for resource cleanup */
+static char gpfx;      /* prefix for global symbol name (sometimes '_') */
+static struct stat sb; /* Remember .st_size, etc. */
+static jmp_buf jmpenv; /* setjmp/longjmp per-file error escape */
+
+/* setjmp() return values */
+enum {
+       SJ_SETJMP = 0,  /* hardwired first return */
+       SJ_FAIL,
+       SJ_SUCCEED
+};
+
+/* Per-file resource cleanup when multiple files. */
+static void
+cleanup(void)
+{
+       if (!mmap_failed)
+               munmap(ehdr_curr, sb.st_size);
+       else
+               free(ehdr_curr);
+       close(fd_map);
+}
+
+static void __attribute__((noreturn))
+fail_file(void)
+{
+       cleanup();
+       longjmp(jmpenv, SJ_FAIL);
+}
+
+static void __attribute__((noreturn))
+succeed_file(void)
+{
+       cleanup();
+       longjmp(jmpenv, SJ_SUCCEED);
+}
+
+/* ulseek, uread, ...:  Check return value for errors. */
+
+static off_t
+ulseek(int const fd, off_t const offset, int const whence)
+{
+       off_t const w = lseek(fd, offset, whence);
+       if ((off_t)-1 == w) {
+               perror("lseek");
+               fail_file();
+       }
+       return w;
+}
+
+static size_t
+uread(int const fd, void *const buf, size_t const count)
+{
+       size_t const n = read(fd, buf, count);
+       if (n != count) {
+               perror("read");
+               fail_file();
+       }
+       return n;
+}
+
+static size_t
+uwrite(int const fd, void const *const buf, size_t const count)
+{
+       size_t const n = write(fd, buf, count);
+       if (n != count) {
+               perror("write");
+               fail_file();
+       }
+       return n;
+}
+
+static void *
+umalloc(size_t size)
+{
+       void *const addr = malloc(size);
+       if (0 == addr) {
+               fprintf(stderr, "malloc failed: %zu bytes\n", size);
+               fail_file();
+       }
+       return addr;
+}
+
+/*
+ * Get the whole file as a programming convenience in order to avoid
+ * malloc+lseek+read+free of many pieces.  If successful, then mmap
+ * avoids copying unused pieces; else just read the whole file.
+ * Open for both read and write; new info will be appended to the file.
+ * Use MAP_PRIVATE so that a few changes to the in-memory ElfXX_Ehdr
+ * do not propagate to the file until an explicit overwrite at the last.
+ * This preserves most aspects of consistency (all except .st_size)
+ * for simultaneous readers of the file while we are appending to it.
+ * However, multiple writers still are bad.  We choose not to use
+ * locking because it is expensive and the use case of kernel build
+ * makes multiple writers unlikely.
+ */
+static void *mmap_file(char const *fname)
+{
+       void *addr;
+
+       fd_map = open(fname, O_RDWR);
+       if (0 > fd_map || 0 > fstat(fd_map, &sb)) {
+               perror(fname);
+               fail_file();
+       }
+       if (!S_ISREG(sb.st_mode)) {
+               fprintf(stderr, "not a regular file: %s\n", fname);
+               fail_file();
+       }
+       addr = mmap(0, sb.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE,
+                   fd_map, 0);
+       mmap_failed = 0;
+       if (MAP_FAILED == addr) {
+               mmap_failed = 1;
+               addr = umalloc(sb.st_size);
+               uread(fd_map, addr, sb.st_size);
+       }
+       return addr;
+}
+
+/* w8rev, w8nat, ...: Handle endianness. */
+
+static uint64_t w8rev(uint64_t const x)
+{
+       return   ((0xff & (x >> (0 * 8))) << (7 * 8))
+              | ((0xff & (x >> (1 * 8))) << (6 * 8))
+              | ((0xff & (x >> (2 * 8))) << (5 * 8))
+              | ((0xff & (x >> (3 * 8))) << (4 * 8))
+              | ((0xff & (x >> (4 * 8))) << (3 * 8))
+              | ((0xff & (x >> (5 * 8))) << (2 * 8))
+              | ((0xff & (x >> (6 * 8))) << (1 * 8))
+              | ((0xff & (x >> (7 * 8))) << (0 * 8));
+}
+
+static uint32_t w4rev(uint32_t const x)
+{
+       return   ((0xff & (x >> (0 * 8))) << (3 * 8))
+              | ((0xff & (x >> (1 * 8))) << (2 * 8))
+              | ((0xff & (x >> (2 * 8))) << (1 * 8))
+              | ((0xff & (x >> (3 * 8))) << (0 * 8));
+}
+
+static uint32_t w2rev(uint16_t const x)
+{
+       return   ((0xff & (x >> (0 * 8))) << (1 * 8))
+              | ((0xff & (x >> (1 * 8))) << (0 * 8));
+}
+
+static uint64_t w8nat(uint64_t const x)
+{
+       return x;
+}
+
+static uint32_t w4nat(uint32_t const x)
+{
+       return x;
+}
+
+static uint32_t w2nat(uint16_t const x)
+{
+       return x;
+}
+
+static uint64_t (*w8)(uint64_t);
+static uint32_t (*w)(uint32_t);
+static uint32_t (*w2)(uint16_t);
+
+/* Names of the sections that could contain calls to mcount. */
+static int
+is_mcounted_section_name(char const *const txtname)
+{
+       return 0 == strcmp(".text",          txtname) ||
+               0 == strcmp(".sched.text",    txtname) ||
+               0 == strcmp(".spinlock.text", txtname) ||
+               0 == strcmp(".irqentry.text", txtname) ||
+               0 == strcmp(".text.unlikely", txtname);
+}
+
+/* 32 bit and 64 bit are very similar */
+#include "recordmcount.h"
+#define RECORD_MCOUNT_64
+#include "recordmcount.h"
+
+static void
+do_file(char const *const fname)
+{
+       Elf32_Ehdr *const ehdr = mmap_file(fname);
+       unsigned int reltype = 0;
+
+       ehdr_curr = ehdr;
+       w = w4nat;
+       w2 = w2nat;
+       w8 = w8nat;
+       switch (ehdr->e_ident[EI_DATA]) {
+               static unsigned int const endian = 1;
+       default: {
+               fprintf(stderr, "unrecognized ELF data encoding %d: %s\n",
+                       ehdr->e_ident[EI_DATA], fname);
+               fail_file();
+       } break;
+       case ELFDATA2LSB: {
+               if (1 != *(unsigned char const *)&endian) {
+                       /* main() is big endian, file.o is little endian. */
+                       w = w4rev;
+                       w2 = w2rev;
+                       w8 = w8rev;
+               }
+       } break;
+       case ELFDATA2MSB: {
+               if (0 != *(unsigned char const *)&endian) {
+                       /* main() is little endian, file.o is big endian. */
+                       w = w4rev;
+                       w2 = w2rev;
+                       w8 = w8rev;
+               }
+       } break;
+       }  /* end switch */
+       if (0 != memcmp(ELFMAG, ehdr->e_ident, SELFMAG)
+       ||  ET_REL != w2(ehdr->e_type)
+       ||  EV_CURRENT != ehdr->e_ident[EI_VERSION]) {
+               fprintf(stderr, "unrecognized ET_REL file %s\n", fname);
+               fail_file();
+       }
+
+       gpfx = 0;
+       switch (w2(ehdr->e_machine)) {
+       default: {
+               fprintf(stderr, "unrecognized e_machine %d %s\n",
+                       w2(ehdr->e_machine), fname);
+               fail_file();
+       } break;
+       case EM_386:     reltype = R_386_32;                   break;
+       case EM_ARM:     reltype = R_ARM_ABS32;                break;
+       case EM_IA_64:   reltype = R_IA64_IMM64;   gpfx = '_'; break;
+       case EM_PPC:     reltype = R_PPC_ADDR32;   gpfx = '_'; break;
+       case EM_PPC64:   reltype = R_PPC64_ADDR64; gpfx = '_'; break;
+       case EM_S390:    /* reltype: e_class    */ gpfx = '_'; break;
+       case EM_SH:      reltype = R_SH_DIR32;                 break;
+       case EM_SPARCV9: reltype = R_SPARC_64;     gpfx = '_'; break;
+       case EM_X86_64:  reltype = R_X86_64_64;                break;
+       }  /* end switch */
+
+       switch (ehdr->e_ident[EI_CLASS]) {
+       default: {
+               fprintf(stderr, "unrecognized ELF class %d %s\n",
+                       ehdr->e_ident[EI_CLASS], fname);
+               fail_file();
+       } break;
+       case ELFCLASS32: {
+               if (sizeof(Elf32_Ehdr) != w2(ehdr->e_ehsize)
+               ||  sizeof(Elf32_Shdr) != w2(ehdr->e_shentsize)) {
+                       fprintf(stderr,
+                               "unrecognized ET_REL file: %s\n", fname);
+                       fail_file();
+               }
+               if (EM_S390 == w2(ehdr->e_machine))
+                       reltype = R_390_32;
+               do32(ehdr, fname, reltype);
+       } break;
+       case ELFCLASS64: {
+               Elf64_Ehdr *const ghdr = (Elf64_Ehdr *)ehdr;
+               if (sizeof(Elf64_Ehdr) != w2(ghdr->e_ehsize)
+               ||  sizeof(Elf64_Shdr) != w2(ghdr->e_shentsize)) {
+                       fprintf(stderr,
+                               "unrecognized ET_REL file: %s\n", fname);
+                       fail_file();
+               }
+               if (EM_S390 == w2(ghdr->e_machine))
+                       reltype = R_390_64;
+               do64(ghdr, fname, reltype);
+       } break;
+       }  /* end switch */
+
+       cleanup();
+}
+
+int
+main(int argc, char const *argv[])
+{
+       const char ftrace[] = "kernel/trace/ftrace.o";
+       int ftrace_size = sizeof(ftrace) - 1;
+       int n_error = 0;  /* gcc-4.3.0 false positive complaint */
+
+       if (argc <= 1) {
+               fprintf(stderr, "usage: recordmcount file.o...\n");
+               return 0;
+       }
+
+       /* Process each file in turn, allowing deep failure. */
+       for (--argc, ++argv; 0 < argc; --argc, ++argv) {
+               int const sjval = setjmp(jmpenv);
+               int len;
+
+               /*
+                * The file kernel/trace/ftrace.o references the mcount
+                * function but does not call it. Since ftrace.o should
+                * not be traced anyway, we just skip it.
+                */
+               len = strlen(argv[0]);
+               if (len >= ftrace_size &&
+                   strcmp(argv[0] + (len - ftrace_size), ftrace) == 0)
+                       continue;
+
+               switch (sjval) {
+               default: {
+                       fprintf(stderr, "internal error: %s\n", argv[0]);
+                       exit(1);
+               } break;
+               case SJ_SETJMP: {  /* normal sequence */
+                       /* Avoid problems if early cleanup() */
+                       fd_map = -1;
+                       ehdr_curr = NULL;
+                       mmap_failed = 1;
+                       do_file(argv[0]);
+               } break;
+               case SJ_FAIL: {  /* error in do_file or below */
+                       ++n_error;
+               } break;
+               case SJ_SUCCEED: {  /* premature success */
+                       /* do nothing */
+               } break;
+               }  /* end switch */
+       }
+       return !!n_error;
+}
+
+
diff --git a/scripts/recordmcount.h b/scripts/recordmcount.h

new file mode 100644 (file)

index 0000000..7f39d09
--- /dev/null
+++ b/scripts/recordmcount.h
@@ -0,0 +1,366 @@
+/*
+ * recordmcount.h
+ *
+ * This code was taken out of recordmcount.c written by
+ * Copyright 2009 John F. Reiser <jreiser@BitWagon.com>.  All rights reserved.
+ *
+ * The original code had the same algorithms for both 32bit
+ * and 64bit ELF files, but the code was duplicated to support
+ * the difference in structures that were used. This
+ * file creates a macro of everything that is different between
+ * the 64 and 32 bit code, such that by including this header
+ * twice we can create both sets of functions by including this
+ * header once with RECORD_MCOUNT_64 undefined, and again with
+ * it defined.
+ *
+ * This conversion to macros was done by:
+ * Copyright 2010 Steven Rostedt <srostedt@redhat.com>, Red Hat Inc.
+ *
+ * Licensed under the GNU General Public License, version 2 (GPLv2).
+ */
+#undef append_func
+#undef sift_rel_mcount
+#undef find_secsym_ndx
+#undef __has_rel_mcount
+#undef has_rel_mcount
+#undef tot_relsize
+#undef do_func
+#undef Elf_Ehdr
+#undef Elf_Shdr
+#undef Elf_Rel
+#undef Elf_Rela
+#undef Elf_Sym
+#undef ELF_R_SYM
+#undef ELF_R_INFO
+#undef ELF_ST_BIND
+#undef uint_t
+#undef _w
+#undef _align
+#undef _size
+
+#ifdef RECORD_MCOUNT_64
+# define append_func           append64
+# define sift_rel_mcount       sift64_rel_mcount
+# define find_secsym_ndx       find64_secsym_ndx
+# define __has_rel_mcount      __has64_rel_mcount
+# define has_rel_mcount                has64_rel_mcount
+# define tot_relsize           tot64_relsize
+# define do_func               do64
+# define Elf_Ehdr              Elf64_Ehdr
+# define Elf_Shdr              Elf64_Shdr
+# define Elf_Rel               Elf64_Rel
+# define Elf_Rela              Elf64_Rela
+# define Elf_Sym               Elf64_Sym
+# define ELF_R_SYM             ELF64_R_SYM
+# define ELF_R_INFO            ELF64_R_INFO
+# define ELF_ST_BIND           ELF64_ST_BIND
+# define uint_t                        uint64_t
+# define _w                    w8
+# define _align                        7u
+# define _size                 8
+#else
+# define append_func           append32
+# define sift_rel_mcount       sift32_rel_mcount
+# define find_secsym_ndx       find32_secsym_ndx
+# define __has_rel_mcount      __has32_rel_mcount
+# define has_rel_mcount                has32_rel_mcount
+# define tot_relsize           tot32_relsize
+# define do_func               do32
+# define Elf_Ehdr              Elf32_Ehdr
+# define Elf_Shdr              Elf32_Shdr
+# define Elf_Rel               Elf32_Rel
+# define Elf_Rela              Elf32_Rela
+# define Elf_Sym               Elf32_Sym
+# define ELF_R_SYM             ELF32_R_SYM
+# define ELF_R_INFO            ELF32_R_INFO
+# define ELF_ST_BIND           ELF32_ST_BIND
+# define uint_t                        uint32_t
+# define _w                    w
+# define _align                        3u
+# define _size                 4
+#endif
+
+/* Append the new shstrtab, Elf_Shdr[], __mcount_loc and its relocations. */
+static void append_func(Elf_Ehdr *const ehdr,
+                       Elf_Shdr *const shstr,
+                       uint_t const *const mloc0,
+                       uint_t const *const mlocp,
+                       Elf_Rel const *const mrel0,
+                       Elf_Rel const *const mrelp,
+                       unsigned int const rel_entsize,
+                       unsigned int const symsec_sh_link)
+{
+       /* Begin constructing output file */
+       Elf_Shdr mcsec;
+       char const *mc_name = (sizeof(Elf_Rela) == rel_entsize)
+               ? ".rela__mcount_loc"
+               :  ".rel__mcount_loc";
+       unsigned const old_shnum = w2(ehdr->e_shnum);
+       uint_t const old_shoff = _w(ehdr->e_shoff);
+       uint_t const old_shstr_sh_size   = _w(shstr->sh_size);
+       uint_t const old_shstr_sh_offset = _w(shstr->sh_offset);
+       uint_t t = 1 + strlen(mc_name) + _w(shstr->sh_size);
+       uint_t new_e_shoff;
+
+       shstr->sh_size = _w(t);
+       shstr->sh_offset = _w(sb.st_size);
+       t += sb.st_size;
+       t += (_align & -t);  /* word-byte align */
+       new_e_shoff = t;
+
+       /* body for new shstrtab */
+       ulseek(fd_map, sb.st_size, SEEK_SET);
+       uwrite(fd_map, old_shstr_sh_offset + (void *)ehdr, old_shstr_sh_size);
+       uwrite(fd_map, mc_name, 1 + strlen(mc_name));
+
+       /* old(modified) Elf_Shdr table, word-byte aligned */
+       ulseek(fd_map, t, SEEK_SET);
+       t += sizeof(Elf_Shdr) * old_shnum;
+       uwrite(fd_map, old_shoff + (void *)ehdr,
+              sizeof(Elf_Shdr) * old_shnum);
+
+       /* new sections __mcount_loc and .rel__mcount_loc */
+       t += 2*sizeof(mcsec);
+       mcsec.sh_name = w((sizeof(Elf_Rela) == rel_entsize) + strlen(".rel")
+               + old_shstr_sh_size);
+       mcsec.sh_type = w(SHT_PROGBITS);
+       mcsec.sh_flags = _w(SHF_ALLOC);
+       mcsec.sh_addr = 0;
+       mcsec.sh_offset = _w(t);
+       mcsec.sh_size = _w((void *)mlocp - (void *)mloc0);
+       mcsec.sh_link = 0;
+       mcsec.sh_info = 0;
+       mcsec.sh_addralign = _w(_size);
+       mcsec.sh_entsize = _w(_size);
+       uwrite(fd_map, &mcsec, sizeof(mcsec));
+
+       mcsec.sh_name = w(old_shstr_sh_size);
+       mcsec.sh_type = (sizeof(Elf_Rela) == rel_entsize)
+               ? w(SHT_RELA)
+               : w(SHT_REL);
+       mcsec.sh_flags = 0;
+       mcsec.sh_addr = 0;
+       mcsec.sh_offset = _w((void *)mlocp - (void *)mloc0 + t);
+       mcsec.sh_size   = _w((void *)mrelp - (void *)mrel0);
+       mcsec.sh_link = w(symsec_sh_link);
+       mcsec.sh_info = w(old_shnum);
+       mcsec.sh_addralign = _w(_size);
+       mcsec.sh_entsize = _w(rel_entsize);
+       uwrite(fd_map, &mcsec, sizeof(mcsec));
+
+       uwrite(fd_map, mloc0, (void *)mlocp - (void *)mloc0);
+       uwrite(fd_map, mrel0, (void *)mrelp - (void *)mrel0);
+
+       ehdr->e_shoff = _w(new_e_shoff);
+       ehdr->e_shnum = w2(2 + w2(ehdr->e_shnum));  /* {.rel,}__mcount_loc */
+       ulseek(fd_map, 0, SEEK_SET);
+       uwrite(fd_map, ehdr, sizeof(*ehdr));
+}
+
+
+/*
+ * Look at the relocations in order to find the calls to mcount.
+ * Accumulate the section offsets that are found, and their relocation info,
+ * onto the end of the existing arrays.
+ */
+static uint_t *sift_rel_mcount(uint_t *mlocp,
+                              unsigned const offbase,
+                              Elf_Rel **const mrelpp,
+                              Elf_Shdr const *const relhdr,
+                              Elf_Ehdr const *const ehdr,
+                              unsigned const recsym,
+                              uint_t const recval,
+                              unsigned const reltype)
+{
+       uint_t *const mloc0 = mlocp;
+       Elf_Rel *mrelp = *mrelpp;
+       Elf_Shdr *const shdr0 = (Elf_Shdr *)(_w(ehdr->e_shoff)
+               + (void *)ehdr);
+       unsigned const symsec_sh_link = w(relhdr->sh_link);
+       Elf_Shdr const *const symsec = &shdr0[symsec_sh_link];
+       Elf_Sym const *const sym0 = (Elf_Sym const *)(_w(symsec->sh_offset)
+               + (void *)ehdr);
+
+       Elf_Shdr const *const strsec = &shdr0[w(symsec->sh_link)];
+       char const *const str0 = (char const *)(_w(strsec->sh_offset)
+               + (void *)ehdr);
+
+       Elf_Rel const *const rel0 = (Elf_Rel const *)(_w(relhdr->sh_offset)
+               + (void *)ehdr);
+       unsigned rel_entsize = _w(relhdr->sh_entsize);
+       unsigned const nrel = _w(relhdr->sh_size) / rel_entsize;
+       Elf_Rel const *relp = rel0;
+
+       unsigned mcountsym = 0;
+       unsigned t;
+
+       for (t = nrel; t; --t) {
+               if (!mcountsym) {
+                       Elf_Sym const *const symp =
+                               &sym0[ELF_R_SYM(_w(relp->r_info))];
+                       char const *symname = &str0[w(symp->st_name)];
+
+                       if ('.' == symname[0])
+                               ++symname;  /* ppc64 hack */
+                       if (0 == strcmp((('_' == gpfx) ? "_mcount" : "mcount"),
+                                       symname))
+                               mcountsym = ELF_R_SYM(_w(relp->r_info));
+               }
+
+               if (mcountsym == ELF_R_SYM(_w(relp->r_info))) {
+                       uint_t const addend = _w(_w(relp->r_offset) - recval);
+
+                       mrelp->r_offset = _w(offbase
+                               + ((void *)mlocp - (void *)mloc0));
+                       mrelp->r_info = _w(ELF_R_INFO(recsym, reltype));
+                       if (sizeof(Elf_Rela) == rel_entsize) {
+                               ((Elf_Rela *)mrelp)->r_addend = addend;
+                               *mlocp++ = 0;
+                       } else
+                               *mlocp++ = addend;
+
+                       mrelp = (Elf_Rel *)(rel_entsize + (void *)mrelp);
+               }
+               relp = (Elf_Rel const *)(rel_entsize + (void *)relp);
+       }
+       *mrelpp = mrelp;
+       return mlocp;
+}
+
+
+/*
+ * Find a symbol in the given section, to be used as the base for relocating
+ * the table of offsets of calls to mcount.  A local or global symbol suffices,
+ * but avoid a Weak symbol because it may be overridden; the change in value
+ * would invalidate the relocations of the offsets of the calls to mcount.
+ * Often the found symbol will be the unnamed local symbol generated by
+ * GNU 'as' for the start of each section.  For example:
+ *    Num:    Value  Size Type    Bind   Vis      Ndx Name
+ *      2: 00000000     0 SECTION LOCAL  DEFAULT    1
+ */
+static unsigned find_secsym_ndx(unsigned const txtndx,
+                               char const *const txtname,
+                               uint_t *const recvalp,
+                               Elf_Shdr const *const symhdr,
+                               Elf_Ehdr const *const ehdr)
+{
+       Elf_Sym const *const sym0 = (Elf_Sym const *)(_w(symhdr->sh_offset)
+               + (void *)ehdr);
+       unsigned const nsym = _w(symhdr->sh_size) / _w(symhdr->sh_entsize);
+       Elf_Sym const *symp;
+       unsigned t;
+
+       for (symp = sym0, t = nsym; t; --t, ++symp) {
+               unsigned int const st_bind = ELF_ST_BIND(symp->st_info);
+
+               if (txtndx == w2(symp->st_shndx)
+                       /* avoid STB_WEAK */
+                   && (STB_LOCAL == st_bind || STB_GLOBAL == st_bind)) {
+                       *recvalp = _w(symp->st_value);
+                       return symp - sym0;
+               }
+       }
+       fprintf(stderr, "Cannot find symbol for section %d: %s.\n",
+               txtndx, txtname);
+       fail_file();
+}
+
+
+/* Evade ISO C restriction: no declaration after statement in has_rel_mcount. */
+static char const *
+__has_rel_mcount(Elf_Shdr const *const relhdr,  /* is SHT_REL or SHT_RELA */
+                Elf_Shdr const *const shdr0,
+                char const *const shstrtab,
+                char const *const fname)
+{
+       /* .sh_info depends on .sh_type == SHT_REL[,A] */
+       Elf_Shdr const *const txthdr = &shdr0[w(relhdr->sh_info)];
+       char const *const txtname = &shstrtab[w(txthdr->sh_name)];
+
+       if (0 == strcmp("__mcount_loc", txtname)) {
+               fprintf(stderr, "warning: __mcount_loc already exists: %s\n",
+                       fname);
+               succeed_file();
+       }
+       if (SHT_PROGBITS != w(txthdr->sh_type) ||
+           !is_mcounted_section_name(txtname))
+               return NULL;
+       return txtname;
+}
+
+static char const *has_rel_mcount(Elf_Shdr const *const relhdr,
+                                 Elf_Shdr const *const shdr0,
+                                 char const *const shstrtab,
+                                 char const *const fname)
+{
+       if (SHT_REL  != w(relhdr->sh_type) && SHT_RELA != w(relhdr->sh_type))
+               return NULL;
+       return __has_rel_mcount(relhdr, shdr0, shstrtab, fname);
+}
+
+
+static unsigned tot_relsize(Elf_Shdr const *const shdr0,
+                           unsigned nhdr,
+                           const char *const shstrtab,
+                           const char *const fname)
+{
+       unsigned totrelsz = 0;
+       Elf_Shdr const *shdrp = shdr0;
+
+       for (; nhdr; --nhdr, ++shdrp) {
+               if (has_rel_mcount(shdrp, shdr0, shstrtab, fname))
+                       totrelsz += _w(shdrp->sh_size);
+       }
+       return totrelsz;
+}
+
+
+/* Overall supervision for Elf32 ET_REL file. */
+static void
+do_func(Elf_Ehdr *const ehdr, char const *const fname, unsigned const reltype)
+{
+       Elf_Shdr *const shdr0 = (Elf_Shdr *)(_w(ehdr->e_shoff)
+               + (void *)ehdr);
+       unsigned const nhdr = w2(ehdr->e_shnum);
+       Elf_Shdr *const shstr = &shdr0[w2(ehdr->e_shstrndx)];
+       char const *const shstrtab = (char const *)(_w(shstr->sh_offset)
+               + (void *)ehdr);
+
+       Elf_Shdr const *relhdr;
+       unsigned k;
+
+       /* Upper bound on space: assume all relevant relocs are for mcount. */
+       unsigned const totrelsz = tot_relsize(shdr0, nhdr, shstrtab, fname);
+       Elf_Rel *const mrel0 = umalloc(totrelsz);
+       Elf_Rel *      mrelp = mrel0;
+
+       /* 2*sizeof(address) <= sizeof(Elf_Rel) */
+       uint_t *const mloc0 = umalloc(totrelsz>>1);
+       uint_t *      mlocp = mloc0;
+
+       unsigned rel_entsize = 0;
+       unsigned symsec_sh_link = 0;
+
+       for (relhdr = shdr0, k = nhdr; k; --k, ++relhdr) {
+               char const *const txtname = has_rel_mcount(relhdr, shdr0,
+                       shstrtab, fname);
+               if (txtname) {
+                       uint_t recval = 0;
+                       unsigned const recsym = find_secsym_ndx(
+                               w(relhdr->sh_info), txtname, &recval,
+                               &shdr0[symsec_sh_link = w(relhdr->sh_link)],
+                               ehdr);
+
+                       rel_entsize = _w(relhdr->sh_entsize);
+                       mlocp = sift_rel_mcount(mlocp,
+                               (void *)mlocp - (void *)mloc0, &mrelp,
+                               relhdr, ehdr, recsym, recval, reltype);
+               }
+       }
+       if (mloc0 != mlocp) {
+               append_func(ehdr, shstr, mloc0, mlocp, mrel0, mrelp,
+                           rel_entsize, symsec_sh_link);
+       }
+       free(mrel0);
+       free(mloc0);
+}
diff --git a/security/apparmor/.gitignore b/security/apparmor/.gitignore

index 0a0a99f3b08331f78f7499c5e964929af20e96c9..4d995aeaebc0ad0a3232faebead627da4ebb40b9 100644 (file)
--- a/security/apparmor/.gitignore
+++ b/security/apparmor/.gitignore
@@ -3,3 +3,4 @@
  #
  af_names.h
  capability_names.h
+rlim_names.h
diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c

index 7320331b44aba5bd52eac6b2a97302ad21a4ca5f..544ff5837cb640623dadd738fe8879d823574eda 100644 (file)
--- a/security/apparmor/apparmorfs.c
+++ b/security/apparmor/apparmorfs.c
@@ -29,7 +29,7 @@
   * aa_simple_write_to_buffer - common routine for getting policy from user
   * @op: operation doing the user buffer copy
   * @userbuf: user buffer to copy data from  (NOT NULL)
- * @alloc_size: size of user buffer
+ * @alloc_size: size of user buffer (REQUIRES: @alloc_size >= @copy_size)
   * @copy_size: size of data to copy from user buffer
   * @pos: position write is at in the file (NOT NULL)
   *
@@ -42,6 +42,8 @@ static char *aa_simple_write_to_buffer(int op, const char __user *userbuf,
  {
         char *data;
  
+       BUG_ON(copy_size > alloc_size);
+
         if (*pos != 0)
                 /* only writes from pos 0, that is complete writes */
                 return ERR_PTR(-ESPIPE);
diff --git a/security/capability.c b/security/capability.c

index 95a6599a37bb3ae0737779d16d9a0d811d5bfc82..30ae00fbecd591591acb55c1431d62a1bbbac427 100644 (file)
--- a/security/capability.c
+++ b/security/capability.c
@@ -677,7 +677,18 @@ static void cap_inet_conn_established(struct sock *sk, struct sk_buff *skb)
  {
  }
  
+static int cap_secmark_relabel_packet(u32 secid)
+{
+       return 0;
+}
  
+static void cap_secmark_refcount_inc(void)
+{
+}
+
+static void cap_secmark_refcount_dec(void)
+{
+}
  
  static void cap_req_classify_flow(const struct request_sock *req,
                                   struct flowi *fl)
@@ -777,7 +788,8 @@ static int cap_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
  
  static int cap_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
  {
-       return -EOPNOTSUPP;
+       *secid = 0;
+       return 0;
  }
  
  static void cap_release_secctx(char *secdata, u32 seclen)
@@ -1018,6 +1030,9 @@ void __init security_fixup_ops(struct security_operations *ops)
         set_to_cap_if_null(ops, inet_conn_request);
         set_to_cap_if_null(ops, inet_csk_clone);
         set_to_cap_if_null(ops, inet_conn_established);
+       set_to_cap_if_null(ops, secmark_relabel_packet);
+       set_to_cap_if_null(ops, secmark_refcount_inc);
+       set_to_cap_if_null(ops, secmark_refcount_dec);
         set_to_cap_if_null(ops, req_classify_flow);
         set_to_cap_if_null(ops, tun_dev_create);
         set_to_cap_if_null(ops, tun_dev_post_create);
diff --git a/security/commoncap.c b/security/commoncap.c

index 9d172e6e330c9fd7906a8a2e5754713f80dfb433..5e632b4857e443d8031eaa17c0e2bd7e877b3d14 100644 (file)
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -719,14 +719,11 @@ static int cap_safe_nice(struct task_struct *p)
  /**
   * cap_task_setscheduler - Detemine if scheduler policy change is permitted
   * @p: The task to affect
- * @policy: The policy to effect
- * @lp: The parameters to the scheduling policy
   *
   * Detemine if the requested scheduler policy change is permitted for the
   * specified task, returning 0 if permission is granted, -ve if denied.
   */
-int cap_task_setscheduler(struct task_struct *p, int policy,
-                          struct sched_param *lp)
+int cap_task_setscheduler(struct task_struct *p)
  {
         return cap_safe_nice(p);
  }
diff --git a/security/security.c b/security/security.c

index c53949f17d9e0dddc0601032576ef2922fb88f86..b50f472061a43c6ec7fb53781c084fb2cb487dcb 100644 (file)
--- a/security/security.c
+++ b/security/security.c
@@ -89,20 +89,12 @@ __setup("security=", choose_lsm);
   * Return true if:
   *     -The passed LSM is the one chosen by user at boot time,
   *     -or the passed LSM is configured as the default and the user did not
- *      choose an alternate LSM at boot time,
- *     -or there is no default LSM set and the user didn't specify a
- *      specific LSM and we're the first to ask for registration permission,
- *     -or the passed LSM is currently loaded.
+ *      choose an alternate LSM at boot time.
   * Otherwise, return false.
   */
  int __init security_module_enable(struct security_operations *ops)
  {
-       if (!*chosen_lsm)
-               strncpy(chosen_lsm, ops->name, SECURITY_NAME_MAX);
-       else if (strncmp(ops->name, chosen_lsm, SECURITY_NAME_MAX))
-               return 0;
-
-       return 1;
+       return !strcmp(ops->name, chosen_lsm);
  }
  
  /**
@@ -786,10 +778,9 @@ int security_task_setrlimit(struct task_struct *p, unsigned int resource,
         return security_ops->task_setrlimit(p, resource, new_rlim);
  }
  
-int security_task_setscheduler(struct task_struct *p,
-                               int policy, struct sched_param *lp)
+int security_task_setscheduler(struct task_struct *p)
  {
-       return security_ops->task_setscheduler(p, policy, lp);
+       return security_ops->task_setscheduler(p);
  }
  
  int security_task_getscheduler(struct task_struct *p)
@@ -1145,6 +1136,24 @@ void security_inet_conn_established(struct sock *sk,
         security_ops->inet_conn_established(sk, skb);
  }
  
+int security_secmark_relabel_packet(u32 secid)
+{
+       return security_ops->secmark_relabel_packet(secid);
+}
+EXPORT_SYMBOL(security_secmark_relabel_packet);
+
+void security_secmark_refcount_inc(void)
+{
+       security_ops->secmark_refcount_inc();
+}
+EXPORT_SYMBOL(security_secmark_refcount_inc);
+
+void security_secmark_refcount_dec(void)
+{
+       security_ops->secmark_refcount_dec();
+}
+EXPORT_SYMBOL(security_secmark_refcount_dec);
+
  int security_tun_dev_create(void)
  {
         return security_ops->tun_dev_create();
diff --git a/security/selinux/Makefile b/security/selinux/Makefile

index 58d80f3bd6f681f6d366f5b67becd7ff433ce2c7..ad5cd76ec231cd14f02b2fb15f07a3d8a069972f 100644 (file)
--- a/security/selinux/Makefile
+++ b/security/selinux/Makefile
@@ -2,25 +2,20 @@
  # Makefile for building the SELinux module as part of the kernel tree.
  #
  
-obj-$(CONFIG_SECURITY_SELINUX) := selinux.o ss/
-
-selinux-y := avc.o \
-            hooks.o \
-            selinuxfs.o \
-            netlink.o \
-            nlmsgtab.o \
-            netif.o \
-            netnode.o \
-            netport.o \
-            exports.o
+obj-$(CONFIG_SECURITY_SELINUX) := selinux.o
+
+selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o \
+            netnode.o netport.o exports.o \
+            ss/ebitmap.o ss/hashtab.o ss/symtab.o ss/sidtab.o ss/avtab.o \
+            ss/policydb.o ss/services.o ss/conditional.o ss/mls.o ss/status.o
  
  selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o
  
  selinux-$(CONFIG_NETLABEL) += netlabel.o
  
-EXTRA_CFLAGS += -Isecurity/selinux -Isecurity/selinux/include
+ccflags-y := -Isecurity/selinux -Isecurity/selinux/include
  
-$(obj)/avc.o: $(obj)/flask.h
+$(addprefix $(obj)/,$(selinux-y)): $(obj)/flask.h
  
  quiet_cmd_flask = GEN     $(obj)/flask.h $(obj)/av_permissions.h
        cmd_flask = scripts/selinux/genheaders/genheaders $(obj)/flask.h $(obj)/av_permissions.h
diff --git a/security/selinux/exports.c b/security/selinux/exports.c

index c0a454aee1e03cb0e3825a2c5f965c941636c4d9..90664385dead0df01f6eec2b7479c83f4f021160 100644 (file)
--- a/security/selinux/exports.c
+++ b/security/selinux/exports.c
@@ -11,58 +11,9 @@
   * it under the terms of the GNU General Public License version 2,
   * as published by the Free Software Foundation.
   */
-#include <linux/types.h>
-#include <linux/kernel.h>
  #include <linux/module.h>
-#include <linux/selinux.h>
-#include <linux/fs.h>
-#include <linux/ipc.h>
-#include <asm/atomic.h>
  
  #include "security.h"
-#include "objsec.h"
-
-/* SECMARK reference count */
-extern atomic_t selinux_secmark_refcount;
-
-int selinux_string_to_sid(char *str, u32 *sid)
-{
-       if (selinux_enabled)
-               return security_context_to_sid(str, strlen(str), sid);
-       else {
-               *sid = 0;
-               return 0;
-       }
-}
-EXPORT_SYMBOL_GPL(selinux_string_to_sid);
-
-int selinux_secmark_relabel_packet_permission(u32 sid)
-{
-       if (selinux_enabled) {
-               const struct task_security_struct *__tsec;
-               u32 tsid;
-
-               __tsec = current_security();
-               tsid = __tsec->sid;
-
-               return avc_has_perm(tsid, sid, SECCLASS_PACKET,
-                                   PACKET__RELABELTO, NULL);
-       }
-       return 0;
-}
-EXPORT_SYMBOL_GPL(selinux_secmark_relabel_packet_permission);
-
-void selinux_secmark_refcount_inc(void)
-{
-       atomic_inc(&selinux_secmark_refcount);
-}
-EXPORT_SYMBOL_GPL(selinux_secmark_refcount_inc);
-
-void selinux_secmark_refcount_dec(void)
-{
-       atomic_dec(&selinux_secmark_refcount);
-}
-EXPORT_SYMBOL_GPL(selinux_secmark_refcount_dec);
  
  bool selinux_is_enabled(void)
  {
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c

index 4796ddd4e721ae454a02563d713aa235870ece02..d9154cf90ae19cd4eb5f40d65882abb60781da3d 100644 (file)
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3354,11 +3354,11 @@ static int selinux_task_setrlimit(struct task_struct *p, unsigned int resource,
         return 0;
  }
  
-static int selinux_task_setscheduler(struct task_struct *p, int policy, struct sched_param *lp)
+static int selinux_task_setscheduler(struct task_struct *p)
  {
         int rc;
  
-       rc = cap_task_setscheduler(p, policy, lp);
+       rc = cap_task_setscheduler(p);
         if (rc)
                 return rc;
  
@@ -4279,6 +4279,27 @@ static void selinux_inet_conn_established(struct sock *sk, struct sk_buff *skb)
         selinux_skb_peerlbl_sid(skb, family, &sksec->peer_sid);
  }
  
+static int selinux_secmark_relabel_packet(u32 sid)
+{
+       const struct task_security_struct *__tsec;
+       u32 tsid;
+
+       __tsec = current_security();
+       tsid = __tsec->sid;
+
+       return avc_has_perm(tsid, sid, SECCLASS_PACKET, PACKET__RELABELTO, NULL);
+}
+
+static void selinux_secmark_refcount_inc(void)
+{
+       atomic_inc(&selinux_secmark_refcount);
+}
+
+static void selinux_secmark_refcount_dec(void)
+{
+       atomic_dec(&selinux_secmark_refcount);
+}
+
  static void selinux_req_classify_flow(const struct request_sock *req,
                                       struct flowi *fl)
  {
@@ -5533,6 +5554,9 @@ static struct security_operations selinux_ops = {
         .inet_conn_request =            selinux_inet_conn_request,
         .inet_csk_clone =               selinux_inet_csk_clone,
         .inet_conn_established =        selinux_inet_conn_established,
+       .secmark_relabel_packet =       selinux_secmark_relabel_packet,
+       .secmark_refcount_inc =         selinux_secmark_refcount_inc,
+       .secmark_refcount_dec =         selinux_secmark_refcount_dec,
         .req_classify_flow =            selinux_req_classify_flow,
         .tun_dev_create =               selinux_tun_dev_create,
         .tun_dev_post_create =          selinux_tun_dev_post_create,
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h

index b4c9eb4bd6f9127a506e2a4483c592362c8fcafa..8858d2b2d4b6ad1dd1b005b20a06afa4a3505d03 100644 (file)
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -17,7 +17,7 @@ struct security_class_mapping secclass_map[] = {
           { "compute_av", "compute_create", "compute_member",
             "check_context", "load_policy", "compute_relabel",
             "compute_user", "setenforce", "setbool", "setsecparam",
-           "setcheckreqprot", NULL } },
+           "setcheckreqprot", "read_policy", NULL } },
         { "process",
           { "fork", "transition", "sigchld", "sigkill",
             "sigstop", "signull", "signal", "ptrace", "getsched", "setsched",
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h

index 1f7c2491d3dccbc54769a6ccaf509d50255cfe3f..671273eb1115c4e7f05983af071069aea7535650 100644 (file)
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -9,6 +9,7 @@
  #define _SELINUX_SECURITY_H_
  
  #include <linux/magic.h>
+#include <linux/types.h>
  #include "flask.h"
  
  #define SECSID_NULL                    0x00000000 /* unspecified SID */
@@ -82,6 +83,8 @@ extern int selinux_policycap_openperm;
  int security_mls_enabled(void);
  
  int security_load_policy(void *data, size_t len);
+int security_read_policy(void **data, ssize_t *len);
+size_t security_policydb_len(void);
  
  int security_policycap_supported(unsigned int req_cap);
  
@@ -191,5 +194,25 @@ static inline int security_netlbl_sid_to_secattr(u32 sid,
  
  const char *security_get_initial_sid_context(u32 sid);
  
+/*
+ * status notifier using mmap interface
+ */
+extern struct page *selinux_kernel_status_page(void);
+
+#define SELINUX_KERNEL_STATUS_VERSION  1
+struct selinux_kernel_status {
+       u32     version;        /* version number of thie structure */
+       u32     sequence;       /* sequence number of seqlock logic */
+       u32     enforcing;      /* current setting of enforcing mode */
+       u32     policyload;     /* times of policy reloaded */
+       u32     deny_unknown;   /* current setting of deny_unknown */
+       /*
+        * The version > 0 supports above members.
+        */
+} __attribute__((packed));
+
+extern void selinux_status_update_setenforce(int enforcing);
+extern void selinux_status_update_policyload(int seqno);
+
  #endif /* _SELINUX_SECURITY_H_ */
  
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c

index 79a1bb635662fbc7f65a306e10b5b67e534fcf1c..87e0556bae70ff977ea290b3cdfcc2c308d8edf5 100644 (file)
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -68,6 +68,8 @@ static int *bool_pending_values;
  static struct dentry *class_dir;
  static unsigned long last_class_ino;
  
+static char policy_opened;
+
  /* global data for policy capabilities */
  static struct dentry *policycap_dir;
  
@@ -110,6 +112,8 @@ enum sel_inos {
         SEL_COMPAT_NET, /* whether to use old compat network packet controls */
         SEL_REJECT_UNKNOWN, /* export unknown reject handling to userspace */
         SEL_DENY_UNKNOWN, /* export unknown deny handling to userspace */
+       SEL_STATUS,     /* export current status using mmap() */
+       SEL_POLICY,     /* allow userspace to read the in kernel policy */
         SEL_INO_NEXT,   /* The next inode number to use */
  };
  
@@ -171,6 +175,7 @@ static ssize_t sel_write_enforce(struct file *file, const char __user *buf,
                 if (selinux_enforcing)
                         avc_ss_reset(0);
                 selnl_notify_setenforce(selinux_enforcing);
+               selinux_status_update_setenforce(selinux_enforcing);
         }
         length = count;
  out:
@@ -205,6 +210,59 @@ static const struct file_operations sel_handle_unknown_ops = {
         .llseek         = generic_file_llseek,
  };
  
+static int sel_open_handle_status(struct inode *inode, struct file *filp)
+{
+       struct page    *status = selinux_kernel_status_page();
+
+       if (!status)
+               return -ENOMEM;
+
+       filp->private_data = status;
+
+       return 0;
+}
+
+static ssize_t sel_read_handle_status(struct file *filp, char __user *buf,
+                                     size_t count, loff_t *ppos)
+{
+       struct page    *status = filp->private_data;
+
+       BUG_ON(!status);
+
+       return simple_read_from_buffer(buf, count, ppos,
+                                      page_address(status),
+                                      sizeof(struct selinux_kernel_status));
+}
+
+static int sel_mmap_handle_status(struct file *filp,
+                                 struct vm_area_struct *vma)
+{
+       struct page    *status = filp->private_data;
+       unsigned long   size = vma->vm_end - vma->vm_start;
+
+       BUG_ON(!status);
+
+       /* only allows one page from the head */
+       if (vma->vm_pgoff > 0 || size != PAGE_SIZE)
+               return -EIO;
+       /* disallow writable mapping */
+       if (vma->vm_flags & VM_WRITE)
+               return -EPERM;
+       /* disallow mprotect() turns it into writable */
+       vma->vm_flags &= ~VM_MAYWRITE;
+
+       return remap_pfn_range(vma, vma->vm_start,
+                              page_to_pfn(status),
+                              size, vma->vm_page_prot);
+}
+
+static const struct file_operations sel_handle_status_ops = {
+       .open           = sel_open_handle_status,
+       .read           = sel_read_handle_status,
+       .mmap           = sel_mmap_handle_status,
+       .llseek         = generic_file_llseek,
+};
+
  #ifdef CONFIG_SECURITY_SELINUX_DISABLE
  static ssize_t sel_write_disable(struct file *file, const char __user *buf,
                                  size_t count, loff_t *ppos)
@@ -296,6 +354,141 @@ static const struct file_operations sel_mls_ops = {
         .llseek         = generic_file_llseek,
  };
  
+struct policy_load_memory {
+       size_t len;
+       void *data;
+};
+
+static int sel_open_policy(struct inode *inode, struct file *filp)
+{
+       struct policy_load_memory *plm = NULL;
+       int rc;
+
+       BUG_ON(filp->private_data);
+
+       mutex_lock(&sel_mutex);
+
+       rc = task_has_security(current, SECURITY__READ_POLICY);
+       if (rc)
+               goto err;
+
+       rc = -EBUSY;
+       if (policy_opened)
+               goto err;
+
+       rc = -ENOMEM;
+       plm = kzalloc(sizeof(*plm), GFP_KERNEL);
+       if (!plm)
+               goto err;
+
+       if (i_size_read(inode) != security_policydb_len()) {
+               mutex_lock(&inode->i_mutex);
+               i_size_write(inode, security_policydb_len());
+               mutex_unlock(&inode->i_mutex);
+       }
+
+       rc = security_read_policy(&plm->data, &plm->len);
+       if (rc)
+               goto err;
+
+       policy_opened = 1;
+
+       filp->private_data = plm;
+
+       mutex_unlock(&sel_mutex);
+
+       return 0;
+err:
+       mutex_unlock(&sel_mutex);
+
+       if (plm)
+               vfree(plm->data);
+       kfree(plm);
+       return rc;
+}
+
+static int sel_release_policy(struct inode *inode, struct file *filp)
+{
+       struct policy_load_memory *plm = filp->private_data;
+
+       BUG_ON(!plm);
+
+       policy_opened = 0;
+
+       vfree(plm->data);
+       kfree(plm);
+
+       return 0;
+}
+
+static ssize_t sel_read_policy(struct file *filp, char __user *buf,
+                              size_t count, loff_t *ppos)
+{
+       struct policy_load_memory *plm = filp->private_data;
+       int ret;
+
+       mutex_lock(&sel_mutex);
+
+       ret = task_has_security(current, SECURITY__READ_POLICY);
+       if (ret)
+               goto out;
+
+       ret = simple_read_from_buffer(buf, count, ppos, plm->data, plm->len);
+out:
+       mutex_unlock(&sel_mutex);
+       return ret;
+}
+
+static int sel_mmap_policy_fault(struct vm_area_struct *vma,
+                                struct vm_fault *vmf)
+{
+       struct policy_load_memory *plm = vma->vm_file->private_data;
+       unsigned long offset;
+       struct page *page;
+
+       if (vmf->flags & (FAULT_FLAG_MKWRITE | FAULT_FLAG_WRITE))
+               return VM_FAULT_SIGBUS;
+
+       offset = vmf->pgoff << PAGE_SHIFT;
+       if (offset >= roundup(plm->len, PAGE_SIZE))
+               return VM_FAULT_SIGBUS;
+
+       page = vmalloc_to_page(plm->data + offset);
+       get_page(page);
+
+       vmf->page = page;
+
+       return 0;
+}
+
+static struct vm_operations_struct sel_mmap_policy_ops = {
+       .fault = sel_mmap_policy_fault,
+       .page_mkwrite = sel_mmap_policy_fault,
+};
+
+int sel_mmap_policy(struct file *filp, struct vm_area_struct *vma)
+{
+       if (vma->vm_flags & VM_SHARED) {
+               /* do not allow mprotect to make mapping writable */
+               vma->vm_flags &= ~VM_MAYWRITE;
+
+               if (vma->vm_flags & VM_WRITE)
+                       return -EACCES;
+       }
+
+       vma->vm_flags |= VM_RESERVED;
+       vma->vm_ops = &sel_mmap_policy_ops;
+
+       return 0;
+}
+
+static const struct file_operations sel_policy_ops = {
+       .open           = sel_open_policy,
+       .read           = sel_read_policy,
+       .mmap           = sel_mmap_policy,
+       .release        = sel_release_policy,
+};
+
  static ssize_t sel_write_load(struct file *file, const char __user *buf,
                               size_t count, loff_t *ppos)
  
@@ -1612,6 +1805,8 @@ static int sel_fill_super(struct super_block *sb, void *data, int silent)
                 [SEL_CHECKREQPROT] = {"checkreqprot", &sel_checkreqprot_ops, S_IRUGO|S_IWUSR},
                 [SEL_REJECT_UNKNOWN] = {"reject_unknown", &sel_handle_unknown_ops, S_IRUGO},
                 [SEL_DENY_UNKNOWN] = {"deny_unknown", &sel_handle_unknown_ops, S_IRUGO},
+               [SEL_STATUS] = {"status", &sel_handle_status_ops, S_IRUGO},
+               [SEL_POLICY] = {"policy", &sel_policy_ops, S_IRUSR},
                 /* last one */ {""}
         };
         ret = simple_fill_super(sb, SELINUX_MAGIC, selinux_files);
diff --git a/security/selinux/ss/Makefile b/security/selinux/ss/Makefile

deleted file mode 100644 (file)

index 15d4e62..0000000
--- a/security/selinux/ss/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-#
-# Makefile for building the SELinux security server as part of the kernel tree.
-#
-
-EXTRA_CFLAGS += -Isecurity/selinux -Isecurity/selinux/include
-obj-y := ss.o
-
-ss-y := ebitmap.o hashtab.o symtab.o sidtab.o avtab.o policydb.o services.o conditional.o mls.o
-
diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c

index 929480c6c4306e874eff82db107045b944cdd33b..a3dd9faa19c01eda269b13f7cfcd7ab6da6aa098 100644 (file)
--- a/security/selinux/ss/avtab.c
+++ b/security/selinux/ss/avtab.c
@@ -266,8 +266,8 @@ int avtab_alloc(struct avtab *h, u32 nrules)
         if (shift > 2)
                 shift = shift - 2;
         nslot = 1 << shift;
-       if (nslot > MAX_AVTAB_SIZE)
-               nslot = MAX_AVTAB_SIZE;
+       if (nslot > MAX_AVTAB_HASH_BUCKETS)
+               nslot = MAX_AVTAB_HASH_BUCKETS;
         mask = nslot - 1;
  
         h->htable = kcalloc(nslot, sizeof(*(h->htable)), GFP_KERNEL);
@@ -501,6 +501,48 @@ bad:
         goto out;
  }
  
+int avtab_write_item(struct policydb *p, struct avtab_node *cur, void *fp)
+{
+       __le16 buf16[4];
+       __le32 buf32[1];
+       int rc;
+
+       buf16[0] = cpu_to_le16(cur->key.source_type);
+       buf16[1] = cpu_to_le16(cur->key.target_type);
+       buf16[2] = cpu_to_le16(cur->key.target_class);
+       buf16[3] = cpu_to_le16(cur->key.specified);
+       rc = put_entry(buf16, sizeof(u16), 4, fp);
+       if (rc)
+               return rc;
+       buf32[0] = cpu_to_le32(cur->datum.data);
+       rc = put_entry(buf32, sizeof(u32), 1, fp);
+       if (rc)
+               return rc;
+       return 0;
+}
+
+int avtab_write(struct policydb *p, struct avtab *a, void *fp)
+{
+       unsigned int i;
+       int rc = 0;
+       struct avtab_node *cur;
+       __le32 buf[1];
+
+       buf[0] = cpu_to_le32(a->nel);
+       rc = put_entry(buf, sizeof(u32), 1, fp);
+       if (rc)
+               return rc;
+
+       for (i = 0; i < a->nslot; i++) {
+               for (cur = a->htable[i]; cur; cur = cur->next) {
+                       rc = avtab_write_item(p, cur, fp);
+                       if (rc)
+                               return rc;
+               }
+       }
+
+       return rc;
+}
  void avtab_cache_init(void)
  {
         avtab_node_cachep = kmem_cache_create("avtab_node",
diff --git a/security/selinux/ss/avtab.h b/security/selinux/ss/avtab.h

index cd4f734e27499cf2f9e203f0edb2557cf02bc24c..dff0c75345c1642fd2b3181a6677035947834951 100644 (file)
--- a/security/selinux/ss/avtab.h
+++ b/security/selinux/ss/avtab.h
@@ -71,6 +71,8 @@ int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol,
                     void *p);
  
  int avtab_read(struct avtab *a, void *fp, struct policydb *pol);
+int avtab_write_item(struct policydb *p, struct avtab_node *cur, void *fp);
+int avtab_write(struct policydb *p, struct avtab *a, void *fp);
  
  struct avtab_node *avtab_insert_nonunique(struct avtab *h, struct avtab_key *key,
                                           struct avtab_datum *datum);
@@ -85,7 +87,6 @@ void avtab_cache_destroy(void);
  #define MAX_AVTAB_HASH_BITS 11
  #define MAX_AVTAB_HASH_BUCKETS (1 << MAX_AVTAB_HASH_BITS)
  #define MAX_AVTAB_HASH_MASK (MAX_AVTAB_HASH_BUCKETS-1)
-#define MAX_AVTAB_SIZE MAX_AVTAB_HASH_BUCKETS
  
  #endif /* _SS_AVTAB_H_ */
  
diff --git a/security/selinux/ss/conditional.c b/security/selinux/ss/conditional.c

index c91e150c3087d78127eb8ca6cc6faccf04e9cd32..655fe1c6cc69dccd862b3142a37e41b6fb4010a7 100644 (file)
--- a/security/selinux/ss/conditional.c
+++ b/security/selinux/ss/conditional.c
@@ -490,6 +490,129 @@ err:
         return rc;
  }
  
+int cond_write_bool(void *vkey, void *datum, void *ptr)
+{
+       char *key = vkey;
+       struct cond_bool_datum *booldatum = datum;
+       struct policy_data *pd = ptr;
+       void *fp = pd->fp;
+       __le32 buf[3];
+       u32 len;
+       int rc;
+
+       len = strlen(key);
+       buf[0] = cpu_to_le32(booldatum->value);
+       buf[1] = cpu_to_le32(booldatum->state);
+       buf[2] = cpu_to_le32(len);
+       rc = put_entry(buf, sizeof(u32), 3, fp);
+       if (rc)
+               return rc;
+       rc = put_entry(key, 1, len, fp);
+       if (rc)
+               return rc;
+       return 0;
+}
+
+/*
+ * cond_write_cond_av_list doesn't write out the av_list nodes.
+ * Instead it writes out the key/value pairs from the avtab. This
+ * is necessary because there is no way to uniquely identifying rules
+ * in the avtab so it is not possible to associate individual rules
+ * in the avtab with a conditional without saving them as part of
+ * the conditional. This means that the avtab with the conditional
+ * rules will not be saved but will be rebuilt on policy load.
+ */
+static int cond_write_av_list(struct policydb *p,
+                             struct cond_av_list *list, struct policy_file *fp)
+{
+       __le32 buf[1];
+       struct cond_av_list *cur_list;
+       u32 len;
+       int rc;
+
+       len = 0;
+       for (cur_list = list; cur_list != NULL; cur_list = cur_list->next)
+               len++;
+
+       buf[0] = cpu_to_le32(len);
+       rc = put_entry(buf, sizeof(u32), 1, fp);
+       if (rc)
+               return rc;
+
+       if (len == 0)
+               return 0;
+
+       for (cur_list = list; cur_list != NULL; cur_list = cur_list->next) {
+               rc = avtab_write_item(p, cur_list->node, fp);
+               if (rc)
+                       return rc;
+       }
+
+       return 0;
+}
+
+int cond_write_node(struct policydb *p, struct cond_node *node,
+                   struct policy_file *fp)
+{
+       struct cond_expr *cur_expr;
+       __le32 buf[2];
+       int rc;
+       u32 len = 0;
+
+       buf[0] = cpu_to_le32(node->cur_state);
+       rc = put_entry(buf, sizeof(u32), 1, fp);
+       if (rc)
+               return rc;
+
+       for (cur_expr = node->expr; cur_expr != NULL; cur_expr = cur_expr->next)
+               len++;
+
+       buf[0] = cpu_to_le32(len);
+       rc = put_entry(buf, sizeof(u32), 1, fp);
+       if (rc)
+               return rc;
+
+       for (cur_expr = node->expr; cur_expr != NULL; cur_expr = cur_expr->next) {
+               buf[0] = cpu_to_le32(cur_expr->expr_type);
+               buf[1] = cpu_to_le32(cur_expr->bool);
+               rc = put_entry(buf, sizeof(u32), 2, fp);
+               if (rc)
+                       return rc;
+       }
+
+       rc = cond_write_av_list(p, node->true_list, fp);
+       if (rc)
+               return rc;
+       rc = cond_write_av_list(p, node->false_list, fp);
+       if (rc)
+               return rc;
+
+       return 0;
+}
+
+int cond_write_list(struct policydb *p, struct cond_node *list, void *fp)
+{
+       struct cond_node *cur;
+       u32 len;
+       __le32 buf[1];
+       int rc;
+
+       len = 0;
+       for (cur = list; cur != NULL; cur = cur->next)
+               len++;
+       buf[0] = cpu_to_le32(len);
+       rc = put_entry(buf, sizeof(u32), 1, fp);
+       if (rc)
+               return rc;
+
+       for (cur = list; cur != NULL; cur = cur->next) {
+               rc = cond_write_node(p, cur, fp);
+               if (rc)
+                       return rc;
+       }
+
+       return 0;
+}
  /* Determine whether additional permissions are granted by the conditional
   * av table, and if so, add them to the result
   */
diff --git a/security/selinux/ss/conditional.h b/security/selinux/ss/conditional.h

index 53ddb013ae573f8bb053da9fa9416fbcf5ee9701..3f209c635295681f026874e332573577e743dea4 100644 (file)
--- a/security/selinux/ss/conditional.h
+++ b/security/selinux/ss/conditional.h
@@ -69,6 +69,8 @@ int cond_index_bool(void *key, void *datum, void *datap);
  
  int cond_read_bool(struct policydb *p, struct hashtab *h, void *fp);
  int cond_read_list(struct policydb *p, void *fp);
+int cond_write_bool(void *key, void *datum, void *ptr);
+int cond_write_list(struct policydb *p, struct cond_node *list, void *fp);
  
  void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decision *avd);
  
diff --git a/security/selinux/ss/ebitmap.c b/security/selinux/ss/ebitmap.c

index 04b6145d767f96093423d5f733e6fe1b6a4e5687..d42951fcbe877355b08c16d5a63526c729f4355c 100644 (file)
--- a/security/selinux/ss/ebitmap.c
+++ b/security/selinux/ss/ebitmap.c
@@ -22,6 +22,8 @@
  #include "ebitmap.h"
  #include "policydb.h"
  
+#define BITS_PER_U64   (sizeof(u64) * 8)
+
  int ebitmap_cmp(struct ebitmap *e1, struct ebitmap *e2)
  {
         struct ebitmap_node *n1, *n2;
@@ -363,10 +365,10 @@ int ebitmap_read(struct ebitmap *e, void *fp)
         e->highbit = le32_to_cpu(buf[1]);
         count = le32_to_cpu(buf[2]);
  
-       if (mapunit != sizeof(u64) * 8) {
+       if (mapunit != BITS_PER_U64) {
                 printk(KERN_ERR "SELinux: ebitmap: map size %u does not "
                        "match my size %Zd (high bit was %d)\n",
-                      mapunit, sizeof(u64) * 8, e->highbit);
+                      mapunit, BITS_PER_U64, e->highbit);
                 goto bad;
         }
  
@@ -446,3 +448,78 @@ bad:
         ebitmap_destroy(e);
         goto out;
  }
+
+int ebitmap_write(struct ebitmap *e, void *fp)
+{
+       struct ebitmap_node *n;
+       u32 count;
+       __le32 buf[3];
+       u64 map;
+       int bit, last_bit, last_startbit, rc;
+
+       buf[0] = cpu_to_le32(BITS_PER_U64);
+
+       count = 0;
+       last_bit = 0;
+       last_startbit = -1;
+       ebitmap_for_each_positive_bit(e, n, bit) {
+               if (rounddown(bit, (int)BITS_PER_U64) > last_startbit) {
+                       count++;
+                       last_startbit = rounddown(bit, BITS_PER_U64);
+               }
+               last_bit = roundup(bit + 1, BITS_PER_U64);
+       }
+       buf[1] = cpu_to_le32(last_bit);
+       buf[2] = cpu_to_le32(count);
+
+       rc = put_entry(buf, sizeof(u32), 3, fp);
+       if (rc)
+               return rc;
+
+       map = 0;
+       last_startbit = INT_MIN;
+       ebitmap_for_each_positive_bit(e, n, bit) {
+               if (rounddown(bit, (int)BITS_PER_U64) > last_startbit) {
+                       __le64 buf64[1];
+
+                       /* this is the very first bit */
+                       if (!map) {
+                               last_startbit = rounddown(bit, BITS_PER_U64);
+                               map = (u64)1 << (bit - last_startbit);
+                               continue;
+                       }
+
+                       /* write the last node */
+                       buf[0] = cpu_to_le32(last_startbit);
+                       rc = put_entry(buf, sizeof(u32), 1, fp);
+                       if (rc)
+                               return rc;
+
+                       buf64[0] = cpu_to_le64(map);
+                       rc = put_entry(buf64, sizeof(u64), 1, fp);
+                       if (rc)
+                               return rc;
+
+                       /* set up for the next node */
+                       map = 0;
+                       last_startbit = rounddown(bit, BITS_PER_U64);
+               }
+               map |= (u64)1 << (bit - last_startbit);
+       }
+       /* write the last node */
+       if (map) {
+               __le64 buf64[1];
+
+               /* write the last node */
+               buf[0] = cpu_to_le32(last_startbit);
+               rc = put_entry(buf, sizeof(u32), 1, fp);
+               if (rc)
+                       return rc;
+
+               buf64[0] = cpu_to_le64(map);
+               rc = put_entry(buf64, sizeof(u64), 1, fp);
+               if (rc)
+                       return rc;
+       }
+       return 0;
+}
diff --git a/security/selinux/ss/ebitmap.h b/security/selinux/ss/ebitmap.h

index f283b4367f54d640e6a3b6fedb8f3180786883c3..1f4e93c2ae8695430c4a9ff33cfd31e7d36cec36 100644 (file)
--- a/security/selinux/ss/ebitmap.h
+++ b/security/selinux/ss/ebitmap.h
@@ -123,6 +123,7 @@ int ebitmap_get_bit(struct ebitmap *e, unsigned long bit);
  int ebitmap_set_bit(struct ebitmap *e, unsigned long bit, int value);
  void ebitmap_destroy(struct ebitmap *e);
  int ebitmap_read(struct ebitmap *e, void *fp);
+int ebitmap_write(struct ebitmap *e, void *fp);
  
  #ifdef CONFIG_NETLABEL
  int ebitmap_netlbl_export(struct ebitmap *ebmap,
diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c

index 3a29704be8ce10f4409dd0a3d4f0bea8bb0d1086..94f630d93a5c5d0964a51e030646d26652619ee6 100644 (file)
--- a/security/selinux/ss/policydb.c
+++ b/security/selinux/ss/policydb.c
@@ -37,6 +37,7 @@
  #include "policydb.h"
  #include "conditional.h"
  #include "mls.h"
+#include "services.h"
  
  #define _DEBUG_HASHES
  
@@ -185,9 +186,19 @@ static u32 rangetr_hash(struct hashtab *h, const void *k)
  static int rangetr_cmp(struct hashtab *h, const void *k1, const void *k2)
  {
         const struct range_trans *key1 = k1, *key2 = k2;
-       return (key1->source_type != key2->source_type ||
-               key1->target_type != key2->target_type ||
-               key1->target_class != key2->target_class);
+       int v;
+
+       v = key1->source_type - key2->source_type;
+       if (v)
+               return v;
+
+       v = key1->target_type - key2->target_type;
+       if (v)
+               return v;
+
+       v = key1->target_class - key2->target_class;
+
+       return v;
  }
  
  /*
@@ -1624,11 +1635,11 @@ static int role_bounds_sanity_check(void *key, void *datum, void *datap)
  
  static int type_bounds_sanity_check(void *key, void *datum, void *datap)
  {
-       struct type_datum *upper, *type;
+       struct type_datum *upper;
         struct policydb *p = datap;
         int depth = 0;
  
-       upper = type = datum;
+       upper = datum;
         while (upper->bounds) {
                 if (++depth == POLICYDB_BOUNDS_MAXDEPTH) {
                         printk(KERN_ERR "SELinux: type %s: "
@@ -2306,3 +2317,843 @@ bad:
         policydb_destroy(p);
         goto out;
  }
+
+/*
+ * Write a MLS level structure to a policydb binary
+ * representation file.
+ */
+static int mls_write_level(struct mls_level *l, void *fp)
+{
+       __le32 buf[1];
+       int rc;
+
+       buf[0] = cpu_to_le32(l->sens);
+       rc = put_entry(buf, sizeof(u32), 1, fp);
+       if (rc)
+               return rc;
+
+       rc = ebitmap_write(&l->cat, fp);
+       if (rc)
+               return rc;
+
+       return 0;
+}
+
+/*
+ * Write a MLS range structure to a policydb binary
+ * representation file.
+ */
+static int mls_write_range_helper(struct mls_range *r, void *fp)
+{
+       __le32 buf[3];
+       size_t items;
+       int rc, eq;
+
+       eq = mls_level_eq(&r->level[1], &r->level[0]);
+
+       if (eq)
+               items = 2;
+       else
+               items = 3;
+       buf[0] = cpu_to_le32(items-1);
+       buf[1] = cpu_to_le32(r->level[0].sens);
+       if (!eq)
+               buf[2] = cpu_to_le32(r->level[1].sens);
+
+       BUG_ON(items > (sizeof(buf)/sizeof(buf[0])));
+
+       rc = put_entry(buf, sizeof(u32), items, fp);
+       if (rc)
+               return rc;
+
+       rc = ebitmap_write(&r->level[0].cat, fp);
+       if (rc)
+               return rc;
+       if (!eq) {
+               rc = ebitmap_write(&r->level[1].cat, fp);
+               if (rc)
+                       return rc;
+       }
+
+       return 0;
+}
+
+static int sens_write(void *vkey, void *datum, void *ptr)
+{
+       char *key = vkey;
+       struct level_datum *levdatum = datum;
+       struct policy_data *pd = ptr;
+       void *fp = pd->fp;
+       __le32 buf[2];
+       size_t len;
+       int rc;
+
+       len = strlen(key);
+       buf[0] = cpu_to_le32(len);
+       buf[1] = cpu_to_le32(levdatum->isalias);
+       rc = put_entry(buf, sizeof(u32), 2, fp);
+       if (rc)
+               return rc;
+
+       rc = put_entry(key, 1, len, fp);
+       if (rc)
+               return rc;
+
+       rc = mls_write_level(levdatum->level, fp);
+       if (rc)
+               return rc;
+
+       return 0;
+}
+
+static int cat_write(void *vkey, void *datum, void *ptr)
+{
+       char *key = vkey;
+       struct cat_datum *catdatum = datum;
+       struct policy_data *pd = ptr;
+       void *fp = pd->fp;
+       __le32 buf[3];
+       size_t len;
+       int rc;
+
+       len = strlen(key);
+       buf[0] = cpu_to_le32(len);
+       buf[1] = cpu_to_le32(catdatum->value);
+       buf[2] = cpu_to_le32(catdatum->isalias);
+       rc = put_entry(buf, sizeof(u32), 3, fp);
+       if (rc)
+               return rc;
+
+       rc = put_entry(key, 1, len, fp);
+       if (rc)
+               return rc;
+
+       return 0;
+}
+
+static int role_trans_write(struct role_trans *r, void *fp)
+{
+       struct role_trans *tr;
+       u32 buf[3];
+       size_t nel;
+       int rc;
+
+       nel = 0;
+       for (tr = r; tr; tr = tr->next)
+               nel++;
+       buf[0] = cpu_to_le32(nel);
+       rc = put_entry(buf, sizeof(u32), 1, fp);
+       if (rc)
+               return rc;
+       for (tr = r; tr; tr = tr->next) {
+               buf[0] = cpu_to_le32(tr->role);
+               buf[1] = cpu_to_le32(tr->type);
+               buf[2] = cpu_to_le32(tr->new_role);
+               rc = put_entry(buf, sizeof(u32), 3, fp);
+               if (rc)
+                       return rc;
+       }
+
+       return 0;
+}
+
+static int role_allow_write(struct role_allow *r, void *fp)
+{
+       struct role_allow *ra;
+       u32 buf[2];
+       size_t nel;
+       int rc;
+
+       nel = 0;
+       for (ra = r; ra; ra = ra->next)
+               nel++;
+       buf[0] = cpu_to_le32(nel);
+       rc = put_entry(buf, sizeof(u32), 1, fp);
+       if (rc)
+               return rc;
+       for (ra = r; ra; ra = ra->next) {
+               buf[0] = cpu_to_le32(ra->role);
+               buf[1] = cpu_to_le32(ra->new_role);
+               rc = put_entry(buf, sizeof(u32), 2, fp);
+               if (rc)
+                       return rc;
+       }
+       return 0;
+}
+
+/*
+ * Write a security context structure
+ * to a policydb binary representation file.
+ */
+static int context_write(struct policydb *p, struct context *c,
+                        void *fp)
+{
+       int rc;
+       __le32 buf[3];
+
+       buf[0] = cpu_to_le32(c->user);
+       buf[1] = cpu_to_le32(c->role);
+       buf[2] = cpu_to_le32(c->type);
+
+       rc = put_entry(buf, sizeof(u32), 3, fp);
+       if (rc)
+               return rc;
+
+       rc = mls_write_range_helper(&c->range, fp);
+       if (rc)
+               return rc;
+
+       return 0;
+}
+
+/*
+ * The following *_write functions are used to
+ * write the symbol data to a policy database
+ * binary representation file.
+ */
+
+static int perm_write(void *vkey, void *datum, void *fp)
+{
+       char *key = vkey;
+       struct perm_datum *perdatum = datum;
+       __le32 buf[2];
+       size_t len;
+       int rc;
+
+       len = strlen(key);
+       buf[0] = cpu_to_le32(len);
+       buf[1] = cpu_to_le32(perdatum->value);
+       rc = put_entry(buf, sizeof(u32), 2, fp);
+       if (rc)
+               return rc;
+
+       rc = put_entry(key, 1, len, fp);
+       if (rc)
+               return rc;
+
+       return 0;
+}
+
+static int common_write(void *vkey, void *datum, void *ptr)
+{
+       char *key = vkey;
+       struct common_datum *comdatum = datum;
+       struct policy_data *pd = ptr;
+       void *fp = pd->fp;
+       __le32 buf[4];
+       size_t len;
+       int rc;
+
+       len = strlen(key);
+       buf[0] = cpu_to_le32(len);
+       buf[1] = cpu_to_le32(comdatum->value);
+       buf[2] = cpu_to_le32(comdatum->permissions.nprim);
+       buf[3] = cpu_to_le32(comdatum->permissions.table->nel);
+       rc = put_entry(buf, sizeof(u32), 4, fp);
+       if (rc)
+               return rc;
+
+       rc = put_entry(key, 1, len, fp);
+       if (rc)
+               return rc;
+
+       rc = hashtab_map(comdatum->permissions.table, perm_write, fp);
+       if (rc)
+               return rc;
+
+       return 0;
+}
+
+static int write_cons_helper(struct policydb *p, struct constraint_node *node,
+                            void *fp)
+{
+       struct constraint_node *c;
+       struct constraint_expr *e;
+       __le32 buf[3];
+       u32 nel;
+       int rc;
+
+       for (c = node; c; c = c->next) {
+               nel = 0;
+               for (e = c->expr; e; e = e->next)
+                       nel++;
+               buf[0] = cpu_to_le32(c->permissions);
+               buf[1] = cpu_to_le32(nel);
+               rc = put_entry(buf, sizeof(u32), 2, fp);
+               if (rc)
+                       return rc;
+               for (e = c->expr; e; e = e->next) {
+                       buf[0] = cpu_to_le32(e->expr_type);
+                       buf[1] = cpu_to_le32(e->attr);
+                       buf[2] = cpu_to_le32(e->op);
+                       rc = put_entry(buf, sizeof(u32), 3, fp);
+                       if (rc)
+                               return rc;
+
+                       switch (e->expr_type) {
+                       case CEXPR_NAMES:
+                               rc = ebitmap_write(&e->names, fp);
+                               if (rc)
+                                       return rc;
+                               break;
+                       default:
+                               break;
+                       }
+               }
+       }
+
+       return 0;
+}
+
+static int class_write(void *vkey, void *datum, void *ptr)
+{
+       char *key = vkey;
+       struct class_datum *cladatum = datum;
+       struct policy_data *pd = ptr;
+       void *fp = pd->fp;
+       struct policydb *p = pd->p;
+       struct constraint_node *c;
+       __le32 buf[6];
+       u32 ncons;
+       size_t len, len2;
+       int rc;
+
+       len = strlen(key);
+       if (cladatum->comkey)
+               len2 = strlen(cladatum->comkey);
+       else
+               len2 = 0;
+
+       ncons = 0;
+       for (c = cladatum->constraints; c; c = c->next)
+               ncons++;
+
+       buf[0] = cpu_to_le32(len);
+       buf[1] = cpu_to_le32(len2);
+       buf[2] = cpu_to_le32(cladatum->value);
+       buf[3] = cpu_to_le32(cladatum->permissions.nprim);
+       if (cladatum->permissions.table)
+               buf[4] = cpu_to_le32(cladatum->permissions.table->nel);
+       else
+               buf[4] = 0;
+       buf[5] = cpu_to_le32(ncons);
+       rc = put_entry(buf, sizeof(u32), 6, fp);
+       if (rc)
+               return rc;
+
+       rc = put_entry(key, 1, len, fp);
+       if (rc)
+               return rc;
+
+       if (cladatum->comkey) {
+               rc = put_entry(cladatum->comkey, 1, len2, fp);
+               if (rc)
+                       return rc;
+       }
+
+       rc = hashtab_map(cladatum->permissions.table, perm_write, fp);
+       if (rc)
+               return rc;
+
+       rc = write_cons_helper(p, cladatum->constraints, fp);
+       if (rc)
+               return rc;
+
+       /* write out the validatetrans rule */
+       ncons = 0;
+       for (c = cladatum->validatetrans; c; c = c->next)
+               ncons++;
+
+       buf[0] = cpu_to_le32(ncons);
+       rc = put_entry(buf, sizeof(u32), 1, fp);
+       if (rc)
+               return rc;
+
+       rc = write_cons_helper(p, cladatum->validatetrans, fp);
+       if (rc)
+               return rc;
+
+       return 0;
+}
+
+static int role_write(void *vkey, void *datum, void *ptr)
+{
+       char *key = vkey;
+       struct role_datum *role = datum;
+       struct policy_data *pd = ptr;
+       void *fp = pd->fp;
+       struct policydb *p = pd->p;
+       __le32 buf[3];
+       size_t items, len;
+       int rc;
+
+       len = strlen(key);
+       items = 0;
+       buf[items++] = cpu_to_le32(len);
+       buf[items++] = cpu_to_le32(role->value);
+       if (p->policyvers >= POLICYDB_VERSION_BOUNDARY)
+               buf[items++] = cpu_to_le32(role->bounds);
+
+       BUG_ON(items > (sizeof(buf)/sizeof(buf[0])));
+
+       rc = put_entry(buf, sizeof(u32), items, fp);
+       if (rc)
+               return rc;
+
+       rc = put_entry(key, 1, len, fp);
+       if (rc)
+               return rc;
+
+       rc = ebitmap_write(&role->dominates, fp);
+       if (rc)
+               return rc;
+
+       rc = ebitmap_write(&role->types, fp);
+       if (rc)
+               return rc;
+
+       return 0;
+}
+
+static int type_write(void *vkey, void *datum, void *ptr)
+{
+       char *key = vkey;
+       struct type_datum *typdatum = datum;
+       struct policy_data *pd = ptr;
+       struct policydb *p = pd->p;
+       void *fp = pd->fp;
+       __le32 buf[4];
+       int rc;
+       size_t items, len;
+
+       len = strlen(key);
+       items = 0;
+       buf[items++] = cpu_to_le32(len);
+       buf[items++] = cpu_to_le32(typdatum->value);
+       if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) {
+               u32 properties = 0;
+
+               if (typdatum->primary)
+                       properties |= TYPEDATUM_PROPERTY_PRIMARY;
+
+               if (typdatum->attribute)
+                       properties |= TYPEDATUM_PROPERTY_ATTRIBUTE;
+
+               buf[items++] = cpu_to_le32(properties);
+               buf[items++] = cpu_to_le32(typdatum->bounds);
+       } else {
+               buf[items++] = cpu_to_le32(typdatum->primary);
+       }
+       BUG_ON(items > (sizeof(buf) / sizeof(buf[0])));
+       rc = put_entry(buf, sizeof(u32), items, fp);
+       if (rc)
+               return rc;
+
+       rc = put_entry(key, 1, len, fp);
+       if (rc)
+               return rc;
+
+       return 0;
+}
+
+static int user_write(void *vkey, void *datum, void *ptr)
+{
+       char *key = vkey;
+       struct user_datum *usrdatum = datum;
+       struct policy_data *pd = ptr;
+       struct policydb *p = pd->p;
+       void *fp = pd->fp;
+       __le32 buf[3];
+       size_t items, len;
+       int rc;
+
+       len = strlen(key);
+       items = 0;
+       buf[items++] = cpu_to_le32(len);
+       buf[items++] = cpu_to_le32(usrdatum->value);
+       if (p->policyvers >= POLICYDB_VERSION_BOUNDARY)
+               buf[items++] = cpu_to_le32(usrdatum->bounds);
+       BUG_ON(items > (sizeof(buf) / sizeof(buf[0])));
+       rc = put_entry(buf, sizeof(u32), items, fp);
+       if (rc)
+               return rc;
+
+       rc = put_entry(key, 1, len, fp);
+       if (rc)
+               return rc;
+
+       rc = ebitmap_write(&usrdatum->roles, fp);
+       if (rc)
+               return rc;
+
+       rc = mls_write_range_helper(&usrdatum->range, fp);
+       if (rc)
+               return rc;
+
+       rc = mls_write_level(&usrdatum->dfltlevel, fp);
+       if (rc)
+               return rc;
+
+       return 0;
+}
+
+static int (*write_f[SYM_NUM]) (void *key, void *datum,
+                               void *datap) =
+{
+       common_write,
+       class_write,
+       role_write,
+       type_write,
+       user_write,
+       cond_write_bool,
+       sens_write,
+       cat_write,
+};
+
+static int ocontext_write(struct policydb *p, struct policydb_compat_info *info,
+                         void *fp)
+{
+       unsigned int i, j, rc;
+       size_t nel, len;
+       __le32 buf[3];
+       u32 nodebuf[8];
+       struct ocontext *c;
+       for (i = 0; i < info->ocon_num; i++) {
+               nel = 0;
+               for (c = p->ocontexts[i]; c; c = c->next)
+                       nel++;
+               buf[0] = cpu_to_le32(nel);
+               rc = put_entry(buf, sizeof(u32), 1, fp);
+               if (rc)
+                       return rc;
+               for (c = p->ocontexts[i]; c; c = c->next) {
+                       switch (i) {
+                       case OCON_ISID:
+                               buf[0] = cpu_to_le32(c->sid[0]);
+                               rc = put_entry(buf, sizeof(u32), 1, fp);
+                               if (rc)
+                                       return rc;
+                               rc = context_write(p, &c->context[0], fp);
+                               if (rc)
+                                       return rc;
+                               break;
+                       case OCON_FS:
+                       case OCON_NETIF:
+                               len = strlen(c->u.name);
+                               buf[0] = cpu_to_le32(len);
+                               rc = put_entry(buf, sizeof(u32), 1, fp);
+                               if (rc)
+                                       return rc;
+                               rc = put_entry(c->u.name, 1, len, fp);
+                               if (rc)
+                                       return rc;
+                               rc = context_write(p, &c->context[0], fp);
+                               if (rc)
+                                       return rc;
+                               rc = context_write(p, &c->context[1], fp);
+                               if (rc)
+                                       return rc;
+                               break;
+                       case OCON_PORT:
+                               buf[0] = cpu_to_le32(c->u.port.protocol);
+                               buf[1] = cpu_to_le32(c->u.port.low_port);
+                               buf[2] = cpu_to_le32(c->u.port.high_port);
+                               rc = put_entry(buf, sizeof(u32), 3, fp);
+                               if (rc)
+                                       return rc;
+                               rc = context_write(p, &c->context[0], fp);
+                               if (rc)
+                                       return rc;
+                               break;
+                       case OCON_NODE:
+                               nodebuf[0] = c->u.node.addr; /* network order */
+                               nodebuf[1] = c->u.node.mask; /* network order */
+                               rc = put_entry(nodebuf, sizeof(u32), 2, fp);
+                               if (rc)
+                                       return rc;
+                               rc = context_write(p, &c->context[0], fp);
+                               if (rc)
+                                       return rc;
+                               break;
+                       case OCON_FSUSE:
+                               buf[0] = cpu_to_le32(c->v.behavior);
+                               len = strlen(c->u.name);
+                               buf[1] = cpu_to_le32(len);
+                               rc = put_entry(buf, sizeof(u32), 2, fp);
+                               if (rc)
+                                       return rc;
+                               rc = put_entry(c->u.name, 1, len, fp);
+                               if (rc)
+                                       return rc;
+                               rc = context_write(p, &c->context[0], fp);
+                               if (rc)
+                                       return rc;
+                               break;
+                       case OCON_NODE6:
+                               for (j = 0; j < 4; j++)
+                                       nodebuf[j] = c->u.node6.addr[j]; /* network order */
+                               for (j = 0; j < 4; j++)
+                                       nodebuf[j + 4] = c->u.node6.mask[j]; /* network order */
+                               rc = put_entry(nodebuf, sizeof(u32), 8, fp);
+                               if (rc)
+                                       return rc;
+                               rc = context_write(p, &c->context[0], fp);
+                               if (rc)
+                                       return rc;
+                               break;
+                       }
+               }
+       }
+       return 0;
+}
+
+static int genfs_write(struct policydb *p, void *fp)
+{
+       struct genfs *genfs;
+       struct ocontext *c;
+       size_t len;
+       __le32 buf[1];
+       int rc;
+
+       len = 0;
+       for (genfs = p->genfs; genfs; genfs = genfs->next)
+               len++;
+       buf[0] = cpu_to_le32(len);
+       rc = put_entry(buf, sizeof(u32), 1, fp);
+       if (rc)
+               return rc;
+       for (genfs = p->genfs; genfs; genfs = genfs->next) {
+               len = strlen(genfs->fstype);
+               buf[0] = cpu_to_le32(len);
+               rc = put_entry(buf, sizeof(u32), 1, fp);
+               if (rc)
+                       return rc;
+               rc = put_entry(genfs->fstype, 1, len, fp);
+               if (rc)
+                       return rc;
+               len = 0;
+               for (c = genfs->head; c; c = c->next)
+                       len++;
+               buf[0] = cpu_to_le32(len);
+               rc = put_entry(buf, sizeof(u32), 1, fp);
+               if (rc)
+                       return rc;
+               for (c = genfs->head; c; c = c->next) {
+                       len = strlen(c->u.name);
+                       buf[0] = cpu_to_le32(len);
+                       rc = put_entry(buf, sizeof(u32), 1, fp);
+                       if (rc)
+                               return rc;
+                       rc = put_entry(c->u.name, 1, len, fp);
+                       if (rc)
+                               return rc;
+                       buf[0] = cpu_to_le32(c->v.sclass);
+                       rc = put_entry(buf, sizeof(u32), 1, fp);
+                       if (rc)
+                               return rc;
+                       rc = context_write(p, &c->context[0], fp);
+                       if (rc)
+                               return rc;
+               }
+       }
+       return 0;
+}
+
+static int range_count(void *key, void *data, void *ptr)
+{
+       int *cnt = ptr;
+       *cnt = *cnt + 1;
+
+       return 0;
+}
+
+static int range_write_helper(void *key, void *data, void *ptr)
+{
+       __le32 buf[2];
+       struct range_trans *rt = key;
+       struct mls_range *r = data;
+       struct policy_data *pd = ptr;
+       void *fp = pd->fp;
+       struct policydb *p = pd->p;
+       int rc;
+
+       buf[0] = cpu_to_le32(rt->source_type);
+       buf[1] = cpu_to_le32(rt->target_type);
+       rc = put_entry(buf, sizeof(u32), 2, fp);
+       if (rc)
+               return rc;
+       if (p->policyvers >= POLICYDB_VERSION_RANGETRANS) {
+               buf[0] = cpu_to_le32(rt->target_class);
+               rc = put_entry(buf, sizeof(u32), 1, fp);
+               if (rc)
+                       return rc;
+       }
+       rc = mls_write_range_helper(r, fp);
+       if (rc)
+               return rc;
+
+       return 0;
+}
+
+static int range_write(struct policydb *p, void *fp)
+{
+       size_t nel;
+       __le32 buf[1];
+       int rc;
+       struct policy_data pd;
+
+       pd.p = p;
+       pd.fp = fp;
+
+       /* count the number of entries in the hashtab */
+       nel = 0;
+       rc = hashtab_map(p->range_tr, range_count, &nel);
+       if (rc)
+               return rc;
+
+       buf[0] = cpu_to_le32(nel);
+       rc = put_entry(buf, sizeof(u32), 1, fp);
+       if (rc)
+               return rc;
+
+       /* actually write all of the entries */
+       rc = hashtab_map(p->range_tr, range_write_helper, &pd);
+       if (rc)
+               return rc;
+
+       return 0;
+}
+
+/*
+ * Write the configuration data in a policy database
+ * structure to a policy database binary representation
+ * file.
+ */
+int policydb_write(struct policydb *p, void *fp)
+{
+       unsigned int i, num_syms;
+       int rc;
+       __le32 buf[4];
+       u32 config;
+       size_t len;
+       struct policydb_compat_info *info;
+
+       /*
+        * refuse to write policy older than compressed avtab
+        * to simplify the writer.  There are other tests dropped
+        * since we assume this throughout the writer code.  Be
+        * careful if you ever try to remove this restriction
+        */
+       if (p->policyvers < POLICYDB_VERSION_AVTAB) {
+               printk(KERN_ERR "SELinux: refusing to write policy version %d."
+                      "  Because it is less than version %d\n", p->policyvers,
+                      POLICYDB_VERSION_AVTAB);
+               return -EINVAL;
+       }
+
+       config = 0;
+       if (p->mls_enabled)
+               config |= POLICYDB_CONFIG_MLS;
+
+       if (p->reject_unknown)
+               config |= REJECT_UNKNOWN;
+       if (p->allow_unknown)
+               config |= ALLOW_UNKNOWN;
+
+       /* Write the magic number and string identifiers. */
+       buf[0] = cpu_to_le32(POLICYDB_MAGIC);
+       len = strlen(POLICYDB_STRING);
+       buf[1] = cpu_to_le32(len);
+       rc = put_entry(buf, sizeof(u32), 2, fp);
+       if (rc)
+               return rc;
+       rc = put_entry(POLICYDB_STRING, 1, len, fp);
+       if (rc)
+               return rc;
+
+       /* Write the version, config, and table sizes. */
+       info = policydb_lookup_compat(p->policyvers);
+       if (!info) {
+               printk(KERN_ERR "SELinux: compatibility lookup failed for policy "
+                   "version %d", p->policyvers);
+               return rc;
+       }
+
+       buf[0] = cpu_to_le32(p->policyvers);
+       buf[1] = cpu_to_le32(config);
+       buf[2] = cpu_to_le32(info->sym_num);
+       buf[3] = cpu_to_le32(info->ocon_num);
+
+       rc = put_entry(buf, sizeof(u32), 4, fp);
+       if (rc)
+               return rc;
+
+       if (p->policyvers >= POLICYDB_VERSION_POLCAP) {
+               rc = ebitmap_write(&p->policycaps, fp);
+               if (rc)
+                       return rc;
+       }
+
+       if (p->policyvers >= POLICYDB_VERSION_PERMISSIVE) {
+               rc = ebitmap_write(&p->permissive_map, fp);
+               if (rc)
+                       return rc;
+       }
+
+       num_syms = info->sym_num;
+       for (i = 0; i < num_syms; i++) {
+               struct policy_data pd;
+
+               pd.fp = fp;
+               pd.p = p;
+
+               buf[0] = cpu_to_le32(p->symtab[i].nprim);
+               buf[1] = cpu_to_le32(p->symtab[i].table->nel);
+
+               rc = put_entry(buf, sizeof(u32), 2, fp);
+               if (rc)
+                       return rc;
+               rc = hashtab_map(p->symtab[i].table, write_f[i], &pd);
+               if (rc)
+                       return rc;
+       }
+
+       rc = avtab_write(p, &p->te_avtab, fp);
+       if (rc)
+               return rc;
+
+       rc = cond_write_list(p, p->cond_list, fp);
+       if (rc)
+               return rc;
+
+       rc = role_trans_write(p->role_tr, fp);
+       if (rc)
+               return rc;
+
+       rc = role_allow_write(p->role_allow, fp);
+       if (rc)
+               return rc;
+
+       rc = ocontext_write(p, info, fp);
+       if (rc)
+               return rc;
+
+       rc = genfs_write(p, fp);
+       if (rc)
+               return rc;
+
+       rc = range_write(p, fp);
+       if (rc)
+               return rc;
+
+       for (i = 0; i < p->p_types.nprim; i++) {
+               struct ebitmap *e = flex_array_get(p->type_attr_map_array, i);
+
+               BUG_ON(!e);
+               rc = ebitmap_write(e, fp);
+               if (rc)
+                       return rc;
+       }
+
+       return 0;
+}
diff --git a/security/selinux/ss/policydb.h b/security/selinux/ss/policydb.h

index 310e94442cb8b3535a8774b952de45ba0180794e..95d3d7de361e628adcc53b974533cafb384ef851 100644 (file)
--- a/security/selinux/ss/policydb.h
+++ b/security/selinux/ss/policydb.h
@@ -254,6 +254,9 @@ struct policydb {
  
         struct ebitmap permissive_map;
  
+       /* length of this policy when it was loaded */
+       size_t len;
+
         unsigned int policyvers;
  
         unsigned int reject_unknown : 1;
@@ -270,6 +273,7 @@ extern int policydb_class_isvalid(struct policydb *p, unsigned int class);
  extern int policydb_type_isvalid(struct policydb *p, unsigned int type);
  extern int policydb_role_isvalid(struct policydb *p, unsigned int role);
  extern int policydb_read(struct policydb *p, void *fp);
+extern int policydb_write(struct policydb *p, void *fp);
  
  #define PERM_SYMTAB_SIZE 32
  
@@ -290,6 +294,11 @@ struct policy_file {
         size_t len;
  };
  
+struct policy_data {
+       struct policydb *p;
+       void *fp;
+};
+
  static inline int next_entry(void *buf, struct policy_file *fp, size_t bytes)
  {
         if (bytes > fp->len)
@@ -301,6 +310,17 @@ static inline int next_entry(void *buf, struct policy_file *fp, size_t bytes)
         return 0;
  }
  
+static inline int put_entry(void *buf, size_t bytes, int num, struct policy_file *fp)
+{
+       size_t len = bytes * num;
+
+       memcpy(fp->data, buf, len);
+       fp->data += len;
+       fp->len -= len;
+
+       return 0;
+}
+
  extern u16 string_to_security_class(struct policydb *p, const char *name);
  extern u32 string_to_av_perm(struct policydb *p, u16 tclass, const char *name);
  
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c

index 9ea2feca3cd4f7b572361543fdf1b265002cecfb..223c1ff6ef2324488eca915d2ba87bc4f6e27fa9 100644 (file)
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -51,6 +51,7 @@
  #include <linux/mutex.h>
  #include <linux/selinux.h>
  #include <linux/flex_array.h>
+#include <linux/vmalloc.h>
  #include <net/netlabel.h>
  
  #include "flask.h"
@@ -991,7 +992,8 @@ static int context_struct_to_string(struct context *context, char **scontext, u3
  {
         char *scontextp;
  
-       *scontext = NULL;
+       if (scontext)
+               *scontext = NULL;
         *scontext_len = 0;
  
         if (context->len) {
@@ -1008,6 +1010,9 @@ static int context_struct_to_string(struct context *context, char **scontext, u3
         *scontext_len += strlen(policydb.p_type_val_to_name[context->type - 1]) + 1;
         *scontext_len += mls_compute_context_len(context);
  
+       if (!scontext)
+               return 0;
+
         /* Allocate space for the context; caller must free this space. */
         scontextp = kmalloc(*scontext_len, GFP_ATOMIC);
         if (!scontextp)
@@ -1047,7 +1052,8 @@ static int security_sid_to_context_core(u32 sid, char **scontext,
         struct context *context;
         int rc = 0;
  
-       *scontext = NULL;
+       if (scontext)
+               *scontext = NULL;
         *scontext_len  = 0;
  
         if (!ss_initialized) {
@@ -1055,6 +1061,8 @@ static int security_sid_to_context_core(u32 sid, char **scontext,
                         char *scontextp;
  
                         *scontext_len = strlen(initial_sid_to_string[sid]) + 1;
+                       if (!scontext)
+                               goto out;
                         scontextp = kmalloc(*scontext_len, GFP_ATOMIC);
                         if (!scontextp) {
                                 rc = -ENOMEM;
@@ -1769,6 +1777,7 @@ int security_load_policy(void *data, size_t len)
                         return rc;
                 }
  
+               policydb.len = len;
                 rc = selinux_set_mapping(&policydb, secclass_map,
                                          &current_mapping,
                                          &current_mapping_size);
@@ -1791,6 +1800,7 @@ int security_load_policy(void *data, size_t len)
                 selinux_complete_init();
                 avc_ss_reset(seqno);
                 selnl_notify_policyload(seqno);
+               selinux_status_update_policyload(seqno);
                 selinux_netlbl_cache_invalidate();
                 selinux_xfrm_notify_policyload();
                 return 0;
@@ -1804,6 +1814,7 @@ int security_load_policy(void *data, size_t len)
         if (rc)
                 return rc;
  
+       newpolicydb.len = len;
         /* If switching between different policy types, log MLS status */
         if (policydb.mls_enabled && !newpolicydb.mls_enabled)
                 printk(KERN_INFO "SELinux: Disabling MLS support...\n");
@@ -1870,6 +1881,7 @@ int security_load_policy(void *data, size_t len)
  
         avc_ss_reset(seqno);
         selnl_notify_policyload(seqno);
+       selinux_status_update_policyload(seqno);
         selinux_netlbl_cache_invalidate();
         selinux_xfrm_notify_policyload();
  
@@ -1883,6 +1895,17 @@ err:
  
  }
  
+size_t security_policydb_len(void)
+{
+       size_t len;
+
+       read_lock(&policy_rwlock);
+       len = policydb.len;
+       read_unlock(&policy_rwlock);
+
+       return len;
+}
+
  /**
   * security_port_sid - Obtain the SID for a port.
   * @protocol: protocol number
@@ -2374,6 +2397,7 @@ out:
         if (!rc) {
                 avc_ss_reset(seqno);
                 selnl_notify_policyload(seqno);
+               selinux_status_update_policyload(seqno);
                 selinux_xfrm_notify_policyload();
         }
         return rc;
@@ -3129,3 +3153,38 @@ netlbl_sid_to_secattr_failure:
         return rc;
  }
  #endif /* CONFIG_NETLABEL */
+
+/**
+ * security_read_policy - read the policy.
+ * @data: binary policy data
+ * @len: length of data in bytes
+ *
+ */
+int security_read_policy(void **data, ssize_t *len)
+{
+       int rc;
+       struct policy_file fp;
+
+       if (!ss_initialized)
+               return -EINVAL;
+
+       *len = security_policydb_len();
+
+       *data = vmalloc_user(*len);
+       if (!*data)
+               return -ENOMEM;
+
+       fp.data = *data;
+       fp.len = *len;
+
+       read_lock(&policy_rwlock);
+       rc = policydb_write(&policydb, &fp);
+       read_unlock(&policy_rwlock);
+
+       if (rc)
+               return rc;
+
+       *len = (unsigned long)fp.data - (unsigned long)*data;
+       return 0;
+
+}
diff --git a/security/selinux/ss/status.c b/security/selinux/ss/status.c

new file mode 100644 (file)

index 0000000..d982365
--- /dev/null
+++ b/security/selinux/ss/status.c
@@ -0,0 +1,126 @@
+/*
+ * mmap based event notifications for SELinux
+ *
+ * Author: KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * Copyright (C) 2010 NEC corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2,
+ * as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include "avc.h"
+#include "services.h"
+
+/*
+ * The selinux_status_page shall be exposed to userspace applications
+ * using mmap interface on /selinux/status.
+ * It enables to notify applications a few events that will cause reset
+ * of userspace access vector without context switching.
+ *
+ * The selinux_kernel_status structure on the head of status page is
+ * protected from concurrent accesses using seqlock logic, so userspace
+ * application should reference the status page according to the seqlock
+ * logic.
+ *
+ * Typically, application checks status->sequence at the head of access
+ * control routine. If it is odd-number, kernel is updating the status,
+ * so please wait for a moment. If it is changed from the last sequence
+ * number, it means something happen, so application will reset userspace
+ * avc, if needed.
+ * In most cases, application shall confirm the kernel status is not
+ * changed without any system call invocations.
+ */
+static struct page *selinux_status_page;
+static DEFINE_MUTEX(selinux_status_lock);
+
+/*
+ * selinux_kernel_status_page
+ *
+ * It returns a reference to selinux_status_page. If the status page is
+ * not allocated yet, it also tries to allocate it at the first time.
+ */
+struct page *selinux_kernel_status_page(void)
+{
+       struct selinux_kernel_status   *status;
+       struct page                    *result = NULL;
+
+       mutex_lock(&selinux_status_lock);
+       if (!selinux_status_page) {
+               selinux_status_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
+
+               if (selinux_status_page) {
+                       status = page_address(selinux_status_page);
+
+                       status->version = SELINUX_KERNEL_STATUS_VERSION;
+                       status->sequence = 0;
+                       status->enforcing = selinux_enforcing;
+                       /*
+                        * NOTE: the next policyload event shall set
+                        * a positive value on the status->policyload,
+                        * although it may not be 1, but never zero.
+                        * So, application can know it was updated.
+                        */
+                       status->policyload = 0;
+                       status->deny_unknown = !security_get_allow_unknown();
+               }
+       }
+       result = selinux_status_page;
+       mutex_unlock(&selinux_status_lock);
+
+       return result;
+}
+
+/*
+ * selinux_status_update_setenforce
+ *
+ * It updates status of the current enforcing/permissive mode.
+ */
+void selinux_status_update_setenforce(int enforcing)
+{
+       struct selinux_kernel_status   *status;
+
+       mutex_lock(&selinux_status_lock);
+       if (selinux_status_page) {
+               status = page_address(selinux_status_page);
+
+               status->sequence++;
+               smp_wmb();
+
+               status->enforcing = enforcing;
+
+               smp_wmb();
+               status->sequence++;
+       }
+       mutex_unlock(&selinux_status_lock);
+}
+
+/*
+ * selinux_status_update_policyload
+ *
+ * It updates status of the times of policy reloaded, and current
+ * setting of deny_unknown.
+ */
+void selinux_status_update_policyload(int seqno)
+{
+       struct selinux_kernel_status   *status;
+
+       mutex_lock(&selinux_status_lock);
+       if (selinux_status_page) {
+               status = page_address(selinux_status_page);
+
+               status->sequence++;
+               smp_wmb();
+
+               status->policyload = seqno;
+               status->deny_unknown = !security_get_allow_unknown();
+
+               smp_wmb();
+               status->sequence++;
+       }
+       mutex_unlock(&selinux_status_lock);
+}
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c

index c448d57ae2b7721f72f17c5cf42e88f3f1bcba5e..bc39f4067af668874312af4187ad6f21dbdbb113 100644 (file)
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -1281,12 +1281,11 @@ static int smack_task_getioprio(struct task_struct *p)
   *
   * Return 0 if read access is permitted
   */
-static int smack_task_setscheduler(struct task_struct *p, int policy,
-                                  struct sched_param *lp)
+static int smack_task_setscheduler(struct task_struct *p)
  {
         int rc;
  
-       rc = cap_task_setscheduler(p, policy, lp);
+       rc = cap_task_setscheduler(p);
         if (rc == 0)
                 rc = smk_curacc_on_task(p, MAY_WRITE);
         return rc;
@@ -3005,7 +3004,8 @@ static int smack_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
  {
         char *sp = smack_from_secid(secid);
  
-       *secdata = sp;
+       if (secdata)
+               *secdata = sp;
         *seclen = strlen(sp);
         return 0;
  }
diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c

index c668b447c72594494417f6be50795f3b49f860bf..7556315c197823e0baedcf15d0f0930c74345429 100644 (file)
--- a/security/tomoyo/common.c
+++ b/security/tomoyo/common.c
@@ -768,8 +768,10 @@ static bool tomoyo_select_one(struct tomoyo_io_buffer *head, const char *data)
                 return true; /* Do nothing if open(O_WRONLY). */
         memset(&head->r, 0, sizeof(head->r));
         head->r.print_this_domain_only = true;
-       head->r.eof = !domain;
-       head->r.domain = &domain->list;
+       if (domain)
+               head->r.domain = &domain->list;
+       else
+               head->r.eof = 1;
         tomoyo_io_printf(head, "# select %s\n", data);
         if (domain && domain->is_deleted)
                 tomoyo_io_printf(head, "# This is a deleted domain.\n");
@@ -2051,13 +2053,22 @@ void tomoyo_check_profile(void)
                 const u8 profile = domain->profile;
                 if (tomoyo_profile_ptr[profile])
                         continue;
+               printk(KERN_ERR "You need to define profile %u before using it.\n",
+                      profile);
+               printk(KERN_ERR "Please see http://tomoyo.sourceforge.jp/2.3/ "
+                      "for more information.\n");
                 panic("Profile %u (used by '%s') not defined.\n",
                       profile, domain->domainname->name);
         }
         tomoyo_read_unlock(idx);
-       if (tomoyo_profile_version != 20090903)
+       if (tomoyo_profile_version != 20090903) {
+               printk(KERN_ERR "You need to install userland programs for "
+                      "TOMOYO 2.3 and initialize policy configuration.\n");
+               printk(KERN_ERR "Please see http://tomoyo.sourceforge.jp/2.3/ "
+                      "for more information.\n");
                 panic("Profile version %u is not supported.\n",
                       tomoyo_profile_version);
+       }
         printk(KERN_INFO "TOMOYO: 2.3.0\n");
         printk(KERN_INFO "Mandatory Access Control activated.\n");
  }
diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c

index a7868ad4d530fd40b3a9bd608bd7860a7d35ce44..cbbed0db9e560315ae0559c7e5e97786387a5371 100644 (file)
--- a/sound/core/rawmidi.c
+++ b/sound/core/rawmidi.c
@@ -535,13 +535,15 @@ static int snd_rawmidi_release(struct inode *inode, struct file *file)
  {
         struct snd_rawmidi_file *rfile;
         struct snd_rawmidi *rmidi;
+       struct module *module;
  
         rfile = file->private_data;
         rmidi = rfile->rmidi;
         rawmidi_release_priv(rfile);
         kfree(rfile);
+       module = rmidi->card->module;
         snd_card_file_remove(rmidi->card, file);
-       module_put(rmidi->card->module);
+       module_put(module);
         return 0;
  }
  
diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt

index 5164a655c39f60b578c8642f2caabe8af66fa016..b2c63309a65165b471822e99268c828bbdb07777 100644 (file)
--- a/tools/perf/Documentation/perf-annotate.txt
+++ b/tools/perf/Documentation/perf-annotate.txt
@@ -8,7 +8,7 @@ perf-annotate - Read perf.data (created by perf record) and display annotated co
  SYNOPSIS
  --------
  [verse]
-'perf annotate' [-i <file> | --input=file] symbol_name
+'perf annotate' [-i <file> | --input=file] [symbol_name]
  
  DESCRIPTION
  -----------
@@ -24,6 +24,13 @@ OPTIONS
  --input=::
          Input file name. (default: perf.data)
  
+--stdio:: Use the stdio interface.
+
+--tui:: Use the TUI interface Use of --tui requires a tty, if one is not
+       present, as when piping to other commands, the stdio interface is
+       used. This interfaces starts by centering on the line with more
+       samples, TAB/UNTAB cycles thru the lines with more samples.
+
  SEE ALSO
  --------
-linkperf:perf-record[1]
+linkperf:perf-record[1], linkperf:perf-report[1]
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt

index abfabe9147a4f2a48b6fd47bfdb758de2a3f3eea..12052c9ed0babfc3a1c93cc01758ec3b7747ee10 100644 (file)
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -65,6 +65,13 @@ OPTIONS
                  the tree is considered as a new profiled object. +
         Default: fractal,0.5.
  
+--stdio:: Use the stdio interface.
+
+--tui:: Use the TUI interface, that is integrated with annotate and allows
+        zooming into DSOs or threads, among other features. Use of --tui
+       requires a tty, if one is not present, as when piping to other
+       commands, the stdio interface is used.
+
  SEE ALSO
  --------
  linkperf:perf-stat[1]
diff --git a/tools/perf/Makefile b/tools/perf/Makefile

index 1950e19af1cf6b23be56f0250bcf8cbb52b3d9f3..d1db0f676a4bf14850fa0264e78fe3d482d376dc 100644 (file)
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -313,6 +313,9 @@ TEST_PROGRAMS =
  
  SCRIPT_SH += perf-archive.sh
  
+grep-libs = $(filter -l%,$(1))
+strip-libs = $(filter-out -l%,$(1))
+
  #
  # No Perl scripts right now:
  #
@@ -588,14 +591,17 @@ endif
  ifdef NO_LIBPERL
         BASIC_CFLAGS += -DNO_LIBPERL
  else
-       PERL_EMBED_LDOPTS = `perl -MExtUtils::Embed -e ldopts 2>/dev/null`
+       PERL_EMBED_LDOPTS = $(shell perl -MExtUtils::Embed -e ldopts 2>/dev/null)
+       PERL_EMBED_LDFLAGS = $(call strip-libs,$(PERL_EMBED_LDOPTS))
+       PERL_EMBED_LIBADD = $(call grep-libs,$(PERL_EMBED_LDOPTS))
         PERL_EMBED_CCOPTS = `perl -MExtUtils::Embed -e ccopts 2>/dev/null`
         FLAGS_PERL_EMBED=$(PERL_EMBED_CCOPTS) $(PERL_EMBED_LDOPTS)
  
         ifneq ($(call try-cc,$(SOURCE_PERL_EMBED),$(FLAGS_PERL_EMBED)),y)
                 BASIC_CFLAGS += -DNO_LIBPERL
         else
-               ALL_LDFLAGS += $(PERL_EMBED_LDOPTS)
+               ALL_LDFLAGS += $(PERL_EMBED_LDFLAGS)
+               EXTLIBS += $(PERL_EMBED_LIBADD)
                 LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-perl.o
                 LIB_OBJS += $(OUTPUT)scripts/perl/Perf-Trace-Util/Context.o
         endif
@@ -604,13 +610,16 @@ endif
  ifdef NO_LIBPYTHON
         BASIC_CFLAGS += -DNO_LIBPYTHON
  else
-       PYTHON_EMBED_LDOPTS = `python-config --ldflags 2>/dev/null`
+       PYTHON_EMBED_LDOPTS = $(shell python-config --ldflags 2>/dev/null)
+       PYTHON_EMBED_LDFLAGS = $(call strip-libs,$(PYTHON_EMBED_LDOPTS))
+       PYTHON_EMBED_LIBADD = $(call grep-libs,$(PYTHON_EMBED_LDOPTS))
         PYTHON_EMBED_CCOPTS = `python-config --cflags 2>/dev/null`
         FLAGS_PYTHON_EMBED=$(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS)
         ifneq ($(call try-cc,$(SOURCE_PYTHON_EMBED),$(FLAGS_PYTHON_EMBED)),y)
                 BASIC_CFLAGS += -DNO_LIBPYTHON
         else
-               ALL_LDFLAGS += $(PYTHON_EMBED_LDOPTS)
+               ALL_LDFLAGS += $(PYTHON_EMBED_LDFLAGS)
+               EXTLIBS += $(PYTHON_EMBED_LIBADD)
                 LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-python.o
                 LIB_OBJS += $(OUTPUT)scripts/python/Perf-Trace-Util/Context.o
         endif
@@ -653,6 +662,15 @@ else
         endif
  endif
  
+
+ifdef NO_STRLCPY
+       BASIC_CFLAGS += -DNO_STRLCPY
+else
+       ifneq ($(call try-cc,$(SOURCE_STRLCPY),),y)
+               BASIC_CFLAGS += -DNO_STRLCPY
+       endif
+endif
+
  ifndef CC_LD_DYNPATH
         ifdef NO_R_TO_GCC_LINKER
                 # Some gcc does not accept and pass -R to the linker to specify
@@ -910,8 +928,8 @@ $(OUTPUT)perf.o: perf.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS
                 $(ALL_CFLAGS) -c $(filter %.c,$^) -o $@
  
  $(OUTPUT)perf$X: $(OUTPUT)perf.o $(BUILTIN_OBJS) $(PERFLIBS)
-       $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(OUTPUT)perf.o \
-               $(BUILTIN_OBJS) $(ALL_LDFLAGS) $(LIBS)
+       $(QUIET_LINK)$(CC) $(ALL_CFLAGS) $(ALL_LDFLAGS) $(OUTPUT)perf.o \
+               $(BUILTIN_OBJS) $(LIBS) -o $@
  
  $(OUTPUT)builtin-help.o: builtin-help.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS
         $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) \
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c

index 1478dc64bf157fc0f226cba51eef80ec8646c15f..6d5604d8df9599acb55d87017f5d58e19d906395 100644 (file)
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -28,7 +28,7 @@
  
  static char            const *input_name = "perf.data";
  
-static bool            force;
+static bool            force, use_tui, use_stdio;
  
  static bool            full_paths;
  
@@ -321,7 +321,7 @@ static int hist_entry__tty_annotate(struct hist_entry *he)
  
  static void hists__find_annotations(struct hists *self)
  {
-       struct rb_node *first = rb_first(&self->entries), *nd = first;
+       struct rb_node *nd = rb_first(&self->entries), *next;
         int key = KEY_RIGHT;
  
         while (nd) {
@@ -343,20 +343,19 @@ find_next:
  
                 if (use_browser > 0) {
                         key = hist_entry__tui_annotate(he);
-                       if (is_exit_key(key))
-                               break;
                         switch (key) {
                         case KEY_RIGHT:
-                       case '\t':
-                               nd = rb_next(nd);
+                               next = rb_next(nd);
                                 break;
                         case KEY_LEFT:
-                               if (nd == first)
-                                       continue;
-                               nd = rb_prev(nd);
-                       default:
+                               next = rb_prev(nd);
                                 break;
+                       default:
+                               return;
                         }
+
+                       if (next != NULL)
+                               nd = next;
                 } else {
                         hist_entry__tty_annotate(he);
                         nd = rb_next(nd);
@@ -428,6 +427,8 @@ static const struct option options[] = {
                     "be more verbose (show symbol address, etc)"),
         OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
                     "dump raw trace in ASCII"),
+       OPT_BOOLEAN(0, "tui", &use_tui, "Use the TUI interface"),
+       OPT_BOOLEAN(0, "stdio", &use_stdio, "Use the stdio interface"),
         OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
                    "file", "vmlinux pathname"),
         OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules,
@@ -443,6 +444,11 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __used)
  {
         argc = parse_options(argc, argv, options, annotate_usage, 0);
  
+       if (use_stdio)
+               use_browser = 0;
+       else if (use_tui)
+               use_browser = 1;
+
         setup_browser();
  
         symbol_conf.priv_size = sizeof(struct sym_priv);
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c

index 55fc1f46892a6a920411db7dc91226bcbe6a7f82..5de405d452300318541338293563d8ebc41ccb87 100644 (file)
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -32,7 +32,7 @@
  
  static char            const *input_name = "perf.data";
  
-static bool            force;
+static bool            force, use_tui, use_stdio;
  static bool            hide_unresolved;
  static bool            dont_use_callchains;
  
@@ -107,7 +107,8 @@ static int perf_session__add_hist_entry(struct perf_session *self,
                 goto out_free_syms;
         err = 0;
         if (symbol_conf.use_callchain) {
-               err = append_chain(he->callchain, data->callchain, syms, data->period);
+               err = callchain_append(he->callchain, data->callchain, syms,
+                                      data->period);
                 if (err)
                         goto out_free_syms;
         }
@@ -450,6 +451,8 @@ static const struct option options[] = {
                     "Show per-thread event counters"),
         OPT_STRING(0, "pretty", &pretty_printing_style, "key",
                    "pretty printing style key: normal raw"),
+       OPT_BOOLEAN(0, "tui", &use_tui, "Use the TUI interface"),
+       OPT_BOOLEAN(0, "stdio", &use_stdio, "Use the stdio interface"),
         OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
                    "sort by key(s): pid, comm, dso, symbol, parent"),
         OPT_BOOLEAN(0, "showcpuutilization", &symbol_conf.show_cpu_utilization,
@@ -482,8 +485,15 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
  {
         argc = parse_options(argc, argv, options, report_usage, 0);
  
+       if (use_stdio)
+               use_browser = 0;
+       else if (use_tui)
+               use_browser = 1;
+
         if (strcmp(input_name, "-") != 0)
                 setup_browser();
+       else
+               use_browser = 0;
         /*
          * Only in the newt browser we are doing integrated annotation,
          * so don't allocate extra space that won't be used in the stdio
diff --git a/tools/perf/feature-tests.mak b/tools/perf/feature-tests.mak

index 7a7b6085905382c791834b0c0f35ed5f39658e26..b253db634f04b7e8ddfddd1cc33bb3ce8343a49a 100644 (file)
--- a/tools/perf/feature-tests.mak
+++ b/tools/perf/feature-tests.mak
@@ -110,6 +110,17 @@ int main(void)
  }
  endef
  
+define SOURCE_STRLCPY
+#include <stdlib.h>
+extern size_t strlcpy(char *dest, const char *src, size_t size);
+
+int main(void)
+{
+       strlcpy(NULL, NULL, 0);
+       return 0;
+}
+endef
+
  # try-cc
  # Usage: option = $(call try-cc, source-to-build, cc-options)
  try-cc = $(shell sh -c                                           \
diff --git a/tools/perf/scripts/python/bin/netdev-times-record b/tools/perf/scripts/python/bin/netdev-times-record

new file mode 100644 (file)

index 0000000..d931a82
--- /dev/null
+++ b/tools/perf/scripts/python/bin/netdev-times-record
@@ -0,0 +1,8 @@
+#!/bin/bash
+perf record -a -e net:net_dev_xmit -e net:net_dev_queue                \
+               -e net:netif_receive_skb -e net:netif_rx                \
+               -e skb:consume_skb -e skb:kfree_skb                     \
+               -e skb:skb_copy_datagram_iovec -e napi:napi_poll        \
+               -e irq:irq_handler_entry -e irq:irq_handler_exit        \
+               -e irq:softirq_entry -e irq:softirq_exit                \
+               -e irq:softirq_raise $@
diff --git a/tools/perf/scripts/python/bin/netdev-times-report b/tools/perf/scripts/python/bin/netdev-times-report

new file mode 100644 (file)

index 0000000..c3d0a63
--- /dev/null
+++ b/tools/perf/scripts/python/bin/netdev-times-report
@@ -0,0 +1,5 @@
+#!/bin/bash
+# description: display a process of packet and processing time
+# args: [tx] [rx] [dev=] [debug]
+
+perf trace -s ~/libexec/perf-core/scripts/python/netdev-times.py $@
diff --git a/tools/perf/scripts/python/netdev-times.py b/tools/perf/scripts/python/netdev-times.py

new file mode 100644 (file)

index 0000000..9aa0a32
--- /dev/null
+++ b/tools/perf/scripts/python/netdev-times.py
@@ -0,0 +1,464 @@
+# Display a process of packets and processed time.
+# It helps us to investigate networking or network device.
+#
+# options
+# tx: show only tx chart
+# rx: show only rx chart
+# dev=: show only thing related to specified device
+# debug: work with debug mode. It shows buffer status.
+
+import os
+import sys
+
+sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+       '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+
+from perf_trace_context import *
+from Core import *
+from Util import *
+
+all_event_list = []; # insert all tracepoint event related with this script
+irq_dic = {}; # key is cpu and value is a list which stacks irqs
+              # which raise NET_RX softirq
+net_rx_dic = {}; # key is cpu and value include time of NET_RX softirq-entry
+                # and a list which stacks receive
+receive_hunk_list = []; # a list which include a sequence of receive events
+rx_skb_list = []; # received packet list for matching
+                      # skb_copy_datagram_iovec
+
+buffer_budget = 65536; # the budget of rx_skb_list, tx_queue_list and
+                      # tx_xmit_list
+of_count_rx_skb_list = 0; # overflow count
+
+tx_queue_list = []; # list of packets which pass through dev_queue_xmit
+of_count_tx_queue_list = 0; # overflow count
+
+tx_xmit_list = [];  # list of packets which pass through dev_hard_start_xmit
+of_count_tx_xmit_list = 0; # overflow count
+
+tx_free_list = [];  # list of packets which is freed
+
+# options
+show_tx = 0;
+show_rx = 0;
+dev = 0; # store a name of device specified by option "dev="
+debug = 0;
+
+# indices of event_info tuple
+EINFO_IDX_NAME=   0
+EINFO_IDX_CONTEXT=1
+EINFO_IDX_CPU=    2
+EINFO_IDX_TIME=   3
+EINFO_IDX_PID=    4
+EINFO_IDX_COMM=   5
+
+# Calculate a time interval(msec) from src(nsec) to dst(nsec)
+def diff_msec(src, dst):
+       return (dst - src) / 1000000.0
+
+# Display a process of transmitting a packet
+def print_transmit(hunk):
+       if dev != 0 and hunk['dev'].find(dev) < 0:
+               return
+       print "%7s %5d %6d.%06dsec %12.3fmsec      %12.3fmsec" % \
+               (hunk['dev'], hunk['len'],
+               nsecs_secs(hunk['queue_t']),
+               nsecs_nsecs(hunk['queue_t'])/1000,
+               diff_msec(hunk['queue_t'], hunk['xmit_t']),
+               diff_msec(hunk['xmit_t'], hunk['free_t']))
+
+# Format for displaying rx packet processing
+PF_IRQ_ENTRY= "  irq_entry(+%.3fmsec irq=%d:%s)"
+PF_SOFT_ENTRY="  softirq_entry(+%.3fmsec)"
+PF_NAPI_POLL= "  napi_poll_exit(+%.3fmsec %s)"
+PF_JOINT=     "         |"
+PF_WJOINT=    "         |            |"
+PF_NET_RECV=  "         |---netif_receive_skb(+%.3fmsec skb=%x len=%d)"
+PF_NET_RX=    "         |---netif_rx(+%.3fmsec skb=%x)"
+PF_CPY_DGRAM= "         |      skb_copy_datagram_iovec(+%.3fmsec %d:%s)"
+PF_KFREE_SKB= "         |      kfree_skb(+%.3fmsec location=%x)"
+PF_CONS_SKB=  "         |      consume_skb(+%.3fmsec)"
+
+# Display a process of received packets and interrputs associated with
+# a NET_RX softirq
+def print_receive(hunk):
+       show_hunk = 0
+       irq_list = hunk['irq_list']
+       cpu = irq_list[0]['cpu']
+       base_t = irq_list[0]['irq_ent_t']
+       # check if this hunk should be showed
+       if dev != 0:
+               for i in range(len(irq_list)):
+                       if irq_list[i]['name'].find(dev) >= 0:
+                               show_hunk = 1
+                               break
+       else:
+               show_hunk = 1
+       if show_hunk == 0:
+               return
+
+       print "%d.%06dsec cpu=%d" % \
+               (nsecs_secs(base_t), nsecs_nsecs(base_t)/1000, cpu)
+       for i in range(len(irq_list)):
+               print PF_IRQ_ENTRY % \
+                       (diff_msec(base_t, irq_list[i]['irq_ent_t']),
+                       irq_list[i]['irq'], irq_list[i]['name'])
+               print PF_JOINT
+               irq_event_list = irq_list[i]['event_list']
+               for j in range(len(irq_event_list)):
+                       irq_event = irq_event_list[j]
+                       if irq_event['event'] == 'netif_rx':
+                               print PF_NET_RX % \
+                                       (diff_msec(base_t, irq_event['time']),
+                                       irq_event['skbaddr'])
+                               print PF_JOINT
+       print PF_SOFT_ENTRY % \
+               diff_msec(base_t, hunk['sirq_ent_t'])
+       print PF_JOINT
+       event_list = hunk['event_list']
+       for i in range(len(event_list)):
+               event = event_list[i]
+               if event['event_name'] == 'napi_poll':
+                       print PF_NAPI_POLL % \
+                           (diff_msec(base_t, event['event_t']), event['dev'])
+                       if i == len(event_list) - 1:
+                               print ""
+                       else:
+                               print PF_JOINT
+               else:
+                       print PF_NET_RECV % \
+                           (diff_msec(base_t, event['event_t']), event['skbaddr'],
+                               event['len'])
+                       if 'comm' in event.keys():
+                               print PF_WJOINT
+                               print PF_CPY_DGRAM % \
+                                       (diff_msec(base_t, event['comm_t']),
+                                       event['pid'], event['comm'])
+                       elif 'handle' in event.keys():
+                               print PF_WJOINT
+                               if event['handle'] == "kfree_skb":
+                                       print PF_KFREE_SKB % \
+                                               (diff_msec(base_t,
+                                               event['comm_t']),
+                                               event['location'])
+                               elif event['handle'] == "consume_skb":
+                                       print PF_CONS_SKB % \
+                                               diff_msec(base_t,
+                                                       event['comm_t'])
+                       print PF_JOINT
+
+def trace_begin():
+       global show_tx
+       global show_rx
+       global dev
+       global debug
+
+       for i in range(len(sys.argv)):
+               if i == 0:
+                       continue
+               arg = sys.argv[i]
+               if arg == 'tx':
+                       show_tx = 1
+               elif arg =='rx':
+                       show_rx = 1
+               elif arg.find('dev=',0, 4) >= 0:
+                       dev = arg[4:]
+               elif arg == 'debug':
+                       debug = 1
+       if show_tx == 0  and show_rx == 0:
+               show_tx = 1
+               show_rx = 1
+
+def trace_end():
+       # order all events in time
+       all_event_list.sort(lambda a,b :cmp(a[EINFO_IDX_TIME],
+                                           b[EINFO_IDX_TIME]))
+       # process all events
+       for i in range(len(all_event_list)):
+               event_info = all_event_list[i]
+               name = event_info[EINFO_IDX_NAME]
+               if name == 'irq__softirq_exit':
+                       handle_irq_softirq_exit(event_info)
+               elif name == 'irq__softirq_entry':
+                       handle_irq_softirq_entry(event_info)
+               elif name == 'irq__softirq_raise':
+                       handle_irq_softirq_raise(event_info)
+               elif name == 'irq__irq_handler_entry':
+                       handle_irq_handler_entry(event_info)
+               elif name == 'irq__irq_handler_exit':
+                       handle_irq_handler_exit(event_info)
+               elif name == 'napi__napi_poll':
+                       handle_napi_poll(event_info)
+               elif name == 'net__netif_receive_skb':
+                       handle_netif_receive_skb(event_info)
+               elif name == 'net__netif_rx':
+                       handle_netif_rx(event_info)
+               elif name == 'skb__skb_copy_datagram_iovec':
+                       handle_skb_copy_datagram_iovec(event_info)
+               elif name == 'net__net_dev_queue':
+                       handle_net_dev_queue(event_info)
+               elif name == 'net__net_dev_xmit':
+                       handle_net_dev_xmit(event_info)
+               elif name == 'skb__kfree_skb':
+                       handle_kfree_skb(event_info)
+               elif name == 'skb__consume_skb':
+                       handle_consume_skb(event_info)
+       # display receive hunks
+       if show_rx:
+               for i in range(len(receive_hunk_list)):
+                       print_receive(receive_hunk_list[i])
+       # display transmit hunks
+       if show_tx:
+               print "   dev    len      Qdisc        " \
+                       "       netdevice             free"
+               for i in range(len(tx_free_list)):
+                       print_transmit(tx_free_list[i])
+       if debug:
+               print "debug buffer status"
+               print "----------------------------"
+               print "xmit Qdisc:remain:%d overflow:%d" % \
+                       (len(tx_queue_list), of_count_tx_queue_list)
+               print "xmit netdevice:remain:%d overflow:%d" % \
+                       (len(tx_xmit_list), of_count_tx_xmit_list)
+               print "receive:remain:%d overflow:%d" % \
+                       (len(rx_skb_list), of_count_rx_skb_list)
+
+# called from perf, when it finds a correspoinding event
+def irq__softirq_entry(name, context, cpu, sec, nsec, pid, comm, vec):
+       if symbol_str("irq__softirq_entry", "vec", vec) != "NET_RX":
+               return
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, vec)
+       all_event_list.append(event_info)
+
+def irq__softirq_exit(name, context, cpu, sec, nsec, pid, comm, vec):
+       if symbol_str("irq__softirq_entry", "vec", vec) != "NET_RX":
+               return
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, vec)
+       all_event_list.append(event_info)
+
+def irq__softirq_raise(name, context, cpu, sec, nsec, pid, comm, vec):
+       if symbol_str("irq__softirq_entry", "vec", vec) != "NET_RX":
+               return
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, vec)
+       all_event_list.append(event_info)
+
+def irq__irq_handler_entry(name, context, cpu, sec, nsec, pid, comm,
+                       irq, irq_name):
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+                       irq, irq_name)
+       all_event_list.append(event_info)
+
+def irq__irq_handler_exit(name, context, cpu, sec, nsec, pid, comm, irq, ret):
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, irq, ret)
+       all_event_list.append(event_info)
+
+def napi__napi_poll(name, context, cpu, sec, nsec, pid, comm, napi, dev_name):
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+                       napi, dev_name)
+       all_event_list.append(event_info)
+
+def net__netif_receive_skb(name, context, cpu, sec, nsec, pid, comm, skbaddr,
+                       skblen, dev_name):
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+                       skbaddr, skblen, dev_name)
+       all_event_list.append(event_info)
+
+def net__netif_rx(name, context, cpu, sec, nsec, pid, comm, skbaddr,
+                       skblen, dev_name):
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+                       skbaddr, skblen, dev_name)
+       all_event_list.append(event_info)
+
+def net__net_dev_queue(name, context, cpu, sec, nsec, pid, comm,
+                       skbaddr, skblen, dev_name):
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+                       skbaddr, skblen, dev_name)
+       all_event_list.append(event_info)
+
+def net__net_dev_xmit(name, context, cpu, sec, nsec, pid, comm,
+                       skbaddr, skblen, rc, dev_name):
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+                       skbaddr, skblen, rc ,dev_name)
+       all_event_list.append(event_info)
+
+def skb__kfree_skb(name, context, cpu, sec, nsec, pid, comm,
+                       skbaddr, protocol, location):
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+                       skbaddr, protocol, location)
+       all_event_list.append(event_info)
+
+def skb__consume_skb(name, context, cpu, sec, nsec, pid, comm, skbaddr):
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+                       skbaddr)
+       all_event_list.append(event_info)
+
+def skb__skb_copy_datagram_iovec(name, context, cpu, sec, nsec, pid, comm,
+       skbaddr, skblen):
+       event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+                       skbaddr, skblen)
+       all_event_list.append(event_info)
+
+def handle_irq_handler_entry(event_info):
+       (name, context, cpu, time, pid, comm, irq, irq_name) = event_info
+       if cpu not in irq_dic.keys():
+               irq_dic[cpu] = []
+       irq_record = {'irq':irq, 'name':irq_name, 'cpu':cpu, 'irq_ent_t':time}
+       irq_dic[cpu].append(irq_record)
+
+def handle_irq_handler_exit(event_info):
+       (name, context, cpu, time, pid, comm, irq, ret) = event_info
+       if cpu not in irq_dic.keys():
+               return
+       irq_record = irq_dic[cpu].pop()
+       if irq != irq_record['irq']:
+               return
+       irq_record.update({'irq_ext_t':time})
+       # if an irq doesn't include NET_RX softirq, drop.
+       if 'event_list' in irq_record.keys():
+               irq_dic[cpu].append(irq_record)
+
+def handle_irq_softirq_raise(event_info):
+       (name, context, cpu, time, pid, comm, vec) = event_info
+       if cpu not in irq_dic.keys() \
+       or len(irq_dic[cpu]) == 0:
+               return
+       irq_record = irq_dic[cpu].pop()
+       if 'event_list' in irq_record.keys():
+               irq_event_list = irq_record['event_list']
+       else:
+               irq_event_list = []
+       irq_event_list.append({'time':time, 'event':'sirq_raise'})
+       irq_record.update({'event_list':irq_event_list})
+       irq_dic[cpu].append(irq_record)
+
+def handle_irq_softirq_entry(event_info):
+       (name, context, cpu, time, pid, comm, vec) = event_info
+       net_rx_dic[cpu] = {'sirq_ent_t':time, 'event_list':[]}
+
+def handle_irq_softirq_exit(event_info):
+       (name, context, cpu, time, pid, comm, vec) = event_info
+       irq_list = []
+       event_list = 0
+       if cpu in irq_dic.keys():
+               irq_list = irq_dic[cpu]
+               del irq_dic[cpu]
+       if cpu in net_rx_dic.keys():
+               sirq_ent_t = net_rx_dic[cpu]['sirq_ent_t']
+               event_list = net_rx_dic[cpu]['event_list']
+               del net_rx_dic[cpu]
+       if irq_list == [] or event_list == 0:
+               return
+       rec_data = {'sirq_ent_t':sirq_ent_t, 'sirq_ext_t':time,
+                   'irq_list':irq_list, 'event_list':event_list}
+       # merge information realted to a NET_RX softirq
+       receive_hunk_list.append(rec_data)
+
+def handle_napi_poll(event_info):
+       (name, context, cpu, time, pid, comm, napi, dev_name) = event_info
+       if cpu in net_rx_dic.keys():
+               event_list = net_rx_dic[cpu]['event_list']
+               rec_data = {'event_name':'napi_poll',
+                               'dev':dev_name, 'event_t':time}
+               event_list.append(rec_data)
+
+def handle_netif_rx(event_info):
+       (name, context, cpu, time, pid, comm,
+               skbaddr, skblen, dev_name) = event_info
+       if cpu not in irq_dic.keys() \
+       or len(irq_dic[cpu]) == 0:
+               return
+       irq_record = irq_dic[cpu].pop()
+       if 'event_list' in irq_record.keys():
+               irq_event_list = irq_record['event_list']
+       else:
+               irq_event_list = []
+       irq_event_list.append({'time':time, 'event':'netif_rx',
+               'skbaddr':skbaddr, 'skblen':skblen, 'dev_name':dev_name})
+       irq_record.update({'event_list':irq_event_list})
+       irq_dic[cpu].append(irq_record)
+
+def handle_netif_receive_skb(event_info):
+       global of_count_rx_skb_list
+
+       (name, context, cpu, time, pid, comm,
+               skbaddr, skblen, dev_name) = event_info
+       if cpu in net_rx_dic.keys():
+               rec_data = {'event_name':'netif_receive_skb',
+                           'event_t':time, 'skbaddr':skbaddr, 'len':skblen}
+               event_list = net_rx_dic[cpu]['event_list']
+               event_list.append(rec_data)
+               rx_skb_list.insert(0, rec_data)
+               if len(rx_skb_list) > buffer_budget:
+                       rx_skb_list.pop()
+                       of_count_rx_skb_list += 1
+
+def handle_net_dev_queue(event_info):
+       global of_count_tx_queue_list
+
+       (name, context, cpu, time, pid, comm,
+               skbaddr, skblen, dev_name) = event_info
+       skb = {'dev':dev_name, 'skbaddr':skbaddr, 'len':skblen, 'queue_t':time}
+       tx_queue_list.insert(0, skb)
+       if len(tx_queue_list) > buffer_budget:
+               tx_queue_list.pop()
+               of_count_tx_queue_list += 1
+
+def handle_net_dev_xmit(event_info):
+       global of_count_tx_xmit_list
+
+       (name, context, cpu, time, pid, comm,
+               skbaddr, skblen, rc, dev_name) = event_info
+       if rc == 0: # NETDEV_TX_OK
+               for i in range(len(tx_queue_list)):
+                       skb = tx_queue_list[i]
+                       if skb['skbaddr'] == skbaddr:
+                               skb['xmit_t'] = time
+                               tx_xmit_list.insert(0, skb)
+                               del tx_queue_list[i]
+                               if len(tx_xmit_list) > buffer_budget:
+                                       tx_xmit_list.pop()
+                                       of_count_tx_xmit_list += 1
+                               return
+
+def handle_kfree_skb(event_info):
+       (name, context, cpu, time, pid, comm,
+               skbaddr, protocol, location) = event_info
+       for i in range(len(tx_queue_list)):
+               skb = tx_queue_list[i]
+               if skb['skbaddr'] == skbaddr:
+                       del tx_queue_list[i]
+                       return
+       for i in range(len(tx_xmit_list)):
+               skb = tx_xmit_list[i]
+               if skb['skbaddr'] == skbaddr:
+                       skb['free_t'] = time
+                       tx_free_list.append(skb)
+                       del tx_xmit_list[i]
+                       return
+       for i in range(len(rx_skb_list)):
+               rec_data = rx_skb_list[i]
+               if rec_data['skbaddr'] == skbaddr:
+                       rec_data.update({'handle':"kfree_skb",
+                                       'comm':comm, 'pid':pid, 'comm_t':time})
+                       del rx_skb_list[i]
+                       return
+
+def handle_consume_skb(event_info):
+       (name, context, cpu, time, pid, comm, skbaddr) = event_info
+       for i in range(len(tx_xmit_list)):
+               skb = tx_xmit_list[i]
+               if skb['skbaddr'] == skbaddr:
+                       skb['free_t'] = time
+                       tx_free_list.append(skb)
+                       del tx_xmit_list[i]
+                       return
+
+def handle_skb_copy_datagram_iovec(event_info):
+       (name, context, cpu, time, pid, comm, skbaddr, skblen) = event_info
+       for i in range(len(rx_skb_list)):
+               rec_data = rx_skb_list[i]
+               if skbaddr == rec_data['skbaddr']:
+                       rec_data.update({'handle':"skb_copy_datagram_iovec",
+                                       'comm':comm, 'pid':pid, 'comm_t':time})
+                       del rx_skb_list[i]
+                       return
diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h

index 27e9ebe4076e0efbf4117a46f2dbbcc74a1dcc3e..a7729797fd96254bc35326077337a71f919c19b5 100644 (file)
--- a/tools/perf/util/cache.h
+++ b/tools/perf/util/cache.h
@@ -82,6 +82,8 @@ extern char *perf_path(const char *fmt, ...) __attribute__((format (printf, 1, 2
  extern char *perf_pathdup(const char *fmt, ...)
         __attribute__((format (printf, 1, 2)));
  
+#ifdef NO_STRLCPY
  extern size_t strlcpy(char *dest, const char *src, size_t size);
+#endif
  
  #endif /* __PERF_CACHE_H */
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c

index f231f43424d27930a286cb52902c21cb4534a068..e12d539417b2cc4644e2d5a919cfb8a23e8ae163 100644 (file)
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -28,6 +28,9 @@ bool ip_callchain__valid(struct ip_callchain *chain, const event_t *event)
  #define chain_for_each_child(child, parent)    \
         list_for_each_entry(child, &parent->children, brothers)
  
+#define chain_for_each_child_safe(child, next, parent) \
+       list_for_each_entry_safe(child, next, &parent->children, brothers)
+
  static void
  rb_insert_callchain(struct rb_root *root, struct callchain_node *chain,
                     enum chain_mode mode)
@@ -86,10 +89,10 @@ __sort_chain_flat(struct rb_root *rb_root, struct callchain_node *node,
   * sort them by hit
   */
  static void
-sort_chain_flat(struct rb_root *rb_root, struct callchain_node *node,
+sort_chain_flat(struct rb_root *rb_root, struct callchain_root *root,
                 u64 min_hit, struct callchain_param *param __used)
  {
-       __sort_chain_flat(rb_root, node, min_hit);
+       __sort_chain_flat(rb_root, &root->node, min_hit);
  }
  
  static void __sort_chain_graph_abs(struct callchain_node *node,
@@ -108,11 +111,11 @@ static void __sort_chain_graph_abs(struct callchain_node *node,
  }
  
  static void
-sort_chain_graph_abs(struct rb_root *rb_root, struct callchain_node *chain_root,
+sort_chain_graph_abs(struct rb_root *rb_root, struct callchain_root *chain_root,
                      u64 min_hit, struct callchain_param *param __used)
  {
-       __sort_chain_graph_abs(chain_root, min_hit);
-       rb_root->rb_node = chain_root->rb_root.rb_node;
+       __sort_chain_graph_abs(&chain_root->node, min_hit);
+       rb_root->rb_node = chain_root->node.rb_root.rb_node;
  }
  
  static void __sort_chain_graph_rel(struct callchain_node *node,
@@ -133,11 +136,11 @@ static void __sort_chain_graph_rel(struct callchain_node *node,
  }
  
  static void
-sort_chain_graph_rel(struct rb_root *rb_root, struct callchain_node *chain_root,
+sort_chain_graph_rel(struct rb_root *rb_root, struct callchain_root *chain_root,
                      u64 min_hit __used, struct callchain_param *param)
  {
-       __sort_chain_graph_rel(chain_root, param->min_percent / 100.0);
-       rb_root->rb_node = chain_root->rb_root.rb_node;
+       __sort_chain_graph_rel(&chain_root->node, param->min_percent / 100.0);
+       rb_root->rb_node = chain_root->node.rb_root.rb_node;
  }
  
  int register_callchain_param(struct callchain_param *param)
@@ -284,19 +287,18 @@ split_add_child(struct callchain_node *parent, struct resolved_chain *chain,
  }
  
  static int
-__append_chain(struct callchain_node *root, struct resolved_chain *chain,
-              unsigned int start, u64 period);
+append_chain(struct callchain_node *root, struct resolved_chain *chain,
+            unsigned int start, u64 period);
  
  static void
-__append_chain_children(struct callchain_node *root,
-                       struct resolved_chain *chain,
-                       unsigned int start, u64 period)
+append_chain_children(struct callchain_node *root, struct resolved_chain *chain,
+                     unsigned int start, u64 period)
  {
         struct callchain_node *rnode;
  
         /* lookup in childrens */
         chain_for_each_child(rnode, root) {
-               unsigned int ret = __append_chain(rnode, chain, start, period);
+               unsigned int ret = append_chain(rnode, chain, start, period);
  
                 if (!ret)
                         goto inc_children_hit;
@@ -309,8 +311,8 @@ inc_children_hit:
  }
  
  static int
-__append_chain(struct callchain_node *root, struct resolved_chain *chain,
-              unsigned int start, u64 period)
+append_chain(struct callchain_node *root, struct resolved_chain *chain,
+            unsigned int start, u64 period)
  {
         struct callchain_list *cnode;
         unsigned int i = start;
@@ -357,7 +359,7 @@ __append_chain(struct callchain_node *root, struct resolved_chain *chain,
         }
  
         /* We match the node and still have a part remaining */
-       __append_chain_children(root, chain, i, period);
+       append_chain_children(root, chain, i, period);
  
         return 0;
  }
@@ -380,8 +382,8 @@ static void filter_context(struct ip_callchain *old, struct resolved_chain *new,
  }
  
  
-int append_chain(struct callchain_node *root, struct ip_callchain *chain,
-                struct map_symbol *syms, u64 period)
+int callchain_append(struct callchain_root *root, struct ip_callchain *chain,
+                    struct map_symbol *syms, u64 period)
  {
         struct resolved_chain *filtered;
  
@@ -398,9 +400,65 @@ int append_chain(struct callchain_node *root, struct ip_callchain *chain,
         if (!filtered->nr)
                 goto end;
  
-       __append_chain_children(root, filtered, 0, period);
+       append_chain_children(&root->node, filtered, 0, period);
+
+       if (filtered->nr > root->max_depth)
+               root->max_depth = filtered->nr;
  end:
         free(filtered);
  
         return 0;
  }
+
+static int
+merge_chain_branch(struct callchain_node *dst, struct callchain_node *src,
+                  struct resolved_chain *chain)
+{
+       struct callchain_node *child, *next_child;
+       struct callchain_list *list, *next_list;
+       int old_pos = chain->nr;
+       int err = 0;
+
+       list_for_each_entry_safe(list, next_list, &src->val, list) {
+               chain->ips[chain->nr].ip = list->ip;
+               chain->ips[chain->nr].ms = list->ms;
+               chain->nr++;
+               list_del(&list->list);
+               free(list);
+       }
+
+       if (src->hit)
+               append_chain_children(dst, chain, 0, src->hit);
+
+       chain_for_each_child_safe(child, next_child, src) {
+               err = merge_chain_branch(dst, child, chain);
+               if (err)
+                       break;
+
+               list_del(&child->brothers);
+               free(child);
+       }
+
+       chain->nr = old_pos;
+
+       return err;
+}
+
+int callchain_merge(struct callchain_root *dst, struct callchain_root *src)
+{
+       struct resolved_chain *chain;
+       int err;
+
+       chain = malloc(sizeof(*chain) +
+                      src->max_depth * sizeof(struct resolved_ip));
+       if (!chain)
+               return -ENOMEM;
+
+       chain->nr = 0;
+
+       err = merge_chain_branch(&dst->node, &src->node, chain);
+
+       free(chain);
+
+       return err;
+}
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h

index 6de4313924fb5c510f354cb3a8e2a73061e103f6..c15fb8c24ad2b87388e97cd6346cfdebaac11dd5 100644 (file)
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -26,9 +26,14 @@ struct callchain_node {
         u64                     children_hit;
  };
  
+struct callchain_root {
+       u64                     max_depth;
+       struct callchain_node   node;
+};
+
  struct callchain_param;
  
-typedef void (*sort_chain_func_t)(struct rb_root *, struct callchain_node *,
+typedef void (*sort_chain_func_t)(struct rb_root *, struct callchain_root *,
                                  u64, struct callchain_param *);
  
  struct callchain_param {
@@ -44,15 +49,16 @@ struct callchain_list {
         struct list_head        list;
  };
  
-static inline void callchain_init(struct callchain_node *node)
+static inline void callchain_init(struct callchain_root *root)
  {
-       INIT_LIST_HEAD(&node->brothers);
-       INIT_LIST_HEAD(&node->children);
-       INIT_LIST_HEAD(&node->val);
+       INIT_LIST_HEAD(&root->node.brothers);
+       INIT_LIST_HEAD(&root->node.children);
+       INIT_LIST_HEAD(&root->node.val);
  
-       node->children_hit = 0;
-       node->parent = NULL;
-       node->hit = 0;
+       root->node.parent = NULL;
+       root->node.hit = 0;
+       root->node.children_hit = 0;
+       root->max_depth = 0;
  }
  
  static inline u64 cumul_hits(struct callchain_node *node)
@@ -61,8 +67,9 @@ static inline u64 cumul_hits(struct callchain_node *node)
  }
  
  int register_callchain_param(struct callchain_param *param);
-int append_chain(struct callchain_node *root, struct ip_callchain *chain,
-                struct map_symbol *syms, u64 period);
+int callchain_append(struct callchain_root *root, struct ip_callchain *chain,
+                    struct map_symbol *syms, u64 period);
+int callchain_merge(struct callchain_root *dst, struct callchain_root *src);
  
  bool ip_callchain__valid(struct ip_callchain *chain, const event_t *event);
  #endif /* __PERF_CALLCHAIN_H */
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c

index be22ae6ef0558009c0a1faaa4f55bcf2c5d828ca..2022e87409942ca4b0d133c3f889e41178a663d1 100644 (file)
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -87,7 +87,7 @@ static void hist_entry__add_cpumode_period(struct hist_entry *self,
  
  static struct hist_entry *hist_entry__new(struct hist_entry *template)
  {
-       size_t callchain_size = symbol_conf.use_callchain ? sizeof(struct callchain_node) : 0;
+       size_t callchain_size = symbol_conf.use_callchain ? sizeof(struct callchain_root) : 0;
         struct hist_entry *self = malloc(sizeof(*self) + callchain_size);
  
         if (self != NULL) {
@@ -226,6 +226,8 @@ static bool collapse__insert_entry(struct rb_root *root, struct hist_entry *he)
  
                 if (!cmp) {
                         iter->period += he->period;
+                       if (symbol_conf.use_callchain)
+                               callchain_merge(iter->callchain, he->callchain);
                         hist_entry__free(he);
                         return false;
                 }
diff --git a/tools/perf/util/path.c b/tools/perf/util/path.c

index 58a470d036dd0917c16eda49fb8b1987703ca7b5..bd74977114242ff465af39a291d30aa7d463f3b2 100644 (file)
--- a/tools/perf/util/path.c
+++ b/tools/perf/util/path.c
@@ -22,6 +22,7 @@ static const char *get_perf_dir(void)
         return ".";
  }
  
+#ifdef NO_STRLCPY
  size_t strlcpy(char *dest, const char *src, size_t size)
  {
         size_t ret = strlen(src);
@@ -33,7 +34,7 @@ size_t strlcpy(char *dest, const char *src, size_t size)
         }
         return ret;
  }
-
+#endif
  
  static char *get_pathname(void)
  {
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h

index 46e531d09e8bfcbe1064a4307ce5a75ef72a6405..0b91053a7d11af888eea81a4c8de24fdd60ce6f8 100644 (file)
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -70,7 +70,7 @@ struct hist_entry {
                 struct hist_entry *pair;
                 struct rb_root    sorted_chain;
         };
-       struct callchain_node   callchain[0];
+       struct callchain_root   callchain[0];
  };
  
  enum sort_type {
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c

index b2f5ae97f33dded1cdaba3e843ab2df888515777..b39f499e575a604198bf1bb11d11d6280a091548 100644 (file)
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -388,6 +388,20 @@ size_t dso__fprintf_buildid(struct dso *self, FILE *fp)
         return fprintf(fp, "%s", sbuild_id);
  }
  
+size_t dso__fprintf_symbols_by_name(struct dso *self, enum map_type type, FILE *fp)
+{
+       size_t ret = 0;
+       struct rb_node *nd;
+       struct symbol_name_rb_node *pos;
+
+       for (nd = rb_first(&self->symbol_names[type]); nd; nd = rb_next(nd)) {
+               pos = rb_entry(nd, struct symbol_name_rb_node, rb_node);
+               fprintf(fp, "%s\n", pos->sym.name);
+       }
+
+       return ret;
+}
+
  size_t dso__fprintf(struct dso *self, enum map_type type, FILE *fp)
  {
         struct rb_node *nd;
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h

index ea95c2756f05e73d09f6868f24b174ed1221f59a..038f2201ee09579ca3f460d9f59576770ea477d2 100644 (file)
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -182,6 +182,7 @@ size_t machines__fprintf_dsos(struct rb_root *self, FILE *fp);
  size_t machines__fprintf_dsos_buildid(struct rb_root *self, FILE *fp, bool with_hits);
  
  size_t dso__fprintf_buildid(struct dso *self, FILE *fp);
+size_t dso__fprintf_symbols_by_name(struct dso *self, enum map_type type, FILE *fp);
  size_t dso__fprintf(struct dso *self, enum map_type type, FILE *fp);
  
  enum dso_origin {
diff --git a/tools/perf/util/ui/browser.c b/tools/perf/util/ui/browser.c

index 66f2d583d8c4326971dc9d37cd47df57eeb82306..6d0df809a2edab24f28af4bea093d1cda3c2614d 100644 (file)
--- a/tools/perf/util/ui/browser.c
+++ b/tools/perf/util/ui/browser.c
@@ -1,16 +1,6 @@
-#define _GNU_SOURCE
-#include <stdio.h>
-#undef _GNU_SOURCE
-/*
- * slang versions <= 2.0.6 have a "#if HAVE_LONG_LONG" that breaks
- * the build if it isn't defined. Use the equivalent one that glibc
- * has on features.h.
- */
-#include <features.h>
-#ifndef HAVE_LONG_LONG
-#define HAVE_LONG_LONG __GLIBC_HAVE_LONG_LONG
-#endif
  #include <slang.h>
+#include "libslang.h"
+#include <linux/compiler.h>
  #include <linux/list.h>
  #include <linux/rbtree.h>
  #include <stdlib.h>
@@ -19,17 +9,9 @@
  #include "helpline.h"
  #include "../color.h"
  #include "../util.h"
+#include <stdio.h>
  
-#if SLANG_VERSION < 20104
-#define sltt_set_color(obj, name, fg, bg) \
-       SLtt_set_color(obj,(char *)name, (char *)fg, (char *)bg)
-#else
-#define sltt_set_color SLtt_set_color
-#endif
-
-newtComponent newt_form__new(void);
-
-int ui_browser__percent_color(double percent, bool current)
+static int ui_browser__percent_color(double percent, bool current)
  {
         if (current)
                 return HE_COLORSET_SELECTED;
@@ -40,6 +22,23 @@ int ui_browser__percent_color(double percent, bool current)
         return HE_COLORSET_NORMAL;
  }
  
+void ui_browser__set_color(struct ui_browser *self __used, int color)
+{
+       SLsmg_set_color(color);
+}
+
+void ui_browser__set_percent_color(struct ui_browser *self,
+                                  double percent, bool current)
+{
+        int color = ui_browser__percent_color(percent, current);
+        ui_browser__set_color(self, color);
+}
+
+void ui_browser__gotorc(struct ui_browser *self, int y, int x)
+{
+       SLsmg_gotorc(self->y + y, self->x + x);
+}
+
  void ui_browser__list_head_seek(struct ui_browser *self, off_t offset, int whence)
  {
         struct list_head *head = self->entries;
@@ -111,7 +110,7 @@ unsigned int ui_browser__rb_tree_refresh(struct ui_browser *self)
         nd = self->top;
  
         while (nd != NULL) {
-               SLsmg_gotorc(self->y + row, self->x);
+               ui_browser__gotorc(self, row, 0);
                 self->write(self, nd, row);
                 if (++row == self->height)
                         break;
@@ -131,13 +130,10 @@ void ui_browser__refresh_dimensions(struct ui_browser *self)
         int cols, rows;
         newtGetScreenSize(&cols, &rows);
  
-       if (self->width > cols - 4)
-               self->width = cols - 4;
-       self->height = rows - 5;
-       if (self->height > self->nr_entries)
-               self->height = self->nr_entries;
-       self->y  = (rows - self->height) / 2;
-       self->x = (cols - self->width) / 2;
+       self->width = cols - 1;
+       self->height = rows - 2;
+       self->y = 1;
+       self->x = 0;
  }
  
  void ui_browser__reset_index(struct ui_browser *self)
@@ -146,34 +142,48 @@ void ui_browser__reset_index(struct ui_browser *self)
         self->seek(self, 0, SEEK_SET);
  }
  
+void ui_browser__add_exit_key(struct ui_browser *self, int key)
+{
+       newtFormAddHotKey(self->form, key);
+}
+
+void ui_browser__add_exit_keys(struct ui_browser *self, int keys[])
+{
+       int i = 0;
+
+       while (keys[i] && i < 64) {
+               ui_browser__add_exit_key(self, keys[i]);
+               ++i;
+       }
+}
+
  int ui_browser__show(struct ui_browser *self, const char *title,
                      const char *helpline, ...)
  {
         va_list ap;
+       int keys[] = { NEWT_KEY_UP, NEWT_KEY_DOWN, NEWT_KEY_PGUP,
+                      NEWT_KEY_PGDN, NEWT_KEY_HOME, NEWT_KEY_END, ' ',
+                      NEWT_KEY_LEFT, NEWT_KEY_ESCAPE, 'q', CTRL('c'), 0 };
  
-       if (self->form != NULL) {
+       if (self->form != NULL)
                 newtFormDestroy(self->form);
-               newtPopWindow();
-       }
+
         ui_browser__refresh_dimensions(self);
-       newtCenteredWindow(self->width, self->height, title);
-       self->form = newt_form__new();
+       self->form = newtForm(NULL, NULL, 0);
         if (self->form == NULL)
                 return -1;
  
-       self->sb = newtVerticalScrollbar(self->width, 0, self->height,
+       self->sb = newtVerticalScrollbar(self->width, 1, self->height,
                                          HE_COLORSET_NORMAL,
                                          HE_COLORSET_SELECTED);
         if (self->sb == NULL)
                 return -1;
  
-       newtFormAddHotKey(self->form, NEWT_KEY_UP);
-       newtFormAddHotKey(self->form, NEWT_KEY_DOWN);
-       newtFormAddHotKey(self->form, NEWT_KEY_PGUP);
-       newtFormAddHotKey(self->form, NEWT_KEY_PGDN);
-       newtFormAddHotKey(self->form, NEWT_KEY_HOME);
-       newtFormAddHotKey(self->form, NEWT_KEY_END);
-       newtFormAddHotKey(self->form, ' ');
+       SLsmg_gotorc(0, 0);
+       ui_browser__set_color(self, NEWT_COLORSET_ROOT);
+       slsmg_write_nstring(title, self->width);
+
+       ui_browser__add_exit_keys(self, keys);
         newtFormAddComponent(self->form, self->sb);
  
         va_start(ap, helpline);
@@ -185,7 +195,6 @@ int ui_browser__show(struct ui_browser *self, const char *title,
  void ui_browser__hide(struct ui_browser *self)
  {
         newtFormDestroy(self->form);
-       newtPopWindow();
         self->form = NULL;
         ui_helpline__pop();
  }
@@ -196,28 +205,28 @@ int ui_browser__refresh(struct ui_browser *self)
  
         newtScrollbarSet(self->sb, self->index, self->nr_entries - 1);
         row = self->refresh(self);
-       SLsmg_set_color(HE_COLORSET_NORMAL);
+       ui_browser__set_color(self, HE_COLORSET_NORMAL);
         SLsmg_fill_region(self->y + row, self->x,
                           self->height - row, self->width, ' ');
  
         return 0;
  }
  
-int ui_browser__run(struct ui_browser *self, struct newtExitStruct *es)
+int ui_browser__run(struct ui_browser *self)
  {
+       struct newtExitStruct es;
+
         if (ui_browser__refresh(self) < 0)
                 return -1;
  
         while (1) {
                 off_t offset;
  
-               newtFormRun(self->form, es);
+               newtFormRun(self->form, &es);
  
-               if (es->reason != NEWT_EXIT_HOTKEY)
+               if (es.reason != NEWT_EXIT_HOTKEY)
                         break;
-               if (is_exit_key(es->u.key))
-                       return es->u.key;
-               switch (es->u.key) {
+               switch (es.u.key) {
                 case NEWT_KEY_DOWN:
                         if (self->index == self->nr_entries - 1)
                                 break;
@@ -274,12 +283,12 @@ int ui_browser__run(struct ui_browser *self, struct newtExitStruct *es)
                         self->seek(self, -offset, SEEK_END);
                         break;
                 default:
-                       return es->u.key;
+                       return es.u.key;
                 }
                 if (ui_browser__refresh(self) < 0)
                         return -1;
         }
-       return 0;
+       return -1;
  }
  
  unsigned int ui_browser__list_head_refresh(struct ui_browser *self)
@@ -294,7 +303,7 @@ unsigned int ui_browser__list_head_refresh(struct ui_browser *self)
         pos = self->top;
  
         list_for_each_from(pos, head) {
-               SLsmg_gotorc(self->y + row, self->x);
+               ui_browser__gotorc(self, row, 0);
                 self->write(self, pos, row);
                 if (++row == self->height)
                         break;
diff --git a/tools/perf/util/ui/browser.h b/tools/perf/util/ui/browser.h

index 0b9f829214f756ec16227745835cfea55d7ee503..0dc7e4da36f52c42ef3574dc89dce7102ae8438d 100644 (file)
--- a/tools/perf/util/ui/browser.h
+++ b/tools/perf/util/ui/browser.h
@@ -25,16 +25,21 @@ struct ui_browser {
  };
  
  
-int ui_browser__percent_color(double percent, bool current);
+void ui_browser__set_color(struct ui_browser *self, int color);
+void ui_browser__set_percent_color(struct ui_browser *self,
+                                  double percent, bool current);
  bool ui_browser__is_current_entry(struct ui_browser *self, unsigned row);
  void ui_browser__refresh_dimensions(struct ui_browser *self);
  void ui_browser__reset_index(struct ui_browser *self);
  
+void ui_browser__gotorc(struct ui_browser *self, int y, int x);
+void ui_browser__add_exit_key(struct ui_browser *self, int key);
+void ui_browser__add_exit_keys(struct ui_browser *self, int keys[]);
  int ui_browser__show(struct ui_browser *self, const char *title,
                      const char *helpline, ...);
  void ui_browser__hide(struct ui_browser *self);
  int ui_browser__refresh(struct ui_browser *self);
-int ui_browser__run(struct ui_browser *self, struct newtExitStruct *es);
+int ui_browser__run(struct ui_browser *self);
  
  void ui_browser__rb_tree_seek(struct ui_browser *self, off_t offset, int whence);
  unsigned int ui_browser__rb_tree_refresh(struct ui_browser *self);
diff --git a/tools/perf/util/ui/browsers/annotate.c b/tools/perf/util/ui/browsers/annotate.c

index a90273e63f4fb6939ea64e074513e1afabb1f289..82b78f99251bb2b764165cf8066a85f1e6e4b97d 100644 (file)
--- a/tools/perf/util/ui/browsers/annotate.c
+++ b/tools/perf/util/ui/browsers/annotate.c
@@ -40,14 +40,12 @@ static void annotate_browser__write(struct ui_browser *self, void *entry, int ro
  
         if (ol->offset != -1) {
                 struct objdump_line_rb_node *olrb = objdump_line__rb(ol);
-               int color = ui_browser__percent_color(olrb->percent, current_entry);
-               SLsmg_set_color(color);
+               ui_browser__set_percent_color(self, olrb->percent, current_entry);
                 slsmg_printf(" %7.2f ", olrb->percent);
                 if (!current_entry)
-                       SLsmg_set_color(HE_COLORSET_CODE);
+                       ui_browser__set_color(self, HE_COLORSET_CODE);
         } else {
-               int color = ui_browser__percent_color(0, current_entry);
-               SLsmg_set_color(color);
+               ui_browser__set_percent_color(self, 0, current_entry);
                 slsmg_write_nstring(" ", 9);
         }
  
@@ -135,32 +133,31 @@ static void annotate_browser__set_top(struct annotate_browser *self,
         self->curr_hot = nd;
  }
  
-static int annotate_browser__run(struct annotate_browser *self,
-                                struct newtExitStruct *es)
+static int annotate_browser__run(struct annotate_browser *self)
  {
         struct rb_node *nd;
         struct hist_entry *he = self->b.priv;
+       int key;
  
         if (ui_browser__show(&self->b, he->ms.sym->name,
-                            "<- or ESC: exit, TAB/shift+TAB: cycle thru samples") < 0)
+                            "<-, -> or ESC: exit, TAB/shift+TAB: cycle thru samples") < 0)
                 return -1;
-
-       newtFormAddHotKey(self->b.form, NEWT_KEY_LEFT);
-       newtFormAddHotKey(self->b.form, NEWT_KEY_RIGHT);
+       /*
+        * To allow builtin-annotate to cycle thru multiple symbols by
+        * examining the exit key for this function.
+        */
+       ui_browser__add_exit_key(&self->b, NEWT_KEY_RIGHT);
  
         nd = self->curr_hot;
         if (nd) {
-               newtFormAddHotKey(self->b.form, NEWT_KEY_TAB);
-               newtFormAddHotKey(self->b.form, NEWT_KEY_UNTAB);
+               int tabs[] = { NEWT_KEY_TAB, NEWT_KEY_UNTAB, 0 };
+               ui_browser__add_exit_keys(&self->b, tabs);
         }
  
         while (1) {
-               ui_browser__run(&self->b, es);
-
-               if (es->reason != NEWT_EXIT_HOTKEY)
-                       break;
+               key = ui_browser__run(&self->b);
  
-               switch (es->u.key) {
+               switch (key) {
                 case NEWT_KEY_TAB:
                         nd = rb_prev(nd);
                         if (nd == NULL)
@@ -179,12 +176,11 @@ static int annotate_browser__run(struct annotate_browser *self,
         }
  out:
         ui_browser__hide(&self->b);
-       return es->u.key;
+       return key;
  }
  
  int hist_entry__tui_annotate(struct hist_entry *self)
  {
-       struct newtExitStruct es;
         struct objdump_line *pos, *n;
         struct objdump_line_rb_node *rbpos;
         LIST_HEAD(head);
@@ -232,7 +228,7 @@ int hist_entry__tui_annotate(struct hist_entry *self)
                 annotate_browser__set_top(&browser, browser.curr_hot);
  
         browser.b.width += 18; /* Percentage */
-       ret = annotate_browser__run(&browser, &es);
+       ret = annotate_browser__run(&browser);
         list_for_each_entry_safe(pos, n, &head, node) {
                 list_del(&pos->node);
                 objdump_line__free(pos);
diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c

index 6866aa4c41e09cfd0239559cc903506a351a549c..ebda8c3fde9e6468ddbd84fc2df7e324ba862854 100644 (file)
--- a/tools/perf/util/ui/browsers/hists.c
+++ b/tools/perf/util/ui/browsers/hists.c
@@ -58,6 +58,11 @@ static char callchain_list__folded(const struct callchain_list *self)
         return map_symbol__folded(&self->ms);
  }
  
+static void map_symbol__set_folding(struct map_symbol *self, bool unfold)
+{
+       self->unfolded = unfold ? self->has_children : false;
+}
+
  static int callchain_node__count_rows_rb_tree(struct callchain_node *self)
  {
         int n = 0;
@@ -129,16 +134,16 @@ static void callchain_node__init_have_children_rb_tree(struct callchain_node *se
         for (nd = rb_first(&self->rb_root); nd; nd = rb_next(nd)) {
                 struct callchain_node *child = rb_entry(nd, struct callchain_node, rb_node);
                 struct callchain_list *chain;
-               int first = true;
+               bool first = true;
  
                 list_for_each_entry(chain, &child->val, list) {
                         if (first) {
                                 first = false;
                                 chain->ms.has_children = chain->list.next != &child->val ||
-                                                        rb_first(&child->rb_root) != NULL;
+                                                        !RB_EMPTY_ROOT(&child->rb_root);
                         } else
                                 chain->ms.has_children = chain->list.next == &child->val &&
-                                                        rb_first(&child->rb_root) != NULL;
+                                                        !RB_EMPTY_ROOT(&child->rb_root);
                 }
  
                 callchain_node__init_have_children_rb_tree(child);
@@ -150,7 +155,7 @@ static void callchain_node__init_have_children(struct callchain_node *self)
         struct callchain_list *chain;
  
         list_for_each_entry(chain, &self->val, list)
-               chain->ms.has_children = rb_first(&self->rb_root) != NULL;
+               chain->ms.has_children = !RB_EMPTY_ROOT(&self->rb_root);
  
         callchain_node__init_have_children_rb_tree(self);
  }
@@ -168,6 +173,7 @@ static void callchain__init_have_children(struct rb_root *self)
  static void hist_entry__init_have_children(struct hist_entry *self)
  {
         if (!self->init_have_children) {
+               self->ms.has_children = !RB_EMPTY_ROOT(&self->sorted_chain);
                 callchain__init_have_children(&self->sorted_chain);
                 self->init_have_children = true;
         }
@@ -195,43 +201,114 @@ static bool hist_browser__toggle_fold(struct hist_browser *self)
         return false;
  }
  
-static int hist_browser__run(struct hist_browser *self, const char *title,
-                            struct newtExitStruct *es)
+static int callchain_node__set_folding_rb_tree(struct callchain_node *self, bool unfold)
+{
+       int n = 0;
+       struct rb_node *nd;
+
+       for (nd = rb_first(&self->rb_root); nd; nd = rb_next(nd)) {
+               struct callchain_node *child = rb_entry(nd, struct callchain_node, rb_node);
+               struct callchain_list *chain;
+               bool has_children = false;
+
+               list_for_each_entry(chain, &child->val, list) {
+                       ++n;
+                       map_symbol__set_folding(&chain->ms, unfold);
+                       has_children = chain->ms.has_children;
+               }
+
+               if (has_children)
+                       n += callchain_node__set_folding_rb_tree(child, unfold);
+       }
+
+       return n;
+}
+
+static int callchain_node__set_folding(struct callchain_node *node, bool unfold)
+{
+       struct callchain_list *chain;
+       bool has_children = false;
+       int n = 0;
+
+       list_for_each_entry(chain, &node->val, list) {
+               ++n;
+               map_symbol__set_folding(&chain->ms, unfold);
+               has_children = chain->ms.has_children;
+       }
+
+       if (has_children)
+               n += callchain_node__set_folding_rb_tree(node, unfold);
+
+       return n;
+}
+
+static int callchain__set_folding(struct rb_root *chain, bool unfold)
+{
+       struct rb_node *nd;
+       int n = 0;
+
+       for (nd = rb_first(chain); nd; nd = rb_next(nd)) {
+               struct callchain_node *node = rb_entry(nd, struct callchain_node, rb_node);
+               n += callchain_node__set_folding(node, unfold);
+       }
+
+       return n;
+}
+
+static void hist_entry__set_folding(struct hist_entry *self, bool unfold)
+{
+       hist_entry__init_have_children(self);
+       map_symbol__set_folding(&self->ms, unfold);
+
+       if (self->ms.has_children) {
+               int n = callchain__set_folding(&self->sorted_chain, unfold);
+               self->nr_rows = unfold ? n : 0;
+       } else
+               self->nr_rows = 0;
+}
+
+static void hists__set_folding(struct hists *self, bool unfold)
+{
+       struct rb_node *nd;
+
+       self->nr_entries = 0;
+
+       for (nd = rb_first(&self->entries); nd; nd = rb_next(nd)) {
+               struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node);
+               hist_entry__set_folding(he, unfold);
+               self->nr_entries += 1 + he->nr_rows;
+       }
+}
+
+static void hist_browser__set_folding(struct hist_browser *self, bool unfold)
+{
+       hists__set_folding(self->hists, unfold);
+       self->b.nr_entries = self->hists->nr_entries;
+       /* Go to the start, we may be way after valid entries after a collapse */
+       ui_browser__reset_index(&self->b);
+}
+
+static int hist_browser__run(struct hist_browser *self, const char *title)
  {
-       char str[256], unit;
-       unsigned long nr_events = self->hists->stats.nr_events[PERF_RECORD_SAMPLE];
+       int key;
+       int exit_keys[] = { 'a', '?', 'h', 'C', 'd', 'D', 'E', 't',
+                           NEWT_KEY_ENTER, NEWT_KEY_RIGHT, NEWT_KEY_LEFT, 0, };
  
         self->b.entries = &self->hists->entries;
         self->b.nr_entries = self->hists->nr_entries;
  
         hist_browser__refresh_dimensions(self);
  
-       nr_events = convert_unit(nr_events, &unit);
-       snprintf(str, sizeof(str), "Events: %lu%c                            ",
-                nr_events, unit);
-       newtDrawRootText(0, 0, str);
-
         if (ui_browser__show(&self->b, title,
                              "Press '?' for help on key bindings") < 0)
                 return -1;
  
-       newtFormAddHotKey(self->b.form, 'a');
-       newtFormAddHotKey(self->b.form, '?');
-       newtFormAddHotKey(self->b.form, 'h');
-       newtFormAddHotKey(self->b.form, 'd');
-       newtFormAddHotKey(self->b.form, 'D');
-       newtFormAddHotKey(self->b.form, 't');
-
-       newtFormAddHotKey(self->b.form, NEWT_KEY_LEFT);
-       newtFormAddHotKey(self->b.form, NEWT_KEY_RIGHT);
-       newtFormAddHotKey(self->b.form, NEWT_KEY_ENTER);
+       ui_browser__add_exit_keys(&self->b, exit_keys);
  
         while (1) {
-               ui_browser__run(&self->b, es);
+               key = ui_browser__run(&self->b);
  
-               if (es->reason != NEWT_EXIT_HOTKEY)
-                       break;
-               switch (es->u.key) {
+               switch (key) {
                 case 'D': { /* Debug */
                         static int seq;
                         struct hist_entry *h = rb_entry(self->b.top,
@@ -245,18 +322,26 @@ static int hist_browser__run(struct hist_browser *self, const char *title,
                                            self->b.top_idx,
                                            h->row_offset, h->nr_rows);
                 }
-                       continue;
+                       break;
+               case 'C':
+                       /* Collapse the whole world. */
+                       hist_browser__set_folding(self, false);
+                       break;
+               case 'E':
+                       /* Expand the whole world. */
+                       hist_browser__set_folding(self, true);
+                       break;
                 case NEWT_KEY_ENTER:
                         if (hist_browser__toggle_fold(self))
                                 break;
                         /* fall thru */
                 default:
-                       return 0;
+                       goto out;
                 }
         }
-
+out:
         ui_browser__hide(&self->b);
-       return 0;
+       return key;
  }
  
  static char *callchain_list__sym_name(struct callchain_list *self,
@@ -306,15 +391,10 @@ static int hist_browser__show_callchain_node_rb_tree(struct hist_browser *self,
                         int color;
                         bool was_first = first;
  
-                       if (first) {
+                       if (first)
                                 first = false;
-                               chain->ms.has_children = chain->list.next != &child->val ||
-                                                        rb_first(&child->rb_root) != NULL;
-                       } else {
+                       else
                                 extra_offset = LEVEL_OFFSET_STEP;
-                               chain->ms.has_children = chain->list.next == &child->val &&
-                                                        rb_first(&child->rb_root) != NULL;
-                       }
  
                         folded_sign = callchain_list__folded(chain);
                         if (*row_offset != 0) {
@@ -341,8 +421,8 @@ static int hist_browser__show_callchain_node_rb_tree(struct hist_browser *self,
                                 *is_current_entry = true;
                         }
  
-                       SLsmg_set_color(color);
-                       SLsmg_gotorc(self->b.y + row, self->b.x);
+                       ui_browser__set_color(&self->b, color);
+                       ui_browser__gotorc(&self->b, row, 0);
                         slsmg_write_nstring(" ", offset + extra_offset);
                         slsmg_printf("%c ", folded_sign);
                         slsmg_write_nstring(str, width);
@@ -384,12 +464,7 @@ static int hist_browser__show_callchain_node(struct hist_browser *self,
         list_for_each_entry(chain, &node->val, list) {
                 char ipstr[BITS_PER_LONG / 4 + 1], *s;
                 int color;
-               /*
-                * FIXME: This should be moved to somewhere else,
-                * probably when the callchain is created, so as not to
-                * traverse it all over again
-                */
-               chain->ms.has_children = rb_first(&node->rb_root) != NULL;
+
                 folded_sign = callchain_list__folded(chain);
  
                 if (*row_offset != 0) {
@@ -405,8 +480,8 @@ static int hist_browser__show_callchain_node(struct hist_browser *self,
                 }
  
                 s = callchain_list__sym_name(chain, ipstr, sizeof(ipstr));
-               SLsmg_gotorc(self->b.y + row, self->b.x);
-               SLsmg_set_color(color);
+               ui_browser__gotorc(&self->b, row, 0);
+               ui_browser__set_color(&self->b, color);
                 slsmg_write_nstring(" ", offset);
                 slsmg_printf("%c ", folded_sign);
                 slsmg_write_nstring(s, width - 2);
@@ -465,7 +540,7 @@ static int hist_browser__show_entry(struct hist_browser *self,
         }
  
         if (symbol_conf.use_callchain) {
-               entry->ms.has_children = !RB_EMPTY_ROOT(&entry->sorted_chain);
+               hist_entry__init_have_children(entry);
                 folded_sign = hist_entry__folded(entry);
         }
  
@@ -484,8 +559,8 @@ static int hist_browser__show_entry(struct hist_browser *self,
                                 color = HE_COLORSET_NORMAL;
                 }
  
-               SLsmg_set_color(color);
-               SLsmg_gotorc(self->b.y + row, self->b.x);
+               ui_browser__set_color(&self->b, color);
+               ui_browser__gotorc(&self->b, row, 0);
                 if (symbol_conf.use_callchain) {
                         slsmg_printf("%c ", folded_sign);
                         width -= 2;
@@ -687,8 +762,6 @@ static struct hist_browser *hist_browser__new(struct hists *hists)
  
  static void hist_browser__delete(struct hist_browser *self)
  {
-       newtFormDestroy(self->b.form);
-       newtPopWindow();
         free(self);
  }
  
@@ -702,21 +775,26 @@ static struct thread *hist_browser__selected_thread(struct hist_browser *self)
         return self->he_selection->thread;
  }
  
-static int hist_browser__title(char *bf, size_t size, const char *ev_name,
-                              const struct dso *dso, const struct thread *thread)
+static int hists__browser_title(struct hists *self, char *bf, size_t size,
+                               const char *ev_name, const struct dso *dso,
+                               const struct thread *thread)
  {
-       int printed = 0;
+       char unit;
+       int printed;
+       unsigned long nr_events = self->stats.nr_events[PERF_RECORD_SAMPLE];
+
+       nr_events = convert_unit(nr_events, &unit);
+       printed = snprintf(bf, size, "Events: %lu%c %s", nr_events, unit, ev_name);
  
         if (thread)
                 printed += snprintf(bf + printed, size - printed,
-                                   "Thread: %s(%d)",
-                                   (thread->comm_set ?  thread->comm : ""),
+                                   ", Thread: %s(%d)",
+                                   (thread->comm_set ? thread->comm : ""),
                                     thread->pid);
         if (dso)
                 printed += snprintf(bf + printed, size - printed,
-                                   "%sDSO: %s", thread ? " " : "",
-                                   dso->short_name);
-       return printed ?: snprintf(bf, size, "Event: %s", ev_name);
+                                   ", DSO: %s", dso->short_name);
+       return printed;
  }
  
  int hists__browse(struct hists *self, const char *helpline, const char *ev_name)
@@ -725,7 +803,6 @@ int hists__browse(struct hists *self, const char *helpline, const char *ev_name)
         struct pstack *fstack;
         const struct thread *thread_filter = NULL;
         const struct dso *dso_filter = NULL;
-       struct newtExitStruct es;
         char msg[160];
         int key = -1;
  
@@ -738,9 +815,8 @@ int hists__browse(struct hists *self, const char *helpline, const char *ev_name)
  
         ui_helpline__push(helpline);
  
-       hist_browser__title(msg, sizeof(msg), ev_name,
-                           dso_filter, thread_filter);
-
+       hists__browser_title(self, msg, sizeof(msg), ev_name,
+                            dso_filter, thread_filter);
         while (1) {
                 const struct thread *thread;
                 const struct dso *dso;
@@ -749,70 +825,63 @@ int hists__browse(struct hists *self, const char *helpline, const char *ev_name)
                     annotate = -2, zoom_dso = -2, zoom_thread = -2,
                     browse_map = -2;
  
-               if (hist_browser__run(browser, msg, &es))
-                       break;
+               key = hist_browser__run(browser, msg);
  
                 thread = hist_browser__selected_thread(browser);
                 dso = browser->selection->map ? browser->selection->map->dso : NULL;
  
-               if (es.reason == NEWT_EXIT_HOTKEY) {
-                       key = es.u.key;
-
-                       switch (key) {
-                       case NEWT_KEY_F1:
-                               goto do_help;
-                       case NEWT_KEY_TAB:
-                       case NEWT_KEY_UNTAB:
-                               /*
-                                * Exit the browser, let hists__browser_tree
-                                * go to the next or previous
-                                */
-                               goto out_free_stack;
-                       default:;
-                       }
-
-                       switch (key) {
-                       case 'a':
-                               if (browser->selection->map == NULL ||
-                                   browser->selection->map->dso->annotate_warned)
-                                       continue;
-                               goto do_annotate;
-                       case 'd':
-                               goto zoom_dso;
-                       case 't':
-                               goto zoom_thread;
-                       case 'h':
-                       case '?':
-do_help:
-                               ui__help_window("->        Zoom into DSO/Threads & Annotate current symbol\n"
-                                               "<-        Zoom out\n"
-                                               "a         Annotate current symbol\n"
-                                               "h/?/F1    Show this window\n"
-                                               "d         Zoom into current DSO\n"
-                                               "t         Zoom into current Thread\n"
-                                               "q/CTRL+C  Exit browser");
+               switch (key) {
+               case NEWT_KEY_TAB:
+               case NEWT_KEY_UNTAB:
+                       /*
+                        * Exit the browser, let hists__browser_tree
+                        * go to the next or previous
+                        */
+                       goto out_free_stack;
+               case 'a':
+                       if (browser->selection->map == NULL &&
+                           browser->selection->map->dso->annotate_warned)
                                 continue;
-                       default:;
-                       }
-                       if (is_exit_key(key)) {
-                               if (key == NEWT_KEY_ESCAPE &&
-                                   !ui__dialog_yesno("Do you really want to exit?"))
-                                       continue;
-                               break;
-                       }
-
-                       if (es.u.key == NEWT_KEY_LEFT) {
-                               const void *top;
+                       goto do_annotate;
+               case 'd':
+                       goto zoom_dso;
+               case 't':
+                       goto zoom_thread;
+               case NEWT_KEY_F1:
+               case 'h':
+               case '?':
+                       ui__help_window("->        Zoom into DSO/Threads & Annotate current symbol\n"
+                                       "<-        Zoom out\n"
+                                       "a         Annotate current symbol\n"
+                                       "h/?/F1    Show this window\n"
+                                       "C         Collapse all callchains\n"
+                                       "E         Expand all callchains\n"
+                                       "d         Zoom into current DSO\n"
+                                       "t         Zoom into current Thread\n"
+                                       "q/CTRL+C  Exit browser");
+                       continue;
+               case NEWT_KEY_ENTER:
+               case NEWT_KEY_RIGHT:
+                       /* menu */
+                       break;
+               case NEWT_KEY_LEFT: {
+                       const void *top;
  
-                               if (pstack__empty(fstack))
-                                       continue;
-                               top = pstack__pop(fstack);
-                               if (top == &dso_filter)
-                                       goto zoom_out_dso;
-                               if (top == &thread_filter)
-                                       goto zoom_out_thread;
+                       if (pstack__empty(fstack))
                                 continue;
-                       }
+                       top = pstack__pop(fstack);
+                       if (top == &dso_filter)
+                               goto zoom_out_dso;
+                       if (top == &thread_filter)
+                               goto zoom_out_thread;
+                       continue;
+               }
+               case NEWT_KEY_ESCAPE:
+                       if (!ui__dialog_yesno("Do you really want to exit?"))
+                               continue;
+                       /* Fall thru */
+               default:
+                       goto out_free_stack;
                 }
  
                 if (browser->selection->sym != NULL &&
@@ -885,8 +954,8 @@ zoom_out_dso:
                                 pstack__push(fstack, &dso_filter);
                         }
                         hists__filter_by_dso(self, dso_filter);
-                       hist_browser__title(msg, sizeof(msg), ev_name,
-                                           dso_filter, thread_filter);
+                       hists__browser_title(self, msg, sizeof(msg), ev_name,
+                                            dso_filter, thread_filter);
                         hist_browser__reset(browser);
                 } else if (choice == zoom_thread) {
  zoom_thread:
@@ -903,8 +972,8 @@ zoom_out_thread:
                                 pstack__push(fstack, &thread_filter);
                         }
                         hists__filter_by_thread(self, thread_filter);
-                       hist_browser__title(msg, sizeof(msg), ev_name,
-                                           dso_filter, thread_filter);
+                       hists__browser_title(self, msg, sizeof(msg), ev_name,
+                                            dso_filter, thread_filter);
                         hist_browser__reset(browser);
                 }
         }
@@ -925,10 +994,6 @@ int hists__tui_browse_tree(struct rb_root *self, const char *help)
                 const char *ev_name = __event_name(hists->type, hists->config);
  
                 key = hists__browse(hists, help, ev_name);
-
-               if (is_exit_key(key))
-                       break;
-
                 switch (key) {
                 case NEWT_KEY_TAB:
                         next = rb_next(nd);
@@ -940,7 +1005,7 @@ int hists__tui_browse_tree(struct rb_root *self, const char *help)
                                 continue;
                         nd = rb_prev(nd);
                 default:
-                       break;
+                       return key;
                 }
         }
  
diff --git a/tools/perf/util/ui/browsers/map.c b/tools/perf/util/ui/browsers/map.c

index 142b825b42bf41d90ee013f51aaac5e79df44f5e..e35437dfa5b48aea8a0fb0237b2bf7ed7aaf90cd 100644 (file)
--- a/tools/perf/util/ui/browsers/map.c
+++ b/tools/perf/util/ui/browsers/map.c
@@ -1,6 +1,5 @@
  #include "../libslang.h"
  #include <elf.h>
-#include <newt.h>
  #include <sys/ttydefaults.h>
  #include <ctype.h>
  #include <string.h>
@@ -47,7 +46,6 @@ out_free_form:
  struct map_browser {
         struct ui_browser b;
         struct map        *map;
-       u16               namelen;
         u8                addrlen;
  };
  
@@ -56,14 +54,16 @@ static void map_browser__write(struct ui_browser *self, void *nd, int row)
         struct symbol *sym = rb_entry(nd, struct symbol, rb_node);
         struct map_browser *mb = container_of(self, struct map_browser, b);
         bool current_entry = ui_browser__is_current_entry(self, row);
-       int color = ui_browser__percent_color(0, current_entry);
+       int width;
  
-       SLsmg_set_color(color);
+       ui_browser__set_percent_color(self, 0, current_entry);
         slsmg_printf("%*llx %*llx %c ",
                      mb->addrlen, sym->start, mb->addrlen, sym->end,
                      sym->binding == STB_GLOBAL ? 'g' :
                      sym->binding == STB_LOCAL  ? 'l' : 'w');
-       slsmg_write_nstring(sym->name, mb->namelen);
+       width = self->width - ((mb->addrlen * 2) + 4);
+       if (width > 0)
+               slsmg_write_nstring(sym->name, width);
  }
  
  /* FIXME uber-kludgy, see comment on cmd_report... */
@@ -98,31 +98,29 @@ static int map_browser__search(struct map_browser *self)
         return 0;
  }
  
-static int map_browser__run(struct map_browser *self, struct newtExitStruct *es)
+static int map_browser__run(struct map_browser *self)
  {
+       int key;
+
         if (ui_browser__show(&self->b, self->map->dso->long_name,
                              "Press <- or ESC to exit, %s / to search",
                              verbose ? "" : "restart with -v to use") < 0)
                 return -1;
  
-       newtFormAddHotKey(self->b.form, NEWT_KEY_LEFT);
-       newtFormAddHotKey(self->b.form, NEWT_KEY_ENTER);
         if (verbose)
-               newtFormAddHotKey(self->b.form, '/');
+               ui_browser__add_exit_key(&self->b, '/');
  
         while (1) {
-               ui_browser__run(&self->b, es);
+               key = ui_browser__run(&self->b);
  
-               if (es->reason != NEWT_EXIT_HOTKEY)
-                       break;
-               if (verbose && es->u.key == '/')
+               if (verbose && key == '/')
                         map_browser__search(self);
                 else
                         break;
         }
  
         ui_browser__hide(&self->b);
-       return 0;
+       return key;
  }
  
  int map__browse(struct map *self)
@@ -136,7 +134,6 @@ int map__browse(struct map *self)
                 },
                 .map = self,
         };
-       struct newtExitStruct es;
         struct rb_node *nd;
         char tmp[BITS_PER_LONG / 4];
         u64 maxaddr = 0;
@@ -144,8 +141,6 @@ int map__browse(struct map *self)
         for (nd = rb_first(mb.b.entries); nd; nd = rb_next(nd)) {
                 struct symbol *pos = rb_entry(nd, struct symbol, rb_node);
  
-               if (mb.namelen < pos->namelen)
-                       mb.namelen = pos->namelen;
                 if (maxaddr < pos->end)
                         maxaddr = pos->end;
                 if (verbose) {
@@ -156,6 +151,5 @@ int map__browse(struct map *self)
         }
  
         mb.addrlen = snprintf(tmp, sizeof(tmp), "%llx", maxaddr);
-       mb.b.width += mb.addrlen * 2 + 4 + mb.namelen;
-       return map_browser__run(&mb, &es);
+       return map_browser__run(&mb);
  }
diff --git a/tools/perf/util/ui/util.c b/tools/perf/util/ui/util.c

index 04600e26ceea21d08b701570494c5e72210aaa9e..9706d9d40279859321412b270c8ac3053141f4f9 100644 (file)
--- a/tools/perf/util/ui/util.c
+++ b/tools/perf/util/ui/util.c
@@ -11,8 +11,6 @@
  #include "helpline.h"
  #include "util.h"
  
-newtComponent newt_form__new(void);
-
  static void newt_form__set_exit_keys(newtComponent self)
  {
         newtFormAddHotKey(self, NEWT_KEY_LEFT);
@@ -22,7 +20,7 @@ static void newt_form__set_exit_keys(newtComponent self)
         newtFormAddHotKey(self, CTRL('c'));
  }
  
-newtComponent newt_form__new(void)
+static newtComponent newt_form__new(void)
  {
         newtComponent self = newtForm(NULL, NULL, 0);
         if (self)
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h

index f380fed74359034a843756256d6e7a79b0ff22b5..7562707ddd1c491755dc8ea5121637918ba1b844 100644 (file)
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -266,19 +266,6 @@ bool strglobmatch(const char *str, const char *pat);
  bool strlazymatch(const char *str, const char *pat);
  unsigned long convert_unit(unsigned long value, char *unit);
  
-#ifndef ESC
-#define ESC 27
-#endif
-
-static inline bool is_exit_key(int key)
-{
-       char up;
-       if (key == CTRL('c') || key == ESC)
-               return true;
-       up = toupper(key);
-       return up == 'Q';
-}
-
  #define _STR(x) #x
  #define STR(x) _STR(x)
author	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 21 Oct 2010 19:55:43 +0000 (12:55 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 21 Oct 2010 19:55:43 +0000 (12:55 -0700)