sched: Cure nr_iowait_cpu() users

[net-next-2.6.git] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index d484081425037b5b59ce076c03f524ead13ec37b..f87abe3b0176613d03d722b4ed74fc56f3597dfa 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -306,52 +306,6 @@ static int init_task_group_load = INIT_TASK_GROUP_LOAD;
   */
  struct task_group init_task_group;
  
-/* return group to which a task belongs */
-static inline struct task_group *task_group(struct task_struct *p)
-{
-       struct task_group *tg;
-
-#ifdef CONFIG_CGROUP_SCHED
-       tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
-                               struct task_group, css);
-#else
-       tg = &init_task_group;
-#endif
-       return tg;
-}
-
-/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
-static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
-{
-       /*
-        * Strictly speaking this rcu_read_lock() is not needed since the
-        * task_group is tied to the cgroup, which in turn can never go away
-        * as long as there are tasks attached to it.
-        *
-        * However since task_group() uses task_subsys_state() which is an
-        * rcu_dereference() user, this quiets CONFIG_PROVE_RCU.
-        */
-       rcu_read_lock();
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
-       p->se.parent = task_group(p)->se[cpu];
-#endif
-
-#ifdef CONFIG_RT_GROUP_SCHED
-       p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
-       p->rt.parent = task_group(p)->rt_se[cpu];
-#endif
-       rcu_read_unlock();
-}
-
-#else
-
-static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
-static inline struct task_group *task_group(struct task_struct *p)
-{
-       return NULL;
-}
-
  #endif /* CONFIG_CGROUP_SCHED */
  
  /* CFS-related fields in a runqueue */
@@ -544,6 +498,8 @@ struct rq {
         struct root_domain *rd;
         struct sched_domain *sd;
  
+       unsigned long cpu_power;
+
         unsigned char idle_at_tick;
         /* For active balancing */
         int post_schedule;
@@ -642,6 +598,49 @@ static inline int cpu_of(struct rq *rq)
  #define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
  #define raw_rq()               (&__raw_get_cpu_var(runqueues))
  
+#ifdef CONFIG_CGROUP_SCHED
+
+/*
+ * Return the group to which this task belongs.
+ *
+ * We use task_subsys_state_check() and extend the RCU verification
+ * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
+ * holds that lock for each task it moves into the cgroup. Therefore
+ * by holding that lock, we pin the task to the current cgroup.
+ */
+static inline struct task_group *task_group(struct task_struct *p)
+{
+       struct cgroup_subsys_state *css;
+
+       css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
+                       lockdep_is_held(&task_rq(p)->lock));
+       return container_of(css, struct task_group, css);
+}
+
+/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
+       p->se.parent = task_group(p)->se[cpu];
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+       p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
+       p->rt.parent = task_group(p)->rt_se[cpu];
+#endif
+}
+
+#else /* CONFIG_CGROUP_SCHED */
+
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+static inline struct task_group *task_group(struct task_struct *p)
+{
+       return NULL;
+}
+
+#endif /* CONFIG_CGROUP_SCHED */
+
  inline void update_rq_clock(struct rq *rq)
  {
         if (!rq->skip_clock_update)
@@ -1255,6 +1254,12 @@ static void sched_avg_update(struct rq *rq)
         s64 period = sched_avg_period();
  
         while ((s64)(rq->clock - rq->age_stamp) > period) {
+               /*
+                * Inline assembly required to prevent the compiler
+                * optimising this loop into a divmod call.
+                * See __iter_div_u64_rem() for another example of this.
+                */
+               asm("" : "+rm" (rq->age_stamp));
                 rq->age_stamp += period;
                 rq->rt_avg /= 2;
         }
@@ -1499,24 +1504,9 @@ static unsigned long target_load(int cpu, int type)
         return max(rq->cpu_load[type-1], total);
  }
  
-static struct sched_group *group_of(int cpu)
-{
-       struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
-
-       if (!sd)
-               return NULL;
-
-       return sd->groups;
-}
-
  static unsigned long power_of(int cpu)
  {
-       struct sched_group *group = group_of(cpu);
-
-       if (!group)
-               return SCHED_LOAD_SCALE;
-
-       return group->cpu_power;
+       return cpu_rq(cpu)->cpu_power;
  }
  
  static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
@@ -1673,9 +1663,6 @@ static void update_shares(struct sched_domain *sd)
  
  static void update_h_load(long cpu)
  {
-       if (root_task_group_empty())
-               return;
-
         walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
  }
  
@@ -1854,8 +1841,8 @@ static void dec_nr_running(struct rq *rq)
  static void set_load_weight(struct task_struct *p)
  {
         if (task_has_rt_policy(p)) {
-               p->se.load.weight = prio_to_weight[0] * 2;
-               p->se.load.inv_weight = prio_to_wmult[0] >> 1;
+               p->se.load.weight = 0;
+               p->se.load.inv_weight = WMULT_CONST;
                 return;
         }
  
@@ -2877,9 +2864,9 @@ unsigned long nr_iowait(void)
         return sum;
  }
  
-unsigned long nr_iowait_cpu(void)
+unsigned long nr_iowait_cpu(int cpu)
  {
-       struct rq *this = this_rq();
+       struct rq *this = cpu_rq(cpu);
         return atomic_read(&this->nr_iowait);
  }
  
@@ -4478,16 +4465,6 @@ recheck:
         }
  
         if (user) {
-#ifdef CONFIG_RT_GROUP_SCHED
-               /*
-                * Do not allow realtime tasks into groups that have no runtime
-                * assigned.
-                */
-               if (rt_bandwidth_enabled() && rt_policy(policy) &&
-                               task_group(p)->rt_bandwidth.rt_runtime == 0)
-                       return -EPERM;
-#endif
-
                 retval = security_task_setscheduler(p, policy, param);
                 if (retval)
                         return retval;
@@ -4503,6 +4480,22 @@ recheck:
          * runqueue lock must be held.
          */
         rq = __task_rq_lock(p);
+
+#ifdef CONFIG_RT_GROUP_SCHED
+       if (user) {
+               /*
+                * Do not allow realtime tasks into groups that have no runtime
+                * assigned.
+                */
+               if (rt_bandwidth_enabled() && rt_policy(policy) &&
+                               task_group(p)->rt_bandwidth.rt_runtime == 0) {
+                       __task_rq_unlock(rq);
+                       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+                       return -EPERM;
+               }
+       }
+#endif
+
         /* recheck policy now with rq lock held */
         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                 policy = oldpolicy = -1;
@@ -7605,6 +7598,7 @@ void __init sched_init(void)
  #ifdef CONFIG_SMP
                 rq->sd = NULL;
                 rq->rd = NULL;
+               rq->cpu_power = SCHED_LOAD_SCALE;
                 rq->post_schedule = 0;
                 rq->active_balance = 0;
                 rq->next_balance = jiffies;