xps: Transmit Packet Steering

[net-next-2.6.git] / kernel / sched_fair.c
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c

index 806d1b227a21060aac100994a8992266c00b59b5..f4f6a8326dd01ffc0353b369bbfddf5a4f0e2fde 100644 (file)
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -25,7 +25,7 @@
  
  /*
   * Targeted preemption latency for CPU-bound tasks:
- * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
   *
   * NOTE: this latency value is not the same as the concept of
   * 'timeslice length' - timeslices in CFS are of variable length
@@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
  
  /*
   * Minimal preemption granularity for CPU-bound tasks:
- * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
   */
-unsigned int sysctl_sched_min_granularity = 2000000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
+unsigned int sysctl_sched_min_granularity = 750000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
  
  /*
   * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
   */
-static unsigned int sched_nr_latency = 3;
+static unsigned int sched_nr_latency = 8;
  
  /*
   * After fork, child runs first. If set to 0 (default) then
@@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
  static void update_curr(struct cfs_rq *cfs_rq)
  {
         struct sched_entity *curr = cfs_rq->curr;
-       u64 now = rq_of(cfs_rq)->clock;
+       u64 now = rq_of(cfs_rq)->clock_task;
         unsigned long delta_exec;
  
         if (unlikely(!curr))
@@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
         /*
          * We are starting a new run period:
          */
-       se->exec_start = rq_of(cfs_rq)->clock;
+       se->exec_start = rq_of(cfs_rq)->clock_task;
  }
  
  /**************************************************
@@ -1313,7 +1313,7 @@ static struct sched_group *
  find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                   int this_cpu, int load_idx)
  {
-       struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+       struct sched_group *idlest = NULL, *group = sd->groups;
         unsigned long min_load = ULONG_MAX, this_load = 0;
         int imbalance = 100 + (sd->imbalance_pct-100)/2;
  
@@ -1348,7 +1348,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
  
                 if (local_group) {
                         this_load = avg_load;
-                       this = group;
                 } else if (avg_load < min_load) {
                         min_load = avg_load;
                         idlest = group;
@@ -1765,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
         set_task_cpu(p, this_cpu);
         activate_task(this_rq, p, 0);
         check_preempt_curr(this_rq, p, 0);
+
+       /* re-arm NEWIDLE balancing when moving tasks */
+       src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
+       this_rq->idle_stamp = 0;
  }
  
  /*
@@ -1799,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
          * 2) too many balance attempts have failed.
          */
  
-       tsk_cache_hot = task_hot(p, rq->clock, sd);
+       tsk_cache_hot = task_hot(p, rq->clock_task, sd);
         if (!tsk_cache_hot ||
                 sd->nr_balance_failed > sd->cache_nice_tries) {
  #ifdef CONFIG_SCHEDSTATS
@@ -2031,12 +2034,14 @@ struct sd_lb_stats {
         unsigned long this_load;
         unsigned long this_load_per_task;
         unsigned long this_nr_running;
+       unsigned long this_has_capacity;
  
         /* Statistics of the busiest group */
         unsigned long max_load;
         unsigned long busiest_load_per_task;
         unsigned long busiest_nr_running;
         unsigned long busiest_group_capacity;
+       unsigned long busiest_has_capacity;
  
         int group_imb; /* Is there imbalance in this sd */
  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2059,6 +2064,7 @@ struct sg_lb_stats {
         unsigned long sum_weighted_load; /* Weighted load of group's tasks */
         unsigned long group_capacity;
         int group_imb; /* Is there an imbalance in the group ? */
+       int group_has_capacity; /* Is there extra capacity in the group? */
  };
  
  /**
@@ -2268,10 +2274,14 @@ unsigned long scale_rt_power(int cpu)
         struct rq *rq = cpu_rq(cpu);
         u64 total, available;
  
-       sched_avg_update(rq);
-
         total = sched_avg_period() + (rq->clock - rq->age_stamp);
-       available = total - rq->rt_avg;
+
+       if (unlikely(total < rq->rt_avg)) {
+               /* Ensures that power won't end up being negative */
+               available = 0;
+       } else {
+               available = total - rq->rt_avg;
+       }
  
         if (unlikely((s64)total < SCHED_LOAD_SCALE))
                 total = SCHED_LOAD_SCALE;
@@ -2381,7 +2391,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
                         int local_group, const struct cpumask *cpus,
                         int *balance, struct sg_lb_stats *sgs)
  {
-       unsigned long load, max_cpu_load, min_cpu_load;
+       unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
         int i;
         unsigned int balance_cpu = -1, first_idle_cpu = 0;
         unsigned long avg_load_per_task = 0;
@@ -2392,6 +2402,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
         /* Tally up the load of all CPUs in the group */
         max_cpu_load = 0;
         min_cpu_load = ~0UL;
+       max_nr_running = 0;
  
         for_each_cpu_and(i, sched_group_cpus(group), cpus) {
                 struct rq *rq = cpu_rq(i);
@@ -2409,8 +2420,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
                         load = target_load(i, load_idx);
                 } else {
                         load = source_load(i, load_idx);
-                       if (load > max_cpu_load)
+                       if (load > max_cpu_load) {
                                 max_cpu_load = load;
+                               max_nr_running = rq->nr_running;
+                       }
                         if (min_cpu_load > load)
                                 min_cpu_load = load;
                 }
@@ -2450,13 +2463,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
         if (sgs->sum_nr_running)
                 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
  
-       if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+       if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
                 sgs->group_imb = 1;
  
-       sgs->group_capacity =
-               DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+       sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
         if (!sgs->group_capacity)
                 sgs->group_capacity = fix_small_capacity(sd, group);
+
+       if (sgs->group_capacity > sgs->sum_nr_running)
+               sgs->group_has_capacity = 1;
  }
  
  /**
@@ -2545,9 +2560,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                 /*
                  * In case the child domain prefers tasks go to siblings
                  * first, lower the sg capacity to one so that we'll try
-                * and move all the excess tasks away.
+                * and move all the excess tasks away. We lower the capacity
+                * of a group only if the local group has the capacity to fit
+                * these excess tasks, i.e. nr_running < group_capacity. The
+                * extra check prevents the case where you always pull from the
+                * heaviest group when it is already under-utilized (possible
+                * with a large weight task outweighs the tasks on the system).
                  */
-               if (prefer_sibling)
+               if (prefer_sibling && !local_group && sds->this_has_capacity)
                         sgs.group_capacity = min(sgs.group_capacity, 1UL);
  
                 if (local_group) {
@@ -2555,12 +2575,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                         sds->this = sg;
                         sds->this_nr_running = sgs.sum_nr_running;
                         sds->this_load_per_task = sgs.sum_weighted_load;
+                       sds->this_has_capacity = sgs.group_has_capacity;
                 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
                         sds->max_load = sgs.avg_load;
                         sds->busiest = sg;
                         sds->busiest_nr_running = sgs.sum_nr_running;
                         sds->busiest_group_capacity = sgs.group_capacity;
                         sds->busiest_load_per_task = sgs.sum_weighted_load;
+                       sds->busiest_has_capacity = sgs.group_has_capacity;
                         sds->group_imb = sgs.group_imb;
                 }
  
@@ -2757,6 +2779,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
                 return fix_small_imbalance(sds, this_cpu, imbalance);
  
  }
+
  /******* find_busiest_group() helpers end here *********************/
  
  /**
@@ -2808,6 +2831,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
          * 4) This group is more busy than the avg busieness at this
          *    sched_domain.
          * 5) The imbalance is within the specified limit.
+        *
+        * Note: when doing newidle balance, if the local group has excess
+        * capacity (i.e. nr_running < group_capacity) and the busiest group
+        * does not have any capacity, we force a load balance to pull tasks
+        * to the local group. In this case, we skip past checks 3, 4 and 5.
          */
         if (!(*balance))
                 goto ret;
@@ -2819,6 +2847,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
         if (!sds.busiest || sds.busiest_nr_running == 0)
                 goto out_balanced;
  
+       /*  SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+       if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
+                       !sds.busiest_has_capacity)
+               goto force_balance;
+
         if (sds.this_load >= sds.max_load)
                 goto out_balanced;
  
@@ -2830,6 +2863,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
         if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
                 goto out_balanced;
  
+force_balance:
         /* Looks like there is an imbalance. Compute it */
         calculate_imbalance(&sds, this_cpu, imbalance);
         return sds.busiest;
@@ -3034,7 +3068,14 @@ redo:
  
         if (!ld_moved) {
                 schedstat_inc(sd, lb_failed[idle]);
-               sd->nr_balance_failed++;
+               /*
+                * Increment the failure counter only on periodic balance.
+                * We do not want newidle balance, which can be very
+                * frequent, pollute the failure counter causing
+                * excessive cache_hot migrations and active balances.
+                */
+               if (idle != CPU_NEWLY_IDLE)
+                       sd->nr_balance_failed++;
  
                 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
                                         this_cpu)) {
@@ -3156,10 +3197,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
                 interval = msecs_to_jiffies(sd->balance_interval);
                 if (time_after(next_balance, sd->last_balance + interval))
                         next_balance = sd->last_balance + interval;
-               if (pulled_task) {
-                       this_rq->idle_stamp = 0;
+               if (pulled_task)
                         break;
-               }
         }
  
         raw_spin_lock(&this_rq->lock);
@@ -3633,7 +3672,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
         if (time_before(now, nohz.next_balance))
                 return 0;
  
-       if (!rq->nr_running)
+       if (rq->idle_at_tick)
                 return 0;
  
         first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
@@ -3752,8 +3791,13 @@ static void task_fork_fair(struct task_struct *p)
  
         raw_spin_lock_irqsave(&rq->lock, flags);
  
-       if (unlikely(task_cpu(p) != this_cpu))
+       update_rq_clock(rq);
+
+       if (unlikely(task_cpu(p) != this_cpu)) {
+               rcu_read_lock();
                 __set_task_cpu(p, this_cpu);
+               rcu_read_unlock();
+       }
  
         update_curr(cfs_rq);
  
@@ -3825,13 +3869,26 @@ static void set_curr_task_fair(struct rq *rq)
  }
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-static void moved_group_fair(struct task_struct *p, int on_rq)
+static void task_move_group_fair(struct task_struct *p, int on_rq)
  {
-       struct cfs_rq *cfs_rq = task_cfs_rq(p);
-
-       update_curr(cfs_rq);
+       /*
+        * If the task was not on the rq at the time of this cgroup movement
+        * it must have been asleep, sleeping tasks keep their ->vruntime
+        * absolute on their old rq until wakeup (needed for the fair sleeper
+        * bonus in place_entity()).
+        *
+        * If it was on the rq, we've just 'preempted' it, which does convert
+        * ->vruntime to a relative base.
+        *
+        * Make sure both cases convert their relative position when migrating
+        * to another cgroup's rq. This does somewhat interfere with the
+        * fair sleeper stuff for the first placement, but who cares.
+        */
+       if (!on_rq)
+               p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+       set_task_rq(p, task_cpu(p));
         if (!on_rq)
-               place_entity(cfs_rq, &p->se, 1);
+               p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
  }
  #endif
  
@@ -3883,7 +3940,7 @@ static const struct sched_class fair_sched_class = {
         .get_rr_interval        = get_rr_interval_fair,
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-       .moved_group            = moved_group_fair,
+       .task_move_group        = task_move_group_fair,
  #endif
  };