]> bbs.cooldavid.org Git - net-next-2.6.git/blobdiff - kernel/sched_fair.c
drivers/s390: Remove unnecessary casts of private_data
[net-next-2.6.git] / kernel / sched_fair.c
index a878b5332daad5d7db16625f298a4e963edac909..806d1b227a21060aac100994a8992266c00b59b5 100644 (file)
@@ -2287,13 +2287,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
        unsigned long power = SCHED_LOAD_SCALE;
        struct sched_group *sdg = sd->groups;
 
-       if (sched_feat(ARCH_POWER))
-               power *= arch_scale_freq_power(sd, cpu);
-       else
-               power *= default_scale_freq_power(sd, cpu);
-
-       power >>= SCHED_LOAD_SHIFT;
-
        if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
                if (sched_feat(ARCH_POWER))
                        power *= arch_scale_smt_power(sd, cpu);
@@ -2303,6 +2296,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
                power >>= SCHED_LOAD_SHIFT;
        }
 
+       sdg->cpu_power_orig = power;
+
+       if (sched_feat(ARCH_POWER))
+               power *= arch_scale_freq_power(sd, cpu);
+       else
+               power *= default_scale_freq_power(sd, cpu);
+
+       power >>= SCHED_LOAD_SHIFT;
+
        power *= scale_rt_power(cpu);
        power >>= SCHED_LOAD_SHIFT;
 
@@ -2335,6 +2337,31 @@ static void update_group_power(struct sched_domain *sd, int cpu)
        sdg->cpu_power = power;
 }
 
+/*
+ * Try and fix up capacity for tiny siblings, this is needed when
+ * things like SD_ASYM_PACKING need f_b_g to select another sibling
+ * which on its own isn't powerful enough.
+ *
+ * See update_sd_pick_busiest() and check_asym_packing().
+ */
+static inline int
+fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
+{
+       /*
+        * Only siblings can have significantly less than SCHED_LOAD_SCALE
+        */
+       if (sd->level != SD_LV_SIBLING)
+               return 0;
+
+       /*
+        * If ~90% of the cpu_power is still there, we're good.
+        */
+       if (group->cpu_power * 32 > group->cpu_power_orig * 29)
+               return 1;
+
+       return 0;
+}
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @sd: The sched_domain whose statistics are to be updated.
@@ -2400,14 +2427,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
         * domains. In the newly idle case, we will allow all the cpu's
         * to do the newly idle load balance.
         */
-       if (idle != CPU_NEWLY_IDLE && local_group &&
-           balance_cpu != this_cpu) {
-               *balance = 0;
-               return;
+       if (idle != CPU_NEWLY_IDLE && local_group) {
+               if (balance_cpu != this_cpu) {
+                       *balance = 0;
+                       return;
+               }
+               update_group_power(sd, this_cpu);
        }
 
-       update_group_power(sd, this_cpu);
-
        /* Adjust by relative CPU power of the group */
        sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
 
@@ -2428,6 +2455,51 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 
        sgs->group_capacity =
                DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+       if (!sgs->group_capacity)
+               sgs->group_capacity = fix_small_capacity(sd, group);
+}
+
+/**
+ * update_sd_pick_busiest - return 1 on busiest group
+ * @sd: sched_domain whose statistics are to be checked
+ * @sds: sched_domain statistics
+ * @sg: sched_group candidate to be checked for being the busiest
+ * @sgs: sched_group statistics
+ * @this_cpu: the current cpu
+ *
+ * Determine if @sg is a busier group than the previously selected
+ * busiest group.
+ */
+static bool update_sd_pick_busiest(struct sched_domain *sd,
+                                  struct sd_lb_stats *sds,
+                                  struct sched_group *sg,
+                                  struct sg_lb_stats *sgs,
+                                  int this_cpu)
+{
+       if (sgs->avg_load <= sds->max_load)
+               return false;
+
+       if (sgs->sum_nr_running > sgs->group_capacity)
+               return true;
+
+       if (sgs->group_imb)
+               return true;
+
+       /*
+        * ASYM_PACKING needs to move all the work to the lowest
+        * numbered CPUs in the group, therefore mark all groups
+        * higher than ourself as busy.
+        */
+       if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
+           this_cpu < group_first_cpu(sg)) {
+               if (!sds->busiest)
+                       return true;
+
+               if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
+                       return true;
+       }
+
+       return false;
 }
 
 /**
@@ -2435,7 +2507,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
  * @sd: sched_domain whose statistics are to be updated.
  * @this_cpu: Cpu for which load balance is currently performed.
  * @idle: Idle status of this_cpu
- * @sd_idle: Idle status of the sched_domain containing group.
+ * @sd_idle: Idle status of the sched_domain containing sg.
  * @cpus: Set of cpus considered for load balancing.
  * @balance: Should we balance.
  * @sds: variable to hold the statistics for this sched_domain.
@@ -2446,7 +2518,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                        struct sd_lb_stats *sds)
 {
        struct sched_domain *child = sd->child;
-       struct sched_group *group = sd->groups;
+       struct sched_group *sg = sd->groups;
        struct sg_lb_stats sgs;
        int load_idx, prefer_sibling = 0;
 
@@ -2459,21 +2531,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
        do {
                int local_group;
 
-               local_group = cpumask_test_cpu(this_cpu,
-                                              sched_group_cpus(group));
+               local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
                memset(&sgs, 0, sizeof(sgs));
-               update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
+               update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
                                local_group, cpus, balance, &sgs);
 
                if (local_group && !(*balance))
                        return;
 
                sds->total_load += sgs.group_load;
-               sds->total_pwr += group->cpu_power;
+               sds->total_pwr += sg->cpu_power;
 
                /*
                 * In case the child domain prefers tasks go to siblings
-                * first, lower the group capacity to one so that we'll try
+                * first, lower the sg capacity to one so that we'll try
                 * and move all the excess tasks away.
                 */
                if (prefer_sibling)
@@ -2481,23 +2552,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 
                if (local_group) {
                        sds->this_load = sgs.avg_load;
-                       sds->this = group;
+                       sds->this = sg;
                        sds->this_nr_running = sgs.sum_nr_running;
                        sds->this_load_per_task = sgs.sum_weighted_load;
-               } else if (sgs.avg_load > sds->max_load &&
-                          (sgs.sum_nr_running > sgs.group_capacity ||
-                               sgs.group_imb)) {
+               } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
                        sds->max_load = sgs.avg_load;
-                       sds->busiest = group;
+                       sds->busiest = sg;
                        sds->busiest_nr_running = sgs.sum_nr_running;
                        sds->busiest_group_capacity = sgs.group_capacity;
                        sds->busiest_load_per_task = sgs.sum_weighted_load;
                        sds->group_imb = sgs.group_imb;
                }
 
-               update_sd_power_savings_stats(group, sds, local_group, &sgs);
-               group = group->next;
-       } while (group != sd->groups);
+               update_sd_power_savings_stats(sg, sds, local_group, &sgs);
+               sg = sg->next;
+       } while (sg != sd->groups);
+}
+
+int __weak arch_sd_sibling_asym_packing(void)
+{
+       return 0*SD_ASYM_PACKING;
+}
+
+/**
+ * check_asym_packing - Check to see if the group is packed into the
+ *                     sched domain.
+ *
+ * This is primarily intended to be used at the sibling level.  Some
+ * cores like POWER7 prefer to use lower numbered SMT threads.  In the
+ * case of POWER7, it can move to lower SMT modes only when higher
+ * threads are idle.  When in lower SMT modes, the threads will
+ * perform better since they share less core resources.  Hence when we
+ * have idle threads, we want them to be the higher ones.
+ *
+ * This packing function is run on idle threads.  It checks to see if
+ * the busiest CPU in this domain (core in the P7 case) has a higher
+ * CPU number than the packing function is being run on.  Here we are
+ * assuming a lower CPU number will be equivalent to a lower SMT thread
+ * number.
+ *
+ * Returns 1 when packing is required and a task should be moved to
+ * this CPU.  The amount of the imbalance is returned in *imbalance.
+ *
+ * @sd: The sched_domain whose packing is to be checked.
+ * @sds: Statistics of the sched_domain which is to be packed
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: returns amount of imbalance due to packing.
+ */
+static int check_asym_packing(struct sched_domain *sd,
+                             struct sd_lb_stats *sds,
+                             int this_cpu, unsigned long *imbalance)
+{
+       int busiest_cpu;
+
+       if (!(sd->flags & SD_ASYM_PACKING))
+               return 0;
+
+       if (!sds->busiest)
+               return 0;
+
+       busiest_cpu = group_first_cpu(sds->busiest);
+       if (this_cpu > busiest_cpu)
+               return 0;
+
+       *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
+                                      SCHED_LOAD_SCALE);
+       return 1;
 }
 
 /**
@@ -2692,6 +2812,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
        if (!(*balance))
                goto ret;
 
+       if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
+           check_asym_packing(sd, &sds, this_cpu, imbalance))
+               return sds.busiest;
+
        if (!sds.busiest || sds.busiest_nr_running == 0)
                goto out_balanced;
 
@@ -2726,8 +2850,9 @@ ret:
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */
 static struct rq *
-find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
-                  unsigned long imbalance, const struct cpumask *cpus)
+find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
+                  enum cpu_idle_type idle, unsigned long imbalance,
+                  const struct cpumask *cpus)
 {
        struct rq *busiest = NULL, *rq;
        unsigned long max_load = 0;
@@ -2738,6 +2863,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
                unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
                unsigned long wl;
 
+               if (!capacity)
+                       capacity = fix_small_capacity(sd, group);
+
                if (!cpumask_test_cpu(i, cpus))
                        continue;
 
@@ -2777,9 +2905,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
 /* Working cpumask for load_balance and load_balance_newidle. */
 static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
 
-static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
+static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
+                              int busiest_cpu, int this_cpu)
 {
        if (idle == CPU_NEWLY_IDLE) {
+
+               /*
+                * ASYM_PACKING needs to force migrate tasks from busy but
+                * higher numbered CPUs in order to pack all tasks in the
+                * lowest numbered CPUs.
+                */
+               if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
+                       return 1;
+
                /*
                 * The only task running in a non-idle cpu can be moved to this
                 * cpu in an attempt to completely freeup the other CPU
@@ -2854,7 +2992,7 @@ redo:
                goto out_balanced;
        }
 
-       busiest = find_busiest_queue(group, idle, imbalance, cpus);
+       busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
        if (!busiest) {
                schedstat_inc(sd, lb_nobusyq[idle]);
                goto out_balanced;
@@ -2898,7 +3036,8 @@ redo:
                schedstat_inc(sd, lb_failed[idle]);
                sd->nr_balance_failed++;
 
-               if (need_active_balance(sd, sd_idle, idle)) {
+               if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
+                                       this_cpu)) {
                        raw_spin_lock_irqsave(&busiest->lock, flags);
 
                        /* don't kick the active_load_balance_cpu_stop,
@@ -3093,13 +3232,40 @@ out_unlock:
 }
 
 #ifdef CONFIG_NO_HZ
+
+static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
+
+static void trigger_sched_softirq(void *data)
+{
+       raise_softirq_irqoff(SCHED_SOFTIRQ);
+}
+
+static inline void init_sched_softirq_csd(struct call_single_data *csd)
+{
+       csd->func = trigger_sched_softirq;
+       csd->info = NULL;
+       csd->flags = 0;
+       csd->priv = 0;
+}
+
+/*
+ * idle load balancing details
+ * - One of the idle CPUs nominates itself as idle load_balancer, while
+ *   entering idle.
+ * - This idle load balancer CPU will also go into tickless mode when
+ *   it is idle, just like all other idle CPUs
+ * - When one of the busy CPUs notice that there may be an idle rebalancing
+ *   needed, they will kick the idle load balancer, which then does idle
+ *   load balancing for all the idle CPUs.
+ */
 static struct {
        atomic_t load_balancer;
-       cpumask_var_t cpu_mask;
-       cpumask_var_t ilb_grp_nohz_mask;
-} nohz ____cacheline_aligned = {
-       .load_balancer = ATOMIC_INIT(-1),
-};
+       atomic_t first_pick_cpu;
+       atomic_t second_pick_cpu;
+       cpumask_var_t idle_cpus_mask;
+       cpumask_var_t grp_idle_mask;
+       unsigned long next_balance;     /* in jiffy units */
+} nohz ____cacheline_aligned;
 
 int get_nohz_load_balancer(void)
 {
@@ -3153,17 +3319,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
  */
 static inline int is_semi_idle_group(struct sched_group *ilb_group)
 {
-       cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+       cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
                                        sched_group_cpus(ilb_group));
 
        /*
         * A sched_group is semi-idle when it has atleast one busy cpu
         * and atleast one idle cpu.
         */
-       if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+       if (cpumask_empty(nohz.grp_idle_mask))
                return 0;
 
-       if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+       if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
                return 0;
 
        return 1;
@@ -3196,7 +3362,7 @@ static int find_new_ilb(int cpu)
         * Optimize for the case when we have no idle CPUs or only one
         * idle CPU. Don't walk the sched_domain hierarchy in such cases
         */
-       if (cpumask_weight(nohz.cpu_mask) < 2)
+       if (cpumask_weight(nohz.idle_cpus_mask) < 2)
                goto out_done;
 
        for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
@@ -3204,7 +3370,7 @@ static int find_new_ilb(int cpu)
 
                do {
                        if (is_semi_idle_group(ilb_group))
-                               return cpumask_first(nohz.ilb_grp_nohz_mask);
+                               return cpumask_first(nohz.grp_idle_mask);
 
                        ilb_group = ilb_group->next;
 
@@ -3212,98 +3378,116 @@ static int find_new_ilb(int cpu)
        }
 
 out_done:
-       return cpumask_first(nohz.cpu_mask);
+       return nr_cpu_ids;
 }
 #else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
 static inline int find_new_ilb(int call_cpu)
 {
-       return cpumask_first(nohz.cpu_mask);
+       return nr_cpu_ids;
 }
 #endif
 
+/*
+ * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
+ * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
+ * CPU (if there is one).
+ */
+static void nohz_balancer_kick(int cpu)
+{
+       int ilb_cpu;
+
+       nohz.next_balance++;
+
+       ilb_cpu = get_nohz_load_balancer();
+
+       if (ilb_cpu >= nr_cpu_ids) {
+               ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
+               if (ilb_cpu >= nr_cpu_ids)
+                       return;
+       }
+
+       if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
+               struct call_single_data *cp;
+
+               cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
+               cp = &per_cpu(remote_sched_softirq_cb, cpu);
+               __smp_call_function_single(ilb_cpu, cp, 0);
+       }
+       return;
+}
+
 /*
  * This routine will try to nominate the ilb (idle load balancing)
  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
- * load balancing on behalf of all those cpus. If all the cpus in the system
- * go into this tickless mode, then there will be no ilb owner (as there is
- * no need for one) and all the cpus will sleep till the next wakeup event
- * arrives...
- *
- * For the ilb owner, tick is not stopped. And this tick will be used
- * for idle load balancing. ilb owner will still be part of
- * nohz.cpu_mask..
+ * load balancing on behalf of all those cpus.
  *
- * While stopping the tick, this cpu will become the ilb owner if there
- * is no other owner. And will be the owner till that cpu becomes busy
- * or if all cpus in the system stop their ticks at which point
- * there is no need for ilb owner.
+ * When the ilb owner becomes busy, we will not have new ilb owner until some
+ * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
+ * idle load balancing by kicking one of the idle CPUs.
  *
- * When the ilb owner becomes busy, it nominates another owner, during the
- * next busy scheduler_tick()
+ * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
+ * ilb owner CPU in future (when there is a need for idle load balancing on
+ * behalf of all idle CPUs).
  */
-int select_nohz_load_balancer(int stop_tick)
+void select_nohz_load_balancer(int stop_tick)
 {
        int cpu = smp_processor_id();
 
        if (stop_tick) {
-               cpu_rq(cpu)->in_nohz_recently = 1;
-
                if (!cpu_active(cpu)) {
                        if (atomic_read(&nohz.load_balancer) != cpu)
-                               return 0;
+                               return;
 
                        /*
                         * If we are going offline and still the leader,
                         * give up!
                         */
-                       if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+                       if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+                                          nr_cpu_ids) != cpu)
                                BUG();
 
-                       return 0;
+                       return;
                }
 
-               cpumask_set_cpu(cpu, nohz.cpu_mask);
+               cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
 
-               /* time for ilb owner also to sleep */
-               if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
-                       if (atomic_read(&nohz.load_balancer) == cpu)
-                               atomic_set(&nohz.load_balancer, -1);
-                       return 0;
-               }
+               if (atomic_read(&nohz.first_pick_cpu) == cpu)
+                       atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
+               if (atomic_read(&nohz.second_pick_cpu) == cpu)
+                       atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
 
-               if (atomic_read(&nohz.load_balancer) == -1) {
-                       /* make me the ilb owner */
-                       if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
-                               return 1;
-               } else if (atomic_read(&nohz.load_balancer) == cpu) {
+               if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
                        int new_ilb;
 
-                       if (!(sched_smt_power_savings ||
-                                               sched_mc_power_savings))
-                               return 1;
+                       /* make me the ilb owner */
+                       if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
+                                          cpu) != nr_cpu_ids)
+                               return;
+
                        /*
                         * Check to see if there is a more power-efficient
                         * ilb.
                         */
                        new_ilb = find_new_ilb(cpu);
                        if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
-                               atomic_set(&nohz.load_balancer, -1);
+                               atomic_set(&nohz.load_balancer, nr_cpu_ids);
                                resched_cpu(new_ilb);
-                               return 0;
+                               return;
                        }
-                       return 1;
+                       return;
                }
        } else {
-               if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
-                       return 0;
+               if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+                       return;
 
-               cpumask_clear_cpu(cpu, nohz.cpu_mask);
+               cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
 
                if (atomic_read(&nohz.load_balancer) == cpu)
-                       if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+                       if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+                                          nr_cpu_ids) != cpu)
                                BUG();
        }
-       return 0;
+       return;
 }
 #endif
 
@@ -3385,11 +3569,102 @@ out:
                rq->next_balance = next_balance;
 }
 
+#ifdef CONFIG_NO_HZ
 /*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
- * In CONFIG_NO_HZ case, the idle load balance owner will do the
+ * In CONFIG_NO_HZ case, the idle balance kickee will do the
  * rebalancing for all the cpus for whom scheduler ticks are stopped.
  */
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
+{
+       struct rq *this_rq = cpu_rq(this_cpu);
+       struct rq *rq;
+       int balance_cpu;
+
+       if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
+               return;
+
+       for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+               if (balance_cpu == this_cpu)
+                       continue;
+
+               /*
+                * If this cpu gets work to do, stop the load balancing
+                * work being done for other cpus. Next load
+                * balancing owner will pick it up.
+                */
+               if (need_resched()) {
+                       this_rq->nohz_balance_kick = 0;
+                       break;
+               }
+
+               raw_spin_lock_irq(&this_rq->lock);
+               update_rq_clock(this_rq);
+               update_cpu_load(this_rq);
+               raw_spin_unlock_irq(&this_rq->lock);
+
+               rebalance_domains(balance_cpu, CPU_IDLE);
+
+               rq = cpu_rq(balance_cpu);
+               if (time_after(this_rq->next_balance, rq->next_balance))
+                       this_rq->next_balance = rq->next_balance;
+       }
+       nohz.next_balance = this_rq->next_balance;
+       this_rq->nohz_balance_kick = 0;
+}
+
+/*
+ * Current heuristic for kicking the idle load balancer
+ * - first_pick_cpu is one of the busy CPUs. It will kick
+ *   idle load balancer when it has more than one process active. This
+ *   eliminates the need for idle load balancing altogether when we have
+ *   only one running process in the system (common case).
+ * - If there is more than one busy CPU, idle load balancer may have
+ *   to run for active_load_balance to happen (i.e., two busy CPUs are
+ *   SMT or core siblings and can run better if they move to different
+ *   physical CPUs). So, second_pick_cpu is the second of the busy CPUs
+ *   which will kick idle load balancer as soon as it has any load.
+ */
+static inline int nohz_kick_needed(struct rq *rq, int cpu)
+{
+       unsigned long now = jiffies;
+       int ret;
+       int first_pick_cpu, second_pick_cpu;
+
+       if (time_before(now, nohz.next_balance))
+               return 0;
+
+       if (!rq->nr_running)
+               return 0;
+
+       first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
+       second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
+
+       if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
+           second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
+               return 0;
+
+       ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
+       if (ret == nr_cpu_ids || ret == cpu) {
+               atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
+               if (rq->nr_running > 1)
+                       return 1;
+       } else {
+               ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
+               if (ret == nr_cpu_ids || ret == cpu) {
+                       if (rq->nr_running)
+                               return 1;
+               }
+       }
+       return 0;
+}
+#else
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
+#endif
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * Also triggered for nohz idle balancing (with nohz_balance_kick set).
+ */
 static void run_rebalance_domains(struct softirq_action *h)
 {
        int this_cpu = smp_processor_id();
@@ -3399,37 +3674,12 @@ static void run_rebalance_domains(struct softirq_action *h)
 
        rebalance_domains(this_cpu, idle);
 
-#ifdef CONFIG_NO_HZ
        /*
-        * If this cpu is the owner for idle load balancing, then do the
+        * If this cpu has a pending nohz_balance_kick, then do the
         * balancing on behalf of the other idle cpus whose ticks are
         * stopped.
         */
-       if (this_rq->idle_at_tick &&
-           atomic_read(&nohz.load_balancer) == this_cpu) {
-               struct rq *rq;
-               int balance_cpu;
-
-               for_each_cpu(balance_cpu, nohz.cpu_mask) {
-                       if (balance_cpu == this_cpu)
-                               continue;
-
-                       /*
-                        * If this cpu gets work to do, stop the load balancing
-                        * work being done for other cpus. Next load
-                        * balancing owner will pick it up.
-                        */
-                       if (need_resched())
-                               break;
-
-                       rebalance_domains(balance_cpu, CPU_IDLE);
-
-                       rq = cpu_rq(balance_cpu);
-                       if (time_after(this_rq->next_balance, rq->next_balance))
-                               this_rq->next_balance = rq->next_balance;
-               }
-       }
-#endif
+       nohz_idle_balance(this_cpu, idle);
 }
 
 static inline int on_null_domain(int cpu)
@@ -3439,57 +3689,17 @@ static inline int on_null_domain(int cpu)
 
 /*
  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
- *
- * In case of CONFIG_NO_HZ, this is the place where we nominate a new
- * idle load balancing owner or decide to stop the periodic load balancing,
- * if the whole system is idle.
  */
 static inline void trigger_load_balance(struct rq *rq, int cpu)
 {
-#ifdef CONFIG_NO_HZ
-       /*
-        * If we were in the nohz mode recently and busy at the current
-        * scheduler tick, then check if we need to nominate new idle
-        * load balancer.
-        */
-       if (rq->in_nohz_recently && !rq->idle_at_tick) {
-               rq->in_nohz_recently = 0;
-
-               if (atomic_read(&nohz.load_balancer) == cpu) {
-                       cpumask_clear_cpu(cpu, nohz.cpu_mask);
-                       atomic_set(&nohz.load_balancer, -1);
-               }
-
-               if (atomic_read(&nohz.load_balancer) == -1) {
-                       int ilb = find_new_ilb(cpu);
-
-                       if (ilb < nr_cpu_ids)
-                               resched_cpu(ilb);
-               }
-       }
-
-       /*
-        * If this cpu is idle and doing idle load balancing for all the
-        * cpus with ticks stopped, is it time for that to stop?
-        */
-       if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
-           cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
-               resched_cpu(cpu);
-               return;
-       }
-
-       /*
-        * If this cpu is idle and the idle load balancing is done by
-        * someone else, then no need raise the SCHED_SOFTIRQ
-        */
-       if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
-           cpumask_test_cpu(cpu, nohz.cpu_mask))
-               return;
-#endif
        /* Don't need to rebalance while attached to NULL domain */
        if (time_after_eq(jiffies, rq->next_balance) &&
            likely(!on_null_domain(cpu)))
                raise_softirq(SCHED_SOFTIRQ);
+#ifdef CONFIG_NO_HZ
+       else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
+               nohz_balancer_kick(cpu);
+#endif
 }
 
 static void rq_online_fair(struct rq *rq)