[net-next-2.6.git] / net / core / flow.c

/* flow.c: Generic flow cache.
 *
 * Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru)
 * Copyright (C) 2003 David S. Miller (davem@redhat.com)
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/jhash.h>
#include <linux/interrupt.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/completion.h>
#include <linux/percpu.h>
#include <linux/bitops.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/mutex.h>
#include <net/flow.h>
#include <asm/atomic.h>
#include <linux/security.h>

/* One cached flow resolution, living in a single CPU's hash table. */
struct flow_cache_entry {
	/* Only one of the two linkages is in use at a time: an entry is
	 * removed from its hash bucket (hlist_del) before it is chained
	 * onto a garbage list.
	 */
	union {
		struct hlist_node	hlist;
		struct list_head	gc_list;
	} u;
	u16				family;	/* compared, with dir and key, on lookup */
	u8				dir;
	u32				genid;	/* flow_cache_genid value at last resolve */
	struct flowi			key;	/* private copy of the looked-up flow */
	struct flow_cache_object	*object;	/* resolver result; may be NULL */
};

/* Per-CPU half of the cache: each CPU owns its own hash table. */
struct flow_cache_percpu {
	struct hlist_head		*hash_table;	/* flow_cache_hash_size() buckets; NULL until prepared */
	int				hash_count;	/* entries currently hashed on this CPU */
	u32				hash_rnd;	/* seed passed to jhash2() */
	int				hash_rnd_recalc; /* set by the timer; next lookup reseeds and empties the table */
	struct tasklet_struct		flush_tasklet;	/* runs flow_cache_flush_tasklet() on this CPU */
};

/* Shared state for one flow_cache_flush() call; lives on the caller's stack. */
struct flow_flush_info {
	struct flow_cache		*cache;		/* cache being flushed */
	atomic_t			cpuleft;	/* CPUs that have not finished their flush yet */
	struct completion		completion;	/* completed when cpuleft drops to zero */
};

/* Global (CPU-independent) description of a flow cache. */
struct flow_cache {
	u32				hash_shift;	/* log2 of the number of hash buckets */
	unsigned long			order;		/* page order used to allocate each hash table */
	struct flow_cache_percpu	*percpu;	/* alloc_percpu() area, one instance per CPU */
	struct notifier_block		hotcpu_notifier; /* empties a CPU's table when it goes dead */
	int				low_watermark;	/* shrink target, in entries per CPU */
	int				high_watermark;	/* hash_count above this triggers a shrink */
	struct timer_list		rnd_timer;	/* periodic hash reseed request */
};

/* Global generation number.  An entry whose genid differs from this
 * value is treated as stale by the lookup, shrink and flush paths.
 */
atomic_t flow_cache_genid = ATOMIC_INIT(0);
EXPORT_SYMBOL(flow_cache_genid);
/* The single cache instance used by this file, and the slab that
 * struct flow_cache_entry objects are allocated from.
 */
static struct flow_cache flow_cache_global;
static struct kmem_cache *flow_cachep;

/* Entries removed from a hash table are parked on flow_cache_gc_list
 * (protected by flow_cache_gc_lock) until the GC work item frees them.
 */
static DEFINE_SPINLOCK(flow_cache_gc_lock);
static LIST_HEAD(flow_cache_gc_list);

/* Number of buckets in each per-cpu hash table. */
#define flow_cache_hash_size(cache)	(1 << (cache)->hash_shift)
/* Interval, in jiffies (10 minutes), between hash reseed requests. */
#define FLOW_HASH_RND_PERIOD		(10 * 60 * HZ)

static void flow_cache_new_hashrnd(unsigned long arg)
{
	struct flow_cache *fc = (void *) arg;
	int i;

	for_each_possible_cpu(i)
		per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1;

	fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
	add_timer(&fc->rnd_timer);
}

static int flow_entry_valid(struct flow_cache_entry *fle)
{
	if (atomic_read(&flow_cache_genid) != fle->genid)
		return 0;
	if (fle->object && !fle->object->ops->check(fle->object))
		return 0;
	return 1;
}

static void flow_entry_kill(struct flow_cache_entry *fle)
{
	if (fle->object)
		fle->object->ops->delete(fle->object);
	kmem_cache_free(flow_cachep, fle);
}

static void flow_cache_gc_task(struct work_struct *work)
{
	struct list_head gc_list;
	struct flow_cache_entry *fce, *n;

	INIT_LIST_HEAD(&gc_list);
	spin_lock_bh(&flow_cache_gc_lock);
	list_splice_tail_init(&flow_cache_gc_list, &gc_list);
	spin_unlock_bh(&flow_cache_gc_lock);

	list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
		flow_entry_kill(fce);
}
static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task);

/* Hand @deleted unhashed entries, chained on @gc_list, over to the GC
 * work item and adjust @fcp's entry count.  Does nothing when
 * @deleted is zero.
 */
static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
				     int deleted, struct list_head *gc_list)
{
	if (!deleted)
		return;

	fcp->hash_count -= deleted;

	spin_lock_bh(&flow_cache_gc_lock);
	list_splice_tail(gc_list, &flow_cache_gc_list);
	spin_unlock_bh(&flow_cache_gc_lock);

	schedule_work(&flow_cache_gc_work);
}

static void __flow_cache_shrink(struct flow_cache *fc,
				struct flow_cache_percpu *fcp,
				int shrink_to)
{
	struct flow_cache_entry *fle;
	struct hlist_node *entry, *tmp;
	LIST_HEAD(gc_list);
	int i, deleted = 0;

	for (i = 0; i < flow_cache_hash_size(fc); i++) {
		int saved = 0;

		hlist_for_each_entry_safe(fle, entry, tmp,
					  &fcp->hash_table[i], u.hlist) {
			if (saved < shrink_to &&
			    flow_entry_valid(fle)) {
				saved++;
			} else {
				deleted++;
				hlist_del(&fle->u.hlist);
				list_add_tail(&fle->u.gc_list, &gc_list);
			}
		}
	}

	flow_cache_queue_garbage(fcp, deleted, &gc_list);
}

/* Shrink @fcp down towards the low watermark, spreading the allowance
 * evenly over the hash buckets.
 */
static void flow_cache_shrink(struct flow_cache *fc,
			      struct flow_cache_percpu *fcp)
{
	__flow_cache_shrink(fc, fcp,
			    fc->low_watermark / flow_cache_hash_size(fc));
}

/* Pick a new random hash seed for @fcp and acknowledge the reseed
 * request.  Entries hashed with the old seed can no longer be found,
 * so the whole table is emptied.
 */
static void flow_new_hash_rnd(struct flow_cache *fc,
			      struct flow_cache_percpu *fcp)
{
	get_random_bytes(&fcp->hash_rnd, sizeof(fcp->hash_rnd));
	fcp->hash_rnd_recalc = 0;

	/* shrink_to == 0: keep nothing. */
	__flow_cache_shrink(fc, fcp, 0);
}

static u32 flow_hash_code(struct flow_cache *fc,
			  struct flow_cache_percpu *fcp,
			  struct flowi *key)
{
	u32 *k = (u32 *) key;

	return (jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd)
		& (flow_cache_hash_size(fc) - 1));
}

/* Unit used by flow_key_compare(): one native long-sized word, so keys
 * are compared 8 bytes at a time on 64-bit and 4 bytes on 32-bit.
 */
#if (BITS_PER_LONG == 64)
typedef u64 flow_compare_t;
#else
typedef u32 flow_compare_t;
#endif

/* I hear what you're saying, use memcmp.  But memcmp cannot make
 * important assumptions that we can here, such as alignment and
 * constant size.
 */
static int flow_key_compare(struct flowi *key1, struct flowi *key2)
{
	flow_compare_t *k1, *k1_lim, *k2;
	const int n_elem = sizeof(struct flowi) / sizeof(flow_compare_t);

	BUILD_BUG_ON(sizeof(struct flowi) % sizeof(flow_compare_t));

	k1 = (flow_compare_t *) key1;
	k1_lim = k1 + n_elem;

	k2 = (flow_compare_t *) key2;

	do {
		if (*k1++ != *k2++)
			return 1;
	} while (k1 < k1_lim);

	return 0;
}

/* Look up (@key, @family, @dir) in this CPU's flow cache, calling
 * @resolver when there is no usable cached answer.
 *
 * Runs with bottom halves disabled for its whole duration.
 *
 * Returns the cached or freshly resolved object.  This may be NULL, or
 * an ERR_PTR value when @resolver fails (the result is tested with
 * IS_ERR() below).
 *
 * @resolver is called as resolver(net, key, family, dir, old, ctx),
 * where old is the entry's previous object or NULL; @ctx is passed
 * through untouched.
 */
struct flow_cache_object *
flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
		  flow_resolve_t resolver, void *ctx)
{
	struct flow_cache *fc = &flow_cache_global;
	struct flow_cache_percpu *fcp;
	struct flow_cache_entry *fle, *tfle;
	struct hlist_node *entry;
	struct flow_cache_object *flo;
	unsigned int hash;

	local_bh_disable();
	fcp = this_cpu_ptr(fc->percpu);

	fle = NULL;
	flo = NULL;
	/* Packet really early in init?  Making flow_cache_init a
	 * pre-smp initcall would solve this.  --RR */
	if (!fcp->hash_table)
		goto nocache;

	/* A reseed was requested: pick a new seed and empty the table
	 * before hashing anything with it.
	 */
	if (fcp->hash_rnd_recalc)
		flow_new_hash_rnd(fc, fcp);

	hash = flow_hash_code(fc, fcp, key);
	hlist_for_each_entry(tfle, entry, &fcp->hash_table[hash], u.hlist) {
		if (tfle->family == family &&
		    tfle->dir == dir &&
		    flow_key_compare(key, &tfle->key) == 0) {
			fle = tfle;
			break;
		}
	}

	if (unlikely(!fle)) {
		/* Miss: make room if needed, then insert a fresh entry.
		 * If the atomic allocation fails, fle stays NULL and the
		 * result is simply not cached.
		 */
		if (fcp->hash_count > fc->high_watermark)
			flow_cache_shrink(fc, fcp);

		fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
		if (fle) {
			fle->family = family;
			fle->dir = dir;
			memcpy(&fle->key, key, sizeof(*key));
			fle->object = NULL;
			hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
			fcp->hash_count++;
		}
	} else if (likely(fle->genid == atomic_read(&flow_cache_genid))) {
		/* Hit with a current generation: a cached NULL is returned
		 * as is; otherwise return whatever ->get() hands back.  If
		 * ->get() returns NULL, fall through and resolve again.
		 * NOTE(review): ->get() presumably takes a reference for
		 * the caller - confirm against the ops implementation.
		 */
		flo = fle->object;
		if (!flo)
			goto ret_object;
		flo = flo->ops->get(flo);
		if (flo)
			goto ret_object;
	} else if (fle->object) {
		/* Hit, but from an older generation: drop the old object. */
	        flo = fle->object;
	        flo->ops->delete(flo);
	        fle->object = NULL;
	}

nocache:
	/* Detach the entry's current object (if any) and offer it to the
	 * resolver as the previous result.
	 */
	flo = NULL;
	if (fle) {
		flo = fle->object;
		fle->object = NULL;
	}
	flo = resolver(net, key, family, dir, flo, ctx);
	if (fle) {
		fle->genid = atomic_read(&flow_cache_genid);
		if (!IS_ERR(flo))
			fle->object = flo;
		else
			/* Resolver failed: make genid differ from the value
			 * just read so the entry is not taken as current.
			 */
			fle->genid--;
	} else {
		/* Nothing was cached, yet ->delete() is still called on a
		 * successfully resolved object before it is returned.
		 * NOTE(review): presumably this drops the reference meant
		 * for the cache - confirm against the resolver contract.
		 */
		if (flo && !IS_ERR(flo))
			flo->ops->delete(flo);
	}
ret_object:
	local_bh_enable();
	return flo;
}
EXPORT_SYMBOL(flow_cache_lookup);

static void flow_cache_flush_tasklet(unsigned long data)
{
	struct flow_flush_info *info = (void *)data;
	struct flow_cache *fc = info->cache;
	struct flow_cache_percpu *fcp;
	struct flow_cache_entry *fle;
	struct hlist_node *entry, *tmp;
	LIST_HEAD(gc_list);
	int i, deleted = 0;

	fcp = this_cpu_ptr(fc->percpu);
	for (i = 0; i < flow_cache_hash_size(fc); i++) {
		hlist_for_each_entry_safe(fle, entry, tmp,
					  &fcp->hash_table[i], u.hlist) {
			if (flow_entry_valid(fle))
				continue;

			deleted++;
			hlist_del(&fle->u.hlist);
			list_add_tail(&fle->u.gc_list, &gc_list);
		}
	}

	flow_cache_queue_garbage(fcp, deleted, &gc_list);

	if (atomic_dec_and_test(&info->cpuleft))
		complete(&info->completion);
}

static void flow_cache_flush_per_cpu(void *data)
{
	struct flow_flush_info *info = data;
	int cpu;
	struct tasklet_struct *tasklet;

	cpu = smp_processor_id();
	tasklet = &per_cpu_ptr(info->cache->percpu, cpu)->flush_tasklet;
	tasklet->data = (unsigned long)info;
	tasklet_schedule(tasklet);
}

/* Remove every invalid entry from the flow cache on all online CPUs
 * and wait until each CPU has finished.
 *
 * Waits on a mutex and a completion, so it must be called from a
 * context that is allowed to sleep.
 */
void flow_cache_flush(void)
{
	struct flow_flush_info info;
	/* Serializes concurrent flushers. */
	static DEFINE_MUTEX(flow_flush_sem);

	/* Don't want cpus going down or up during this. */
	get_online_cpus();
	mutex_lock(&flow_flush_sem);
	info.cache = &flow_cache_global;
	/* One decrement is expected from every online CPU, this one included. */
	atomic_set(&info.cpuleft, num_online_cpus());
	init_completion(&info.completion);

	/* Have the other CPUs schedule their flush tasklets (the final 0
	 * is the wait argument), and run this CPU's share directly with
	 * bottom halves disabled.
	 */
	local_bh_disable();
	smp_call_function(flow_cache_flush_per_cpu, &info, 0);
	flow_cache_flush_tasklet((unsigned long)&info);
	local_bh_enable();

	/* info is on this stack frame: do not return before the last
	 * CPU has dropped cpuleft to zero and signalled completion.
	 */
	wait_for_completion(&info.completion);
	mutex_unlock(&flow_flush_sem);
	put_online_cpus();
}

/* Set up one CPU's part of the cache: allocate a zeroed hash table of
 * 2^fc->order pages (panicking if that fails), request an initial hash
 * seed, reset the entry count and initialise the flush tasklet.
 */
static void __init flow_cache_cpu_prepare(struct flow_cache *fc,
					  struct flow_cache_percpu *fcp)
{
	unsigned long table_pages;

	table_pages = __get_free_pages(GFP_KERNEL|__GFP_ZERO, fc->order);
	fcp->hash_table = (struct hlist_head *) table_pages;
	if (!fcp->hash_table)
		panic("NET: failed to allocate flow cache order %lu\n", fc->order);

	fcp->hash_count = 0;
	/* No seed yet: the first lookup on this CPU will generate one. */
	fcp->hash_rnd_recalc = 1;
	tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
}

/* CPU hotplug notifier.  When a CPU has gone dead, empty its hash
 * table so the entries get garbage collected.  All other actions are
 * ignored.  Always returns NOTIFY_OK.
 *
 * @hcpu: the CPU number, carried in a pointer-sized value.
 */
static int flow_cache_cpu(struct notifier_block *nfb,
			  unsigned long action,
			  void *hcpu)
{
	struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier);
	int cpu = (unsigned long) hcpu;
	struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);

	switch (action) {
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		__flow_cache_shrink(fc, fcp, 0);
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}

static int flow_cache_init(struct flow_cache *fc)
{
	unsigned long order;
	int i;

	fc->hash_shift = 10;
	fc->low_watermark = 2 * flow_cache_hash_size(fc);
	fc->high_watermark = 4 * flow_cache_hash_size(fc);

	for (order = 0;
	     (PAGE_SIZE << order) <
		     (sizeof(struct hlist_head)*flow_cache_hash_size(fc));
	     order++)
		/* NOTHING */;
	fc->order = order;
	fc->percpu = alloc_percpu(struct flow_cache_percpu);

	setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
		    (unsigned long) fc);
	fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
	add_timer(&fc->rnd_timer);

	for_each_possible_cpu(i)
		flow_cache_cpu_prepare(fc, per_cpu_ptr(fc->percpu, i));

	fc->hotcpu_notifier = (struct notifier_block){
		.notifier_call = flow_cache_cpu,
	};
	register_hotcpu_notifier(&fc->hotcpu_notifier);

	return 0;
}

/* Initcall: create the slab for struct flow_cache_entry (SLAB_PANIC,
 * so creation failure panics) and initialise the global flow cache.
 * Returns whatever flow_cache_init() returns.
 */
static int __init flow_cache_init_global(void)
{
	struct kmem_cache *entry_slab;

	entry_slab = kmem_cache_create("flow_cache",
				       sizeof(struct flow_cache_entry),
				       0, SLAB_PANIC, NULL);
	flow_cachep = entry_slab;

	return flow_cache_init(&flow_cache_global);
}

module_init(flow_cache_init_global);
Commit	Line	Data
1da177e4 LT	1	/* flow.c: Generic flow cache.
	2	*
	3	* Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru)
	4	* Copyright (C) 2003 David S. Miller (davem@redhat.com)
	5	*/
	6
	7	#include <linux/kernel.h>
	8	#include <linux/module.h>
	9	#include <linux/list.h>
	10	#include <linux/jhash.h>
	11	#include <linux/interrupt.h>
	12	#include <linux/mm.h>
	13	#include <linux/random.h>
	14	#include <linux/init.h>
	15	#include <linux/slab.h>
	16	#include <linux/smp.h>
	17	#include <linux/completion.h>
	18	#include <linux/percpu.h>
	19	#include <linux/bitops.h>
	20	#include <linux/notifier.h>
	21	#include <linux/cpu.h>
	22	#include <linux/cpumask.h>
4a3e2f71	23	#include <linux/mutex.h>
1da177e4 LT	24	#include <net/flow.h>
1da177e4 LT	25	#include <asm/atomic.h>
df71837d	26	#include <linux/security.h>
1da177e4 LT	27
1da177e4 LT	28	struct flow_cache_entry {
8e479560 TT	29	union {
	30	struct hlist_node hlist;
	31	struct list_head gc_list;
	32	} u;
fe1a5f03 TT	33	u16 family;
	34	u8 dir;
	35	u32 genid;
	36	struct flowi key;
	37	struct flow_cache_object *object;
1da177e4 LT	38	};
1da177e4 LT	39
d7997fe1	40	struct flow_cache_percpu {
8e479560	41	struct hlist_head *hash_table;
d7997fe1 TT	42	int hash_count;
	43	u32 hash_rnd;
	44	int hash_rnd_recalc;
	45	struct tasklet_struct flush_tasklet;
5f58a5c8	46	};
1da177e4 LT	47
1da177e4 LT	48	struct flow_flush_info {
fe1a5f03	49	struct flow_cache *cache;
d7997fe1 TT	50	atomic_t cpuleft;
d7997fe1 TT	51	struct completion completion;
1da177e4	52	};
1da177e4	53
d7997fe1 TT	54	struct flow_cache {
	55	u32 hash_shift;
	56	unsigned long order;
fe1a5f03	57	struct flow_cache_percpu *percpu;
d7997fe1 TT	58	struct notifier_block hotcpu_notifier;
	59	int low_watermark;
	60	int high_watermark;
	61	struct timer_list rnd_timer;
	62	};
	63
	64	atomic_t flow_cache_genid = ATOMIC_INIT(0);
9e34a5b5	65	EXPORT_SYMBOL(flow_cache_genid);
d7997fe1 TT	66	static struct flow_cache flow_cache_global;
	67	static struct kmem_cache *flow_cachep;
	68
8e479560 TT	69	static DEFINE_SPINLOCK(flow_cache_gc_lock);
	70	static LIST_HEAD(flow_cache_gc_list);
	71
d7997fe1 TT	72	#define flow_cache_hash_size(cache) (1 << (cache)->hash_shift)
d7997fe1 TT	73	#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ)
1da177e4 LT	74
	75	static void flow_cache_new_hashrnd(unsigned long arg)
	76	{
d7997fe1	77	struct flow_cache *fc = (void *) arg;
1da177e4 LT	78	int i;
1da177e4 LT	79
6f912042	80	for_each_possible_cpu(i)
d7997fe1	81	per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1;
1da177e4	82
d7997fe1 TT	83	fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
d7997fe1 TT	84	add_timer(&fc->rnd_timer);
1da177e4 LT	85	}
1da177e4 LT	86
fe1a5f03 TT	87	static int flow_entry_valid(struct flow_cache_entry *fle)
	88	{
	89	if (atomic_read(&flow_cache_genid) != fle->genid)
	90	return 0;
	91	if (fle->object && !fle->object->ops->check(fle->object))
	92	return 0;
	93	return 1;
	94	}
	95
8e479560	96	static void flow_entry_kill(struct flow_cache_entry *fle)
134b0fc5 JM	97	{
134b0fc5 JM	98	if (fle->object)
fe1a5f03	99	fle->object->ops->delete(fle->object);
134b0fc5	100	kmem_cache_free(flow_cachep, fle);
8e479560 TT	101	}
	102
	103	static void flow_cache_gc_task(struct work_struct *work)
	104	{
	105	struct list_head gc_list;
	106	struct flow_cache_entry *fce, *n;
	107
	108	INIT_LIST_HEAD(&gc_list);
	109	spin_lock_bh(&flow_cache_gc_lock);
	110	list_splice_tail_init(&flow_cache_gc_list, &gc_list);
	111	spin_unlock_bh(&flow_cache_gc_lock);
	112
	113	list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
	114	flow_entry_kill(fce);
	115	}
	116	static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task);
	117
	118	static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
	119	int deleted, struct list_head *gc_list)
	120	{
	121	if (deleted) {
	122	fcp->hash_count -= deleted;
	123	spin_lock_bh(&flow_cache_gc_lock);
	124	list_splice_tail(gc_list, &flow_cache_gc_list);
	125	spin_unlock_bh(&flow_cache_gc_lock);
	126	schedule_work(&flow_cache_gc_work);
	127	}
134b0fc5 JM	128	}
134b0fc5 JM	129
d7997fe1 TT	130	static void __flow_cache_shrink(struct flow_cache *fc,
	131	struct flow_cache_percpu *fcp,
	132	int shrink_to)
1da177e4	133	{
8e479560 TT	134	struct flow_cache_entry *fle;
	135	struct hlist_node *entry, *tmp;
	136	LIST_HEAD(gc_list);
	137	int i, deleted = 0;
1da177e4	138
d7997fe1	139	for (i = 0; i < flow_cache_hash_size(fc); i++) {
fe1a5f03	140	int saved = 0;
1da177e4	141
8e479560 TT	142	hlist_for_each_entry_safe(fle, entry, tmp,
8e479560 TT	143	&fcp->hash_table[i], u.hlist) {
fe1a5f03 TT	144	if (saved < shrink_to &&
	145	flow_entry_valid(fle)) {
	146	saved++;
fe1a5f03	147	} else {
8e479560 TT	148	deleted++;
	149	hlist_del(&fle->u.hlist);
	150	list_add_tail(&fle->u.gc_list, &gc_list);
fe1a5f03	151	}
1da177e4 LT	152	}
1da177e4 LT	153	}
8e479560 TT	154
8e479560 TT	155	flow_cache_queue_garbage(fcp, deleted, &gc_list);
1da177e4 LT	156	}
1da177e4 LT	157
d7997fe1 TT	158	static void flow_cache_shrink(struct flow_cache *fc,
d7997fe1 TT	159	struct flow_cache_percpu *fcp)
1da177e4	160	{
d7997fe1	161	int shrink_to = fc->low_watermark / flow_cache_hash_size(fc);
1da177e4	162
d7997fe1	163	__flow_cache_shrink(fc, fcp, shrink_to);
1da177e4 LT	164	}
1da177e4 LT	165
d7997fe1 TT	166	static void flow_new_hash_rnd(struct flow_cache *fc,
d7997fe1 TT	167	struct flow_cache_percpu *fcp)
1da177e4	168	{
d7997fe1 TT	169	get_random_bytes(&fcp->hash_rnd, sizeof(u32));
	170	fcp->hash_rnd_recalc = 0;
	171	__flow_cache_shrink(fc, fcp, 0);
1da177e4 LT	172	}
1da177e4 LT	173
d7997fe1 TT	174	static u32 flow_hash_code(struct flow_cache *fc,
	175	struct flow_cache_percpu *fcp,
	176	struct flowi *key)
1da177e4 LT	177	{
	178	u32 *k = (u32 *) key;
	179
d7997fe1 TT	180	return (jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd)
d7997fe1 TT	181	& (flow_cache_hash_size(fc) - 1));
1da177e4 LT	182	}
	183
	184	#if (BITS_PER_LONG == 64)
	185	typedef u64 flow_compare_t;
	186	#else
	187	typedef u32 flow_compare_t;
	188	#endif
	189
1da177e4 LT	190	/* I hear what you're saying, use memcmp. But memcmp cannot make
	191	* important assumptions that we can here, such as alignment and
	192	* constant size.
	193	*/
	194	static int flow_key_compare(struct flowi *key1, struct flowi *key2)
	195	{
	196	flow_compare_t *k1, *k1_lim, *k2;
	197	const int n_elem = sizeof(struct flowi) / sizeof(flow_compare_t);
	198
f0fe91de	199	BUILD_BUG_ON(sizeof(struct flowi) % sizeof(flow_compare_t));
1da177e4 LT	200
	201	k1 = (flow_compare_t *) key1;
	202	k1_lim = k1 + n_elem;
	203
	204	k2 = (flow_compare_t *) key2;
	205
	206	do {
	207	if (*k1++ != *k2++)
	208	return 1;
	209	} while (k1 < k1_lim);
	210
	211	return 0;
	212	}
	213
fe1a5f03 TT	214	struct flow_cache_object *
	215	flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
	216	flow_resolve_t resolver, void *ctx)
1da177e4	217	{
d7997fe1 TT	218	struct flow_cache *fc = &flow_cache_global;
d7997fe1 TT	219	struct flow_cache_percpu *fcp;
8e479560 TT	220	struct flow_cache_entry *fle, *tfle;
8e479560 TT	221	struct hlist_node *entry;
fe1a5f03	222	struct flow_cache_object *flo;
1da177e4	223	unsigned int hash;
1da177e4 LT	224
1da177e4 LT	225	local_bh_disable();
7a9b2d59	226	fcp = this_cpu_ptr(fc->percpu);
1da177e4 LT	227
1da177e4 LT	228	fle = NULL;
fe1a5f03	229	flo = NULL;
1da177e4 LT	230	/* Packet really early in init? Making flow_cache_init a
1da177e4 LT	231	* pre-smp initcall would solve this. --RR */
d7997fe1	232	if (!fcp->hash_table)
1da177e4 LT	233	goto nocache;
1da177e4 LT	234
d7997fe1 TT	235	if (fcp->hash_rnd_recalc)
d7997fe1 TT	236	flow_new_hash_rnd(fc, fcp);
1da177e4	237
fe1a5f03	238	hash = flow_hash_code(fc, fcp, key);
8e479560 TT	239	hlist_for_each_entry(tfle, entry, &fcp->hash_table[hash], u.hlist) {
	240	if (tfle->family == family &&
	241	tfle->dir == dir &&
	242	flow_key_compare(key, &tfle->key) == 0) {
	243	fle = tfle;
1da177e4	244	break;
8e479560	245	}
1da177e4 LT	246	}
1da177e4 LT	247
fe1a5f03	248	if (unlikely(!fle)) {
d7997fe1 TT	249	if (fcp->hash_count > fc->high_watermark)
d7997fe1 TT	250	flow_cache_shrink(fc, fcp);
1da177e4	251
54e6ecb2	252	fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
1da177e4	253	if (fle) {
1da177e4 LT	254	fle->family = family;
	255	fle->dir = dir;
	256	memcpy(&fle->key, key, sizeof(*key));
	257	fle->object = NULL;
8e479560	258	hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
d7997fe1	259	fcp->hash_count++;
1da177e4	260	}
fe1a5f03 TT	261	} else if (likely(fle->genid == atomic_read(&flow_cache_genid))) {
	262	flo = fle->object;
	263	if (!flo)
	264	goto ret_object;
	265	flo = flo->ops->get(flo);
	266	if (flo)
	267	goto ret_object;
	268	} else if (fle->object) {
	269	flo = fle->object;
	270	flo->ops->delete(flo);
	271	fle->object = NULL;
1da177e4 LT	272	}
	273
	274	nocache:
fe1a5f03 TT	275	flo = NULL;
	276	if (fle) {
	277	flo = fle->object;
	278	fle->object = NULL;
	279	}
	280	flo = resolver(net, key, family, dir, flo, ctx);
	281	if (fle) {
	282	fle->genid = atomic_read(&flow_cache_genid);
	283	if (!IS_ERR(flo))
	284	fle->object = flo;
	285	else
	286	fle->genid--;
	287	} else {
	288	if (flo && !IS_ERR(flo))
	289	flo->ops->delete(flo);
1da177e4	290	}
fe1a5f03 TT	291	ret_object:
	292	local_bh_enable();
	293	return flo;
1da177e4	294	}
9e34a5b5	295	EXPORT_SYMBOL(flow_cache_lookup);
1da177e4 LT	296
	297	static void flow_cache_flush_tasklet(unsigned long data)
	298	{
	299	struct flow_flush_info *info = (void *)data;
d7997fe1 TT	300	struct flow_cache *fc = info->cache;
d7997fe1 TT	301	struct flow_cache_percpu *fcp;
8e479560 TT	302	struct flow_cache_entry *fle;
	303	struct hlist_node *entry, *tmp;
	304	LIST_HEAD(gc_list);
	305	int i, deleted = 0;
1da177e4	306
7a9b2d59	307	fcp = this_cpu_ptr(fc->percpu);
d7997fe1	308	for (i = 0; i < flow_cache_hash_size(fc); i++) {
8e479560 TT	309	hlist_for_each_entry_safe(fle, entry, tmp,
8e479560 TT	310	&fcp->hash_table[i], u.hlist) {
fe1a5f03	311	if (flow_entry_valid(fle))
1da177e4 LT	312	continue;
1da177e4 LT	313
8e479560 TT	314	deleted++;
	315	hlist_del(&fle->u.hlist);
	316	list_add_tail(&fle->u.gc_list, &gc_list);
1da177e4 LT	317	}
	318	}
	319
8e479560 TT	320	flow_cache_queue_garbage(fcp, deleted, &gc_list);
8e479560 TT	321
1da177e4 LT	322	if (atomic_dec_and_test(&info->cpuleft))
	323	complete(&info->completion);
	324	}
	325
1da177e4 LT	326	static void flow_cache_flush_per_cpu(void *data)
	327	{
	328	struct flow_flush_info *info = data;
	329	int cpu;
	330	struct tasklet_struct *tasklet;
	331
	332	cpu = smp_processor_id();
d7997fe1	333	tasklet = &per_cpu_ptr(info->cache->percpu, cpu)->flush_tasklet;
1da177e4 LT	334	tasklet->data = (unsigned long)info;
	335	tasklet_schedule(tasklet);
	336	}
	337
	338	void flow_cache_flush(void)
	339	{
	340	struct flow_flush_info info;
4a3e2f71	341	static DEFINE_MUTEX(flow_flush_sem);
1da177e4 LT	342
1da177e4 LT	343	/* Don't want cpus going down or up during this. */
86ef5c9a	344	get_online_cpus();
4a3e2f71	345	mutex_lock(&flow_flush_sem);
d7997fe1	346	info.cache = &flow_cache_global;
1da177e4 LT	347	atomic_set(&info.cpuleft, num_online_cpus());
	348	init_completion(&info.completion);
	349
	350	local_bh_disable();
8691e5a8	351	smp_call_function(flow_cache_flush_per_cpu, &info, 0);
1da177e4 LT	352	flow_cache_flush_tasklet((unsigned long)&info);
	353	local_bh_enable();
	354
	355	wait_for_completion(&info.completion);
4a3e2f71	356	mutex_unlock(&flow_flush_sem);
86ef5c9a	357	put_online_cpus();
1da177e4 LT	358	}
1da177e4 LT	359
d7997fe1 TT	360	static void __init flow_cache_cpu_prepare(struct flow_cache *fc,
d7997fe1 TT	361	struct flow_cache_percpu *fcp)
1da177e4	362	{
8e479560	363	fcp->hash_table = (struct hlist_head *)
d7997fe1 TT	364	__get_free_pages(GFP_KERNEL|__GFP_ZERO, fc->order);
	365	if (!fcp->hash_table)
	366	panic("NET: failed to allocate flow cache order %lu\n", fc->order);
	367
	368	fcp->hash_rnd_recalc = 1;
	369	fcp->hash_count = 0;
	370	tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
1da177e4 LT	371	}
1da177e4 LT	372
1da177e4 LT	373	static int flow_cache_cpu(struct notifier_block *nfb,
	374	unsigned long action,
	375	void *hcpu)
	376	{
d7997fe1 TT	377	struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier);
	378	int cpu = (unsigned long) hcpu;
	379	struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
	380
8bb78442	381	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
d7997fe1	382	__flow_cache_shrink(fc, fcp, 0);
1da177e4 LT	383	return NOTIFY_OK;
1da177e4 LT	384	}
1da177e4	385
d7997fe1	386	static int flow_cache_init(struct flow_cache *fc)
1da177e4	387	{
d7997fe1	388	unsigned long order;
1da177e4 LT	389	int i;
1da177e4 LT	390
d7997fe1 TT	391	fc->hash_shift = 10;
	392	fc->low_watermark = 2 * flow_cache_hash_size(fc);
	393	fc->high_watermark = 4 * flow_cache_hash_size(fc);
	394
	395	for (order = 0;
	396	(PAGE_SIZE << order) <
8e479560	397	(sizeof(struct hlist_head)*flow_cache_hash_size(fc));
d7997fe1 TT	398	order++)
	399	/* NOTHING */;
	400	fc->order = order;
	401	fc->percpu = alloc_percpu(struct flow_cache_percpu);
1da177e4	402
d7997fe1 TT	403	setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
	404	(unsigned long) fc);
	405	fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
	406	add_timer(&fc->rnd_timer);
1da177e4	407
6f912042	408	for_each_possible_cpu(i)
d7997fe1 TT	409	flow_cache_cpu_prepare(fc, per_cpu_ptr(fc->percpu, i));
	410
	411	fc->hotcpu_notifier = (struct notifier_block){
	412	.notifier_call = flow_cache_cpu,
	413	};
	414	register_hotcpu_notifier(&fc->hotcpu_notifier);
1da177e4	415
1da177e4 LT	416	return 0;
	417	}
	418
d7997fe1 TT	419	static int __init flow_cache_init_global(void)
	420	{
	421	flow_cachep = kmem_cache_create("flow_cache",
	422	sizeof(struct flow_cache_entry),
	423	0, SLAB_PANIC, NULL);
	424
	425	return flow_cache_init(&flow_cache_global);
	426	}
	427
	428	module_init(flow_cache_init_global);