1/* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
4
5/* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org>
7 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
14 * - new API and handling of conntrack/nat helpers
15 * - now capable of multiple expectations for one master
16 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
17 * - add usage/reference counts to ip_conntrack_expect
18 * - export ip_conntrack[_expect]_{find_get,put} functions
19 * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
20 * - generalize L3 protocol dependent part.
21 * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
22 * - add support for various sizes of conntrack structures.
23 *
24 * Derived from net/ipv4/netfilter/ip_conntrack_core.c
25 */
26
27#include <linux/config.h>
28#include <linux/types.h>
29#include <linux/netfilter.h>
30#include <linux/module.h>
31#include <linux/skbuff.h>
32#include <linux/proc_fs.h>
33#include <linux/vmalloc.h>
34#include <linux/stddef.h>
35#include <linux/slab.h>
36#include <linux/random.h>
37#include <linux/jhash.h>
38#include <linux/err.h>
39#include <linux/percpu.h>
40#include <linux/moduleparam.h>
41#include <linux/notifier.h>
42#include <linux/kernel.h>
43#include <linux/netdevice.h>
44#include <linux/socket.h>
45
46/* This rwlock protects the main hash table, protocol/helper/expected
47 registrations, conntrack timers */
48#define ASSERT_READ_LOCK(x)
49#define ASSERT_WRITE_LOCK(x)
50
51#include <net/netfilter/nf_conntrack.h>
52#include <net/netfilter/nf_conntrack_l3proto.h>
53#include <net/netfilter/nf_conntrack_protocol.h>
54#include <net/netfilter/nf_conntrack_helper.h>
55#include <net/netfilter/nf_conntrack_core.h>
56#include <linux/netfilter_ipv4/listhelp.h>
57
58#define NF_CONNTRACK_VERSION "0.4.1"
59
60#if 0
61#define DEBUGP printk
62#else
63#define DEBUGP(format, args...)
64#endif
65
66DEFINE_RWLOCK(nf_conntrack_lock);
67
68/* nf_conntrack_standalone needs this */
69atomic_t nf_conntrack_count = ATOMIC_INIT(0);
70
71void (*nf_conntrack_destroyed)(struct nf_conn *conntrack) = NULL;
72LIST_HEAD(nf_conntrack_expect_list);
73struct nf_conntrack_protocol **nf_ct_protos[PF_MAX];
74struct nf_conntrack_l3proto *nf_ct_l3protos[PF_MAX];
75static LIST_HEAD(helpers);
76unsigned int nf_conntrack_htable_size = 0;
77int nf_conntrack_max;
78struct list_head *nf_conntrack_hash;
79static kmem_cache_t *nf_conntrack_expect_cachep;
80struct nf_conn nf_conntrack_untracked;
81unsigned int nf_ct_log_invalid;
82static LIST_HEAD(unconfirmed);
83static int nf_conntrack_vmalloc;
84
85#ifdef CONFIG_NF_CONNTRACK_EVENTS
86struct notifier_block *nf_conntrack_chain;
87struct notifier_block *nf_conntrack_expect_chain;
88
89DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);
90
91/* deliver cached events and clear cache entry - must be called with locally
92 * disabled softirqs */
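/* Events are cached per CPU so that several state changes caused by a
 * single packet are delivered as one notification, either explicitly via
 * nf_ct_deliver_cached_events() or when the next conntrack starts using
 * the cache slot (see __nf_ct_event_cache_init below). */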
93static inline void
94__nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
95{
96 DEBUGP("ecache: delivering events for %p\n", ecache->ct);
97 if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
98 && ecache->events)
99 notifier_call_chain(&nf_conntrack_chain, ecache->events,
100 ecache->ct);
101
102 ecache->events = 0;
103 nf_ct_put(ecache->ct);
104 ecache->ct = NULL;
105}
106
107/* Deliver all cached events for a particular conntrack. This is called
108 * by code prior to async packet handling for freeing the skb */
109void nf_ct_deliver_cached_events(const struct nf_conn *ct)
110{
111 struct nf_conntrack_ecache *ecache;
112
113 local_bh_disable();
114 ecache = &__get_cpu_var(nf_conntrack_ecache);
115 if (ecache->ct == ct)
116 __nf_ct_deliver_cached_events(ecache);
117 local_bh_enable();
118}
119
120/* Deliver any old pending cached events if the current conntrack != the old one */
121void __nf_ct_event_cache_init(struct nf_conn *ct)
122{
123 struct nf_conntrack_ecache *ecache;
124
125 /* take care of delivering potentially old events */
126 ecache = &__get_cpu_var(nf_conntrack_ecache);
127 BUG_ON(ecache->ct == ct);
128 if (ecache->ct)
129 __nf_ct_deliver_cached_events(ecache);
130 /* initialize for this conntrack/packet */
131 ecache->ct = ct;
132 nf_conntrack_get(&ct->ct_general);
133}
134
135/* flush the event cache - touches other CPUs' data and must not be called
136 * while packets are still passing through the code */
137static void nf_ct_event_cache_flush(void)
138{
139 struct nf_conntrack_ecache *ecache;
140 int cpu;
141
142 for_each_cpu(cpu) {
143 ecache = &per_cpu(nf_conntrack_ecache, cpu);
144 if (ecache->ct)
145 nf_ct_put(ecache->ct);
146 }
147}
148#else
149static inline void nf_ct_event_cache_flush(void) {}
150#endif /* CONFIG_NF_CONNTRACK_EVENTS */
151
152DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
153EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat);
154
155/*
156 * This scheme offers various sizes of "struct nf_conn" depending on
157 * the features required (helper, nat, ...)
158 */
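/* For example, the NF_CT_F_BASIC cache holds a bare "struct nf_conn",
 * while nf_conntrack_helper_register() below registers an NF_CT_F_HELP
 * cache whose objects additionally carry an aligned
 * "union nf_conntrack_help" inside conntrack->data. */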
159
160#define NF_CT_FEATURES_NAMELEN 256
161static struct {
162 /* name of slab cache. printed in /proc/slabinfo */
163 char *name;
164
165 /* size of slab cache */
166 size_t size;
167
168 /* slab cache pointer */
169 kmem_cache_t *cachep;
170
171 /* allocated slab cache + modules which use this slab cache */
172 int use;
173
174 /* Initialization */
175 int (*init_conntrack)(struct nf_conn *, u_int32_t);
176
177} nf_ct_cache[NF_CT_F_NUM];
178
179/* protect members of nf_ct_cache except "use" */
180DEFINE_RWLOCK(nf_ct_cache_lock);
181
182/* This avoids calling kmem_cache_create() with the same name simultaneously */
183DECLARE_MUTEX(nf_ct_cache_mutex);
184
185extern struct nf_conntrack_protocol nf_conntrack_generic_protocol;
186struct nf_conntrack_protocol *
187nf_ct_find_proto(u_int16_t l3proto, u_int8_t protocol)
188{
189 if (unlikely(nf_ct_protos[l3proto] == NULL))
190 return &nf_conntrack_generic_protocol;
191
192 return nf_ct_protos[l3proto][protocol];
193}
194
195static int nf_conntrack_hash_rnd_initted;
196static unsigned int nf_conntrack_hash_rnd;
197
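/* Tuple hashing: a = jhash of the source L3 address keyed by
 * l3num/protonum, b = jhash of the destination L3 address keyed by the
 * L4 ids (ports etc.); the two are then folded together with the
 * per-boot random seed so hash chains cannot be predicted remotely. */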
198static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
199 unsigned int size, unsigned int rnd)
200{
201 unsigned int a, b;
202 a = jhash((void *)tuple->src.u3.all, sizeof(tuple->src.u3.all),
203 ((tuple->src.l3num) << 16) | tuple->dst.protonum);
204 b = jhash((void *)tuple->dst.u3.all, sizeof(tuple->dst.u3.all),
205 (tuple->src.u.all << 16) | tuple->dst.u.all);
206
207 return jhash_2words(a, b, rnd) % size;
208}
209
210static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
211{
212 return __hash_conntrack(tuple, nf_conntrack_htable_size,
213 nf_conntrack_hash_rnd);
214}
215
216/* Initialize a "struct nf_conn" which has space for a helper */
217static int
218init_conntrack_for_helper(struct nf_conn *conntrack, u_int32_t features)
219{
220
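 /* Point conntrack->help at the first address inside conntrack->data
  * that satisfies the alignment of union nf_conntrack_help: add
  * (alignment - 1), then mask the low bits back down. */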
221 conntrack->help = (union nf_conntrack_help *)
222 (((unsigned long)conntrack->data
223 + (__alignof__(union nf_conntrack_help) - 1))
224 & (~((unsigned long)(__alignof__(union nf_conntrack_help) -1))));
225 return 0;
226}
227
228int nf_conntrack_register_cache(u_int32_t features, const char *name,
229 size_t size,
230 int (*init)(struct nf_conn *, u_int32_t))
231{
232 int ret = 0;
233 char *cache_name;
234 kmem_cache_t *cachep;
235
236 DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n",
237 features, name, size);
238
239 if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) {
240 DEBUGP("nf_conntrack_register_cache: invalid features.: 0x%x\n",
241 features);
242 return -EINVAL;
243 }
244
245 down(&nf_ct_cache_mutex);
246
247 write_lock_bh(&nf_ct_cache_lock);
248 /* e.g: multiple helpers are loaded */
249 if (nf_ct_cache[features].use > 0) {
250 DEBUGP("nf_conntrack_register_cache: already registered.\n");
251 if ((!strncmp(nf_ct_cache[features].name, name,
252 NF_CT_FEATURES_NAMELEN))
253 && nf_ct_cache[features].size == size
254 && nf_ct_cache[features].init_conntrack == init) {
255 DEBUGP("nf_conntrack_register_cache: reusing.\n");
256 nf_ct_cache[features].use++;
257 ret = 0;
258 } else
259 ret = -EBUSY;
260
261 write_unlock_bh(&nf_ct_cache_lock);
262 up(&nf_ct_cache_mutex);
263 return ret;
264 }
265 write_unlock_bh(&nf_ct_cache_lock);
266
267 /*
268 * The memory space for the name of the slab cache must stay alive
269 * until the cache is destroyed.
270 */
271 cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_ATOMIC);
272 if (cache_name == NULL) {
273 DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n");
274 ret = -ENOMEM;
275 goto out_up_mutex;
276 }
277
278 if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN)
279 >= NF_CT_FEATURES_NAMELEN) {
280 printk("nf_conntrack_register_cache: name too long\n");
281 ret = -EINVAL;
282 goto out_free_name;
283 }
284
285 cachep = kmem_cache_create(cache_name, size, 0, 0,
286 NULL, NULL);
287 if (!cachep) {
288 printk("nf_conntrack_register_cache: Can't create slab cache "
289 "for the features = 0x%x\n", features);
290 ret = -ENOMEM;
291 goto out_free_name;
292 }
293
294 write_lock_bh(&nf_ct_cache_lock);
295 nf_ct_cache[features].use = 1;
296 nf_ct_cache[features].size = size;
297 nf_ct_cache[features].init_conntrack = init;
298 nf_ct_cache[features].cachep = cachep;
299 nf_ct_cache[features].name = cache_name;
300 write_unlock_bh(&nf_ct_cache_lock);
301
302 goto out_up_mutex;
303
304out_free_name:
305 kfree(cache_name);
306out_up_mutex:
307 up(&nf_ct_cache_mutex);
308 return ret;
309}
310
311/* FIXME: Currently, only nf_conntrack_cleanup() can call this function. */
312void nf_conntrack_unregister_cache(u_int32_t features)
313{
314 kmem_cache_t *cachep;
315 char *name;
316
317 /*
318 * This ensures that kmem_cache_create() isn't called before the
319 * slab cache has been destroyed.
320 */
321 DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features);
322 down(&nf_ct_cache_mutex);
323
324 write_lock_bh(&nf_ct_cache_lock);
325 if (--nf_ct_cache[features].use > 0) {
326 write_unlock_bh(&nf_ct_cache_lock);
327 up(&nf_ct_cache_mutex);
328 return;
329 }
330 cachep = nf_ct_cache[features].cachep;
331 name = nf_ct_cache[features].name;
332 nf_ct_cache[features].cachep = NULL;
333 nf_ct_cache[features].name = NULL;
334 nf_ct_cache[features].init_conntrack = NULL;
335 nf_ct_cache[features].size = 0;
336 write_unlock_bh(&nf_ct_cache_lock);
337
338 synchronize_net();
339
340 kmem_cache_destroy(cachep);
341 kfree(name);
342
343 up(&nf_ct_cache_mutex);
344}
345
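/* Fill in a tuple from a packet: the L3 part (addresses) is extracted at
 * nhoff via the l3proto, the L4 part (e.g. ports) at dataoff via the
 * protocol; returns 0 if either layer cannot extract its part. */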
346int
347nf_ct_get_tuple(const struct sk_buff *skb,
348 unsigned int nhoff,
349 unsigned int dataoff,
350 u_int16_t l3num,
351 u_int8_t protonum,
352 struct nf_conntrack_tuple *tuple,
353 const struct nf_conntrack_l3proto *l3proto,
354 const struct nf_conntrack_protocol *protocol)
355{
356 NF_CT_TUPLE_U_BLANK(tuple);
357
358 tuple->src.l3num = l3num;
359 if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
360 return 0;
361
362 tuple->dst.protonum = protonum;
363 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
364
365 return protocol->pkt_to_tuple(skb, dataoff, tuple);
366}
367
368int
369nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
370 const struct nf_conntrack_tuple *orig,
371 const struct nf_conntrack_l3proto *l3proto,
372 const struct nf_conntrack_protocol *protocol)
373{
374 NF_CT_TUPLE_U_BLANK(inverse);
375
376 inverse->src.l3num = orig->src.l3num;
377 if (l3proto->invert_tuple(inverse, orig) == 0)
378 return 0;
379
380 inverse->dst.dir = !orig->dst.dir;
381
382 inverse->dst.protonum = orig->dst.protonum;
383 return protocol->invert_tuple(inverse, orig);
384}
385
386/* nf_conntrack_expect helper functions */
387static void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
388{
389 ASSERT_WRITE_LOCK(&nf_conntrack_lock);
390 NF_CT_ASSERT(!timer_pending(&exp->timeout));
391 list_del(&exp->list);
392 NF_CT_STAT_INC(expect_delete);
393 exp->master->expecting--;
394 nf_conntrack_expect_put(exp);
395}
396
397static void expectation_timed_out(unsigned long ul_expect)
398{
399 struct nf_conntrack_expect *exp = (void *)ul_expect;
400
401 write_lock_bh(&nf_conntrack_lock);
402 nf_ct_unlink_expect(exp);
403 write_unlock_bh(&nf_conntrack_lock);
404 nf_conntrack_expect_put(exp);
405}
406
407/* If an expectation for this connection is found, it gets deleted from
408 * the global list and then returned. */
409static struct nf_conntrack_expect *
410find_expectation(const struct nf_conntrack_tuple *tuple)
411{
412 struct nf_conntrack_expect *i;
413
414 list_for_each_entry(i, &nf_conntrack_expect_list, list) {
415 /* If master is not in hash table yet (ie. packet hasn't left
416 this machine yet), how can other end know about expected?
417 Hence these are not the droids you are looking for (if
418 master ct never got confirmed, we'd hold a reference to it
419 and weird things would happen to future packets). */
420 if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
421 && nf_ct_is_confirmed(i->master)) {
422 if (i->flags & NF_CT_EXPECT_PERMANENT) {
423 atomic_inc(&i->use);
424 return i;
425 } else if (del_timer(&i->timeout)) {
426 nf_ct_unlink_expect(i);
427 return i;
428 }
429 }
430 }
431 return NULL;
432}
433
434/* delete all expectations for this conntrack */
435static void remove_expectations(struct nf_conn *ct)
436{
437 struct nf_conntrack_expect *i, *tmp;
438
439 /* Optimization: most connections never expect any others. */
440 if (ct->expecting == 0)
441 return;
442
443 list_for_each_entry_safe(i, tmp, &nf_conntrack_expect_list, list) {
444 if (i->master == ct && del_timer(&i->timeout)) {
445 nf_ct_unlink_expect(i);
446 nf_conntrack_expect_put(i);
447 }
448 }
449}
450
451static void
452clean_from_lists(struct nf_conn *ct)
453{
454 unsigned int ho, hr;
455
456 DEBUGP("clean_from_lists(%p)\n", ct);
457 ASSERT_WRITE_LOCK(&nf_conntrack_lock);
458
459 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
460 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
461 LIST_DELETE(&nf_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
462 LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
463
464 /* Destroy all pending expectations */
465 remove_expectations(ct);
466}
467
468static void
469destroy_conntrack(struct nf_conntrack *nfct)
470{
471 struct nf_conn *ct = (struct nf_conn *)nfct;
472 struct nf_conntrack_l3proto *l3proto;
473 struct nf_conntrack_protocol *proto;
474
475 DEBUGP("destroy_conntrack(%p)\n", ct);
476 NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
477 NF_CT_ASSERT(!timer_pending(&ct->timeout));
478
479 nf_conntrack_event(IPCT_DESTROY, ct);
480 set_bit(IPS_DYING_BIT, &ct->status);
481
482 /* To make sure we don't get any weird locking issues here:
483 * destroy_conntrack() MUST NOT be called with a write lock
484 * to nf_conntrack_lock!!! -HW */
485 l3proto = nf_ct_find_l3proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num);
486 if (l3proto && l3proto->destroy)
487 l3proto->destroy(ct);
488
489 proto = nf_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num,
490 ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
491 if (proto && proto->destroy)
492 proto->destroy(ct);
493
494 if (nf_conntrack_destroyed)
495 nf_conntrack_destroyed(ct);
496
497 write_lock_bh(&nf_conntrack_lock);
498 /* Expectations will have been removed in clean_from_lists,
499 * except TFTP can create an expectation on the first packet,
500 * before connection is in the list, so we need to clean here,
501 * too. */
502 remove_expectations(ct);
503
504 /* We overload first tuple to link into unconfirmed list. */
505 if (!nf_ct_is_confirmed(ct)) {
506 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
507 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
508 }
509
510 NF_CT_STAT_INC(delete);
511 write_unlock_bh(&nf_conntrack_lock);
512
513 if (ct->master)
514 nf_ct_put(ct->master);
515
516 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
517 nf_conntrack_free(ct);
518}
519
520static void death_by_timeout(unsigned long ul_conntrack)
521{
522 struct nf_conn *ct = (void *)ul_conntrack;
523
524 write_lock_bh(&nf_conntrack_lock);
525 /* Inside lock so preempt is disabled on module removal path.
526 * Otherwise we can get spurious warnings. */
527 NF_CT_STAT_INC(delete_list);
528 clean_from_lists(ct);
529 write_unlock_bh(&nf_conntrack_lock);
530 nf_ct_put(ct);
531}
532
533static inline int
534conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i,
535 const struct nf_conntrack_tuple *tuple,
536 const struct nf_conn *ignored_conntrack)
537{
538 ASSERT_READ_LOCK(&nf_conntrack_lock);
539 return nf_ct_tuplehash_to_ctrack(i) != ignored_conntrack
540 && nf_ct_tuple_equal(tuple, &i->tuple);
541}
542
543static struct nf_conntrack_tuple_hash *
544__nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
545 const struct nf_conn *ignored_conntrack)
546{
547 struct nf_conntrack_tuple_hash *h;
548 unsigned int hash = hash_conntrack(tuple);
549
550 ASSERT_READ_LOCK(&nf_conntrack_lock);
551 list_for_each_entry(h, &nf_conntrack_hash[hash], list) {
552 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
553 NF_CT_STAT_INC(found);
554 return h;
555 }
556 NF_CT_STAT_INC(searched);
557 }
558
559 return NULL;
560}
561
562/* Find a connection corresponding to a tuple. */
563struct nf_conntrack_tuple_hash *
564nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple,
565 const struct nf_conn *ignored_conntrack)
566{
567 struct nf_conntrack_tuple_hash *h;
568
569 read_lock_bh(&nf_conntrack_lock);
570 h = __nf_conntrack_find(tuple, ignored_conntrack);
571 if (h)
572 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
573 read_unlock_bh(&nf_conntrack_lock);
574
575 return h;
576}
577
578/* Confirm a connection given skb; places it in hash table */
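/* Confirmation normally happens from the last netfilter hooks
 * (LOCAL_IN/POST_ROUTING), i.e. only after the packet has made it past
 * all rules, so entries for dropped packets never enter the hash. */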
579int
580__nf_conntrack_confirm(struct sk_buff **pskb)
581{
582 unsigned int hash, repl_hash;
583 struct nf_conn *ct;
584 enum ip_conntrack_info ctinfo;
585
586 ct = nf_ct_get(*pskb, &ctinfo);
587
588 /* ipt_REJECT uses nf_conntrack_attach to attach related
589 ICMP/TCP RST packets in the other direction. The actual packet
590 which created the connection will be IP_CT_NEW or, for an
591 expected connection, IP_CT_RELATED. */
592 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
593 return NF_ACCEPT;
594
595 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
596 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
597
598 /* We're not in hash table, and we refuse to set up related
599 connections for unconfirmed conns. But packet copies and
600 REJECT will give spurious warnings here. */
601 /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
602
603 /* No external references means no one else could have
604 confirmed us. */
605 NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
606 DEBUGP("Confirming conntrack %p\n", ct);
607
608 write_lock_bh(&nf_conntrack_lock);
609
610 /* See if there's one in the list already, including reverse:
611 NAT could have grabbed it without realizing, since we're
612 not in the hash. If there is, we lost the race. */
613 if (!LIST_FIND(&nf_conntrack_hash[hash],
614 conntrack_tuple_cmp,
615 struct nf_conntrack_tuple_hash *,
616 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
617 && !LIST_FIND(&nf_conntrack_hash[repl_hash],
618 conntrack_tuple_cmp,
619 struct nf_conntrack_tuple_hash *,
620 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
621 /* Remove from unconfirmed list */
622 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
623
624 list_prepend(&nf_conntrack_hash[hash],
625 &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
626 list_prepend(&nf_conntrack_hash[repl_hash],
627 &ct->tuplehash[IP_CT_DIR_REPLY]);
628 /* Timer relative to confirmation time, not original
629 setting time, otherwise we'd get timer wrap in
630 weird delay cases. */
631 ct->timeout.expires += jiffies;
632 add_timer(&ct->timeout);
633 atomic_inc(&ct->ct_general.use);
634 set_bit(IPS_CONFIRMED_BIT, &ct->status);
635 NF_CT_STAT_INC(insert);
636 write_unlock_bh(&nf_conntrack_lock);
637 if (ct->helper)
638 nf_conntrack_event_cache(IPCT_HELPER, *pskb);
639#ifdef CONFIG_NF_NAT_NEEDED
640 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
641 test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
642 nf_conntrack_event_cache(IPCT_NATINFO, *pskb);
643#endif
644 nf_conntrack_event_cache(master_ct(ct) ?
645 IPCT_RELATED : IPCT_NEW, *pskb);
646 return NF_ACCEPT;
647 }
648
649 NF_CT_STAT_INC(insert_failed);
650 write_unlock_bh(&nf_conntrack_lock);
651 return NF_DROP;
652}
653
654/* Returns true if a connection corresponds to the tuple (required
655 for NAT). */
656int
657nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
658 const struct nf_conn *ignored_conntrack)
659{
660 struct nf_conntrack_tuple_hash *h;
661
662 read_lock_bh(&nf_conntrack_lock);
663 h = __nf_conntrack_find(tuple, ignored_conntrack);
664 read_unlock_bh(&nf_conntrack_lock);
665
666 return h != NULL;
667}
668
669/* There's a small race here where we may free a just-assured
670 connection. Too bad: we're in trouble anyway. */
671static inline int unreplied(const struct nf_conntrack_tuple_hash *i)
672{
673 return !(test_bit(IPS_ASSURED_BIT,
674 &nf_ct_tuplehash_to_ctrack(i)->status));
675}
676
677static int early_drop(struct list_head *chain)
678{
679 /* Traverse backwards: gives us oldest, which is roughly LRU */
680 struct nf_conntrack_tuple_hash *h;
681 struct nf_conn *ct = NULL;
682 int dropped = 0;
683
684 read_lock_bh(&nf_conntrack_lock);
685 h = LIST_FIND_B(chain, unreplied, struct nf_conntrack_tuple_hash *);
686 if (h) {
687 ct = nf_ct_tuplehash_to_ctrack(h);
688 atomic_inc(&ct->ct_general.use);
689 }
690 read_unlock_bh(&nf_conntrack_lock);
691
692 if (!ct)
693 return dropped;
694
695 if (del_timer(&ct->timeout)) {
696 death_by_timeout((unsigned long)ct);
697 dropped = 1;
698 NF_CT_STAT_INC(early_drop);
699 }
700 nf_ct_put(ct);
701 return dropped;
702}
703
704static inline int helper_cmp(const struct nf_conntrack_helper *i,
705 const struct nf_conntrack_tuple *rtuple)
706{
707 return nf_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
708}
709
710static struct nf_conntrack_helper *
711nf_ct_find_helper(const struct nf_conntrack_tuple *tuple)
712{
713 return LIST_FIND(&helpers, helper_cmp,
714 struct nf_conntrack_helper *,
715 tuple);
716}
717
718static struct nf_conn *
719__nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
720 const struct nf_conntrack_tuple *repl,
721 const struct nf_conntrack_l3proto *l3proto)
722{
723 struct nf_conn *conntrack = NULL;
724 u_int32_t features = 0;
725
726 if (!nf_conntrack_hash_rnd_initted) {
727 get_random_bytes(&nf_conntrack_hash_rnd, 4);
728 nf_conntrack_hash_rnd_initted = 1;
729 }
730
731 if (nf_conntrack_max
732 && atomic_read(&nf_conntrack_count) >= nf_conntrack_max) {
733 unsigned int hash = hash_conntrack(orig);
734 /* Try dropping from this hash chain. */
735 if (!early_drop(&nf_conntrack_hash[hash])) {
736 if (net_ratelimit())
737 printk(KERN_WARNING
738 "nf_conntrack: table full, dropping"
739 " packet.\n");
740 return ERR_PTR(-ENOMEM);
741 }
742 }
743
744 /* find features needed by this conntrack. */
745 features = l3proto->get_features(orig);
746 read_lock_bh(&nf_conntrack_lock);
747 if (nf_ct_find_helper(repl) != NULL)
748 features |= NF_CT_F_HELP;
749 read_unlock_bh(&nf_conntrack_lock);
750
751 DEBUGP("nf_conntrack_alloc: features=0x%x\n", features);
752
753 read_lock_bh(&nf_ct_cache_lock);
754
755 if (!nf_ct_cache[features].use) {
756 DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n",
757 features);
758 goto out;
759 }
760
761 conntrack = kmem_cache_alloc(nf_ct_cache[features].cachep, GFP_ATOMIC);
762 if (conntrack == NULL) {
763 DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n");
764 goto out;
765 }
766
767 memset(conntrack, 0, nf_ct_cache[features].size);
768 conntrack->features = features;
769 if (nf_ct_cache[features].init_conntrack &&
770 nf_ct_cache[features].init_conntrack(conntrack, features) < 0) {
771 DEBUGP("nf_conntrack_alloc: failed to init\n");
772 kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
773 conntrack = NULL;
774 goto out;
775 }
776
777 atomic_set(&conntrack->ct_general.use, 1);
778 conntrack->ct_general.destroy = destroy_conntrack;
779 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
780 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
781 /* Don't set timer yet: wait for confirmation */
782 init_timer(&conntrack->timeout);
783 conntrack->timeout.data = (unsigned long)conntrack;
784 conntrack->timeout.function = death_by_timeout;
785
786 atomic_inc(&nf_conntrack_count);
787out:
788 read_unlock_bh(&nf_ct_cache_lock);
789 return conntrack;
790}
791
792struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
793 const struct nf_conntrack_tuple *repl)
794{
795 struct nf_conntrack_l3proto *l3proto;
796
797 l3proto = nf_ct_find_l3proto(orig->src.l3num);
798 return __nf_conntrack_alloc(orig, repl, l3proto);
799}
800
801void nf_conntrack_free(struct nf_conn *conntrack)
802{
803 u_int32_t features = conntrack->features;
804 NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM);
805 DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features,
806 conntrack);
807 kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
808 atomic_dec(&nf_conntrack_count);
809}
810
811/* Allocate a new conntrack: we return -ENOMEM if classification
812 failed due to stress. Otherwise it really is unclassifiable. */
813static struct nf_conntrack_tuple_hash *
814init_conntrack(const struct nf_conntrack_tuple *tuple,
815 struct nf_conntrack_l3proto *l3proto,
816 struct nf_conntrack_protocol *protocol,
817 struct sk_buff *skb,
818 unsigned int dataoff)
819{
820 struct nf_conn *conntrack;
821 struct nf_conntrack_tuple repl_tuple;
822 struct nf_conntrack_expect *exp;
823
824 if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, protocol)) {
825 DEBUGP("Can't invert tuple.\n");
826 return NULL;
827 }
828
829 conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto);
830 if (conntrack == NULL || IS_ERR(conntrack)) {
831 DEBUGP("Can't allocate conntrack.\n");
832 return (struct nf_conntrack_tuple_hash *)conntrack;
833 }
834
835 if (!protocol->new(conntrack, skb, dataoff)) {
836 nf_conntrack_free(conntrack);
837 DEBUGP("init conntrack: can't track with proto module\n");
838 return NULL;
839 }
840
841 write_lock_bh(&nf_conntrack_lock);
842 exp = find_expectation(tuple);
843
844 if (exp) {
845 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
846 conntrack, exp);
847 /* Welcome, Mr. Bond. We've been expecting you... */
848 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
849 conntrack->master = exp->master;
850#ifdef CONFIG_NF_CONNTRACK_MARK
851 conntrack->mark = exp->master->mark;
852#endif
853 nf_conntrack_get(&conntrack->master->ct_general);
854 NF_CT_STAT_INC(expect_new);
855 } else {
856 conntrack->helper = nf_ct_find_helper(&repl_tuple);
857
858 NF_CT_STAT_INC(new);
859 }
860
861 /* Overload tuple linked list to put us in unconfirmed list. */
862 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
863
864 write_unlock_bh(&nf_conntrack_lock);
865
866 if (exp) {
867 if (exp->expectfn)
868 exp->expectfn(conntrack, exp);
869 nf_conntrack_expect_put(exp);
870 }
871
872 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
873}
874
875/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
876static inline struct nf_conn *
877resolve_normal_ct(struct sk_buff *skb,
878 unsigned int dataoff,
879 u_int16_t l3num,
880 u_int8_t protonum,
881 struct nf_conntrack_l3proto *l3proto,
882 struct nf_conntrack_protocol *proto,
883 int *set_reply,
884 enum ip_conntrack_info *ctinfo)
885{
886 struct nf_conntrack_tuple tuple;
887 struct nf_conntrack_tuple_hash *h;
888 struct nf_conn *ct;
889
890 if (!nf_ct_get_tuple(skb, (unsigned int)(skb->nh.raw - skb->data),
891 dataoff, l3num, protonum, &tuple, l3proto,
892 proto)) {
893 DEBUGP("resolve_normal_ct: Can't get tuple\n");
894 return NULL;
895 }
896
897 /* look for tuple match */
898 h = nf_conntrack_find_get(&tuple, NULL);
899 if (!h) {
900 h = init_conntrack(&tuple, l3proto, proto, skb, dataoff);
901 if (!h)
902 return NULL;
903 if (IS_ERR(h))
904 return (void *)h;
905 }
906 ct = nf_ct_tuplehash_to_ctrack(h);
907
908 /* It exists; we have (non-exclusive) reference. */
909 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
910 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
911 /* Please set reply bit if this packet OK */
912 *set_reply = 1;
913 } else {
914 /* Once we've had two way comms, always ESTABLISHED. */
915 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
916 DEBUGP("nf_conntrack_in: normal packet for %p\n", ct);
917 *ctinfo = IP_CT_ESTABLISHED;
918 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
919 DEBUGP("nf_conntrack_in: related packet for %p\n", ct);
920 *ctinfo = IP_CT_RELATED;
921 } else {
922 DEBUGP("nf_conntrack_in: new packet for %p\n", ct);
923 *ctinfo = IP_CT_NEW;
924 }
925 *set_reply = 0;
926 }
927 skb->nfct = &ct->ct_general;
928 skb->nfctinfo = *ctinfo;
929 return ct;
930}
931
932unsigned int
933nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb)
934{
935 struct nf_conn *ct;
936 enum ip_conntrack_info ctinfo;
937 struct nf_conntrack_l3proto *l3proto;
938 struct nf_conntrack_protocol *proto;
939 unsigned int dataoff;
940 u_int8_t protonum;
941 int set_reply = 0;
942 int ret;
943
944 /* Previously seen (loopback or untracked)? Ignore. */
945 if ((*pskb)->nfct) {
946 NF_CT_STAT_INC(ignore);
947 return NF_ACCEPT;
948 }
949
950 l3proto = nf_ct_find_l3proto((u_int16_t)pf);
951 if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) {
952 DEBUGP("not prepared to track yet or error occurred\n");
953 return -ret;
954 }
955
956 proto = nf_ct_find_proto((u_int16_t)pf, protonum);
957
958 /* It may be a special packet, error, unclean...
959 * the inverse of the return code tells the netfilter
960 * core what to do with the packet. */
961 if (proto->error != NULL &&
962 (ret = proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) {
963 NF_CT_STAT_INC(error);
964 NF_CT_STAT_INC(invalid);
965 return -ret;
966 }
967
968 ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, proto,
969 &set_reply, &ctinfo);
970 if (!ct) {
971 /* Not valid part of a connection */
972 NF_CT_STAT_INC(invalid);
973 return NF_ACCEPT;
974 }
975
976 if (IS_ERR(ct)) {
977 /* Too stressed to deal. */
978 NF_CT_STAT_INC(drop);
979 return NF_DROP;
980 }
981
982 NF_CT_ASSERT((*pskb)->nfct);
983
984 ret = proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum);
985 if (ret < 0) {
986 /* Invalid: inverse of the return code tells
987 * the netfilter core what to do */
988 DEBUGP("nf_conntrack_in: Can't track with proto module\n");
989 nf_conntrack_put((*pskb)->nfct);
990 (*pskb)->nfct = NULL;
991 NF_CT_STAT_INC(invalid);
992 return -ret;
993 }
994
995 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
996 nf_conntrack_event_cache(IPCT_STATUS, *pskb);
997
998 return ret;
999}
1000
1001int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
1002 const struct nf_conntrack_tuple *orig)
1003{
1004 return nf_ct_invert_tuple(inverse, orig,
1005 nf_ct_find_l3proto(orig->src.l3num),
1006 nf_ct_find_proto(orig->src.l3num,
1007 orig->dst.protonum));
1008}
1009
1010/* Would two expected things clash? */
1011static inline int expect_clash(const struct nf_conntrack_expect *a,
1012 const struct nf_conntrack_expect *b)
1013{
1014 /* Part covered by intersection of masks must be unequal,
1015 otherwise they clash */
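 /* In other words, two expectations clash when their tuples agree on
  * every field that both masks care about, since a single incoming
  * packet could then match both of them. */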
1016 struct nf_conntrack_tuple intersect_mask;
1017 int count;
1018
1019 intersect_mask.src.l3num = a->mask.src.l3num & b->mask.src.l3num;
1020 intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;
1021 intersect_mask.dst.u.all = a->mask.dst.u.all & b->mask.dst.u.all;
1022 intersect_mask.dst.protonum = a->mask.dst.protonum
1023 & b->mask.dst.protonum;
1024
1025 for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1026 intersect_mask.src.u3.all[count] =
1027 a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
1028 }
1029
1030 for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1031 intersect_mask.dst.u3.all[count] =
1032 a->mask.dst.u3.all[count] & b->mask.dst.u3.all[count];
1033 }
1034
1035 return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
1036}
1037
1038static inline int expect_matches(const struct nf_conntrack_expect *a,
1039 const struct nf_conntrack_expect *b)
1040{
1041 return a->master == b->master
1042 && nf_ct_tuple_equal(&a->tuple, &b->tuple)
1043 && nf_ct_tuple_equal(&a->mask, &b->mask);
1044}
1045
1046/* Generally a bad idea to call this: could have matched already. */
1047void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp)
1048{
1049 struct nf_conntrack_expect *i;
1050
1051 write_lock_bh(&nf_conntrack_lock);
1052 /* choose the oldest expectation to evict */
1053 list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1054 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
1055 nf_ct_unlink_expect(i);
1056 write_unlock_bh(&nf_conntrack_lock);
1057 nf_conntrack_expect_put(i);
1058 return;
1059 }
1060 }
1061 write_unlock_bh(&nf_conntrack_lock);
1062}
1063
1064/* We don't increase the master conntrack refcount for non-fulfilled
1065 * conntracks. During the conntrack destruction, the expectations are
1066 * always killed before the conntrack itself */
1067struct nf_conntrack_expect *nf_conntrack_expect_alloc(struct nf_conn *me)
1068{
1069 struct nf_conntrack_expect *new;
1070
1071 new = kmem_cache_alloc(nf_conntrack_expect_cachep, GFP_ATOMIC);
1072 if (!new) {
1073 DEBUGP("expect_related: OOM allocating expect\n");
1074 return NULL;
1075 }
1076 new->master = me;
1077 atomic_set(&new->use, 1);
1078 return new;
1079}
1080
1081void nf_conntrack_expect_put(struct nf_conntrack_expect *exp)
1082{
1083 if (atomic_dec_and_test(&exp->use))
1084 kmem_cache_free(nf_conntrack_expect_cachep, exp);
1085}
1086
1087static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp)
1088{
1089 atomic_inc(&exp->use);
1090 exp->master->expecting++;
1091 list_add(&exp->list, &nf_conntrack_expect_list);
1092
1093 init_timer(&exp->timeout);
1094 exp->timeout.data = (unsigned long)exp;
1095 exp->timeout.function = expectation_timed_out;
1096 exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
1097 add_timer(&exp->timeout);
1098
1099 atomic_inc(&exp->use);
1100 NF_CT_STAT_INC(expect_create);
1101}
1102
1103/* Race with expectations being used means we could have none to find; OK. */
1104static void evict_oldest_expect(struct nf_conn *master)
1105{
1106 struct nf_conntrack_expect *i;
1107
1108 list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1109 if (i->master == master) {
1110 if (del_timer(&i->timeout)) {
1111 nf_ct_unlink_expect(i);
1112 nf_conntrack_expect_put(i);
1113 }
1114 break;
1115 }
1116 }
1117}
1118
1119static inline int refresh_timer(struct nf_conntrack_expect *i)
1120{
1121 if (!del_timer(&i->timeout))
1122 return 0;
1123
1124 i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
1125 add_timer(&i->timeout);
1126 return 1;
1127}
1128
1129int nf_conntrack_expect_related(struct nf_conntrack_expect *expect)
1130{
1131 struct nf_conntrack_expect *i;
1132 int ret;
1133
1134 DEBUGP("nf_conntrack_expect_related %p\n", expect);
1135 DEBUGP("tuple: "); NF_CT_DUMP_TUPLE(&expect->tuple);
1136 DEBUGP("mask: "); NF_CT_DUMP_TUPLE(&expect->mask);
1137
1138 write_lock_bh(&nf_conntrack_lock);
1139 list_for_each_entry(i, &nf_conntrack_expect_list, list) {
1140 if (expect_matches(i, expect)) {
1141 /* Refresh the timer: if it's dying, ignore it. */
1142 if (refresh_timer(i)) {
1143 ret = 0;
1144 goto out;
1145 }
1146 } else if (expect_clash(i, expect)) {
1147 ret = -EBUSY;
1148 goto out;
1149 }
1150 }
1151 /* Will be over limit? */
1152 if (expect->master->helper->max_expected &&
1153 expect->master->expecting >= expect->master->helper->max_expected)
1154 evict_oldest_expect(expect->master);
1155
1156 nf_conntrack_expect_insert(expect);
1157 nf_conntrack_expect_event(IPEXP_NEW, expect);
1158 ret = 0;
1159out:
1160 write_unlock_bh(&nf_conntrack_lock);
1161 return ret;
1162}
1163
1164/* Alter reply tuple (maybe alter helper). This is for NAT, and is
1165 implicitly racy: see __nf_conntrack_confirm */
1166void nf_conntrack_alter_reply(struct nf_conn *conntrack,
1167 const struct nf_conntrack_tuple *newreply)
1168{
1169 write_lock_bh(&nf_conntrack_lock);
1170 /* Should be unconfirmed, so not in hash table yet */
1171 NF_CT_ASSERT(!nf_ct_is_confirmed(conntrack));
1172
1173 DEBUGP("Altering reply tuple of %p to ", conntrack);
1174 NF_CT_DUMP_TUPLE(newreply);
1175
1176 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1177 if (!conntrack->master && conntrack->expecting == 0)
1178 conntrack->helper = nf_ct_find_helper(newreply);
1179 write_unlock_bh(&nf_conntrack_lock);
1180}
1181
1182int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
1183{
1184 int ret;
1185 BUG_ON(me->timeout == 0);
1186
1187 ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help",
1188 sizeof(struct nf_conn)
1189 + sizeof(union nf_conntrack_help)
1190 + __alignof__(union nf_conntrack_help),
1191 init_conntrack_for_helper);
1192 if (ret < 0) {
1193 printk(KERN_ERR "nf_conntrack_helper_register: Unable to create slab cache for conntracks\n");
1194 return ret;
1195 }
1196 write_lock_bh(&nf_conntrack_lock);
1197 list_prepend(&helpers, me);
1198 write_unlock_bh(&nf_conntrack_lock);
1199
1200 return 0;
1201}
1202
1203static inline int unhelp(struct nf_conntrack_tuple_hash *i,
1204 const struct nf_conntrack_helper *me)
1205{
1206 if (nf_ct_tuplehash_to_ctrack(i)->helper == me) {
1207 nf_conntrack_event(IPCT_HELPER, nf_ct_tuplehash_to_ctrack(i));
1208 nf_ct_tuplehash_to_ctrack(i)->helper = NULL;
1209 }
1210 return 0;
1211}
1212
1213void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
1214{
1215 unsigned int i;
1216 struct nf_conntrack_expect *exp, *tmp;
1217
1218 /* Need write lock here, to delete helper. */
1219 write_lock_bh(&nf_conntrack_lock);
1220 LIST_DELETE(&helpers, me);
1221
1222 /* Get rid of expectations */
1223 list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) {
1224 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1225 nf_ct_unlink_expect(exp);
1226 nf_conntrack_expect_put(exp);
1227 }
1228 }
1229
1230 /* Get rid of expecteds, set helpers to NULL. */
1231 LIST_FIND_W(&unconfirmed, unhelp, struct nf_conntrack_tuple_hash*, me);
1232 for (i = 0; i < nf_conntrack_htable_size; i++)
1233 LIST_FIND_W(&nf_conntrack_hash[i], unhelp,
1234 struct nf_conntrack_tuple_hash *, me);
1235 write_unlock_bh(&nf_conntrack_lock);
1236
1237 /* Someone could still be looking at the helper in a bh. */
1238 synchronize_net();
1239}
1240
1241/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1242void __nf_ct_refresh_acct(struct nf_conn *ct,
1243 enum ip_conntrack_info ctinfo,
1244 const struct sk_buff *skb,
1245 unsigned long extra_jiffies,
1246 int do_acct)
1247{
1248 int event = 0;
1249
1250 NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
1251 NF_CT_ASSERT(skb);
1252
1253 write_lock_bh(&nf_conntrack_lock);
1254
1255 /* If not in hash table, timer will not be active yet */
1256 if (!nf_ct_is_confirmed(ct)) {
1257 ct->timeout.expires = extra_jiffies;
1258 event = IPCT_REFRESH;
1259 } else {
1260 /* Need del_timer for race avoidance (may already be dying). */
1261 if (del_timer(&ct->timeout)) {
1262 ct->timeout.expires = jiffies + extra_jiffies;
1263 add_timer(&ct->timeout);
1264 event = IPCT_REFRESH;
1265 }
1266 }
1267
1268#ifdef CONFIG_NF_CT_ACCT
1269 if (do_acct) {
1270 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1271 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1272 skb->len - (unsigned int)(skb->nh.raw - skb->data);
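  /* Once either counter has its top bit set it is getting close to
   * wrapping; flag the event so userspace (e.g. ctnetlink) can read
   * out and reset the counters in time. */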
1273 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1274 || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1275 event |= IPCT_COUNTER_FILLING;
1276 }
1277#endif
1278
1279 write_unlock_bh(&nf_conntrack_lock);
1280
1281 /* must be unlocked when calling event cache */
1282 if (event)
1283 nf_conntrack_event_cache(event, skb);
1284}
1285
1286/* Used by ipt_REJECT and ip6t_REJECT. */
1287void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1288{
1289 struct nf_conn *ct;
1290 enum ip_conntrack_info ctinfo;
1291
1292 /* This ICMP is in the reverse direction to the packet which caused it */
1293 ct = nf_ct_get(skb, &ctinfo);
1294 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1295 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1296 else
1297 ctinfo = IP_CT_RELATED;
1298
1299 /* Attach to new skbuff, and increment count */
1300 nskb->nfct = &ct->ct_general;
1301 nskb->nfctinfo = ctinfo;
1302 nf_conntrack_get(nskb->nfct);
1303}
1304
1305static inline int
1306do_iter(const struct nf_conntrack_tuple_hash *i,
1307 int (*iter)(struct nf_conn *i, void *data),
1308 void *data)
1309{
1310 return iter(nf_ct_tuplehash_to_ctrack(i), data);
1311}
1312
1313/* Bring out ya dead! */
1314static struct nf_conntrack_tuple_hash *
1315get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
1316 void *data, unsigned int *bucket)
1317{
1318 struct nf_conntrack_tuple_hash *h = NULL;
1319
1320 write_lock_bh(&nf_conntrack_lock);
1321 for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
1322 h = LIST_FIND_W(&nf_conntrack_hash[*bucket], do_iter,
1323 struct nf_conntrack_tuple_hash *, iter, data);
1324 if (h)
1325 break;
1326 }
1327 if (!h)
1328 h = LIST_FIND_W(&unconfirmed, do_iter,
1329 struct nf_conntrack_tuple_hash *, iter, data);
1330 if (h)
1331 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
1332 write_unlock_bh(&nf_conntrack_lock);
1333
1334 return h;
1335}
1336
1337void
1338nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data)
1339{
1340 struct nf_conntrack_tuple_hash *h;
1341 unsigned int bucket = 0;
1342
1343 while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1344 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
1345 /* Time to push up daisies... */
1346 if (del_timer(&ct->timeout))
1347 death_by_timeout((unsigned long)ct);
1348 /* ... else the timer will get him soon. */
1349
1350 nf_ct_put(ct);
1351 }
1352}
1353
1354static int kill_all(struct nf_conn *i, void *data)
1355{
1356 return 1;
1357}
1358
1359static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
1360{
1361 if (vmalloced)
1362 vfree(hash);
1363 else
1364 free_pages((unsigned long)hash,
1365 get_order(sizeof(struct list_head) * size));
1366}
1367
1368/* Mishearing the voices in his head, our hero wonders how he's
1369 supposed to kill the mall. */
1370void nf_conntrack_cleanup(void)
1371{
1372 int i;
1373
1374 /* This makes sure all current packets have passed through
1375 netfilter framework. Roll on, two-stage module
1376 delete... */
1377 synchronize_net();
1378
1379 nf_ct_event_cache_flush();
1380 i_see_dead_people:
1381 nf_ct_iterate_cleanup(kill_all, NULL);
1382 if (atomic_read(&nf_conntrack_count) != 0) {
1383 schedule();
1384 goto i_see_dead_people;
1385 }
1386
1387 for (i = 0; i < NF_CT_F_NUM; i++) {
1388 if (nf_ct_cache[i].use == 0)
1389 continue;
1390
1391 NF_CT_ASSERT(nf_ct_cache[i].use == 1);
1392 nf_ct_cache[i].use = 1;
1393 nf_conntrack_unregister_cache(i);
1394 }
1395 kmem_cache_destroy(nf_conntrack_expect_cachep);
1396 free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1397 nf_conntrack_htable_size);
1398}
1399
1400static struct list_head *alloc_hashtable(int size, int *vmalloced)
1401{
1402 struct list_head *hash;
1403 unsigned int i;
1404
1405 *vmalloced = 0;
1406 hash = (void*)__get_free_pages(GFP_KERNEL,
1407 get_order(sizeof(struct list_head)
1408 * size));
1409 if (!hash) {
1410 *vmalloced = 1;
1411 printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
1412 hash = vmalloc(sizeof(struct list_head) * size);
1413 }
1414
1415 if (hash)
1416 for (i = 0; i < size; i++)
1417 INIT_LIST_HEAD(&hash[i]);
1418
1419 return hash;
1420}
1421
1422int set_hashsize(const char *val, struct kernel_param *kp)
1423{
1424 int i, bucket, hashsize, vmalloced;
1425 int old_vmalloced, old_size;
1426 int rnd;
1427 struct list_head *hash, *old_hash;
1428 struct nf_conntrack_tuple_hash *h;
1429
1430 /* On boot, we can set this without any fancy locking. */
1431 if (!nf_conntrack_htable_size)
1432 return param_set_uint(val, kp);
1433
1434 hashsize = simple_strtol(val, NULL, 0);
1435 if (!hashsize)
1436 return -EINVAL;
1437
1438 hash = alloc_hashtable(hashsize, &vmalloced);
1439 if (!hash)
1440 return -ENOMEM;
1441
1442 /* We have to rehash for the new table anyway, so we can also
1443 * use a new random seed */
1444 get_random_bytes(&rnd, 4);
1445
1446 write_lock_bh(&nf_conntrack_lock);
1447 for (i = 0; i < nf_conntrack_htable_size; i++) {
1448 while (!list_empty(&nf_conntrack_hash[i])) {
1449 h = list_entry(nf_conntrack_hash[i].next,
1450 struct nf_conntrack_tuple_hash, list);
1451 list_del(&h->list);
1452 bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1453 list_add_tail(&h->list, &hash[bucket]);
1454 }
1455 }
1456 old_size = nf_conntrack_htable_size;
1457 old_vmalloced = nf_conntrack_vmalloc;
1458 old_hash = nf_conntrack_hash;
1459
1460 nf_conntrack_htable_size = hashsize;
1461 nf_conntrack_vmalloc = vmalloced;
1462 nf_conntrack_hash = hash;
1463 nf_conntrack_hash_rnd = rnd;
1464 write_unlock_bh(&nf_conntrack_lock);
1465
1466 free_conntrack_hash(old_hash, old_vmalloced, old_size);
1467 return 0;
1468}
1469
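/* Exposes "hashsize" as a module parameter; with sysfs this typically
 * shows up as /sys/module/nf_conntrack/parameters/hashsize (mode 0600,
 * root-writable), and writes are routed through set_hashsize() above. */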
1470module_param_call(hashsize, set_hashsize, param_get_uint,
1471 &nf_conntrack_htable_size, 0600);
1472
1473int __init nf_conntrack_init(void)
1474{
1475 unsigned int i;
1476 int ret;
1477
1478 /* Idea from tcp.c: use 1/16384 of memory. On i386, a 32MB
1479 * machine has 256 buckets; >= 1GB machines have 8192 buckets. */
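/* Worked example (assuming 32-bit with an 8-byte struct list_head):
 * a 512MB machine gives (512MB / 16384) / 8 = 4096 buckets, and the
 * default nf_conntrack_max = 8 * buckets below then allows 32768
 * tracked connections. */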
1480 if (!nf_conntrack_htable_size) {
1481 nf_conntrack_htable_size
1482 = (((num_physpages << PAGE_SHIFT) / 16384)
1483 / sizeof(struct list_head));
1484 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1485 nf_conntrack_htable_size = 8192;
1486 if (nf_conntrack_htable_size < 16)
1487 nf_conntrack_htable_size = 16;
1488 }
1489 nf_conntrack_max = 8 * nf_conntrack_htable_size;
1490
1491 printk("nf_conntrack version %s (%u buckets, %d max)\n",
1492 NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
1493 nf_conntrack_max);
1494
1495 nf_conntrack_hash = alloc_hashtable(nf_conntrack_htable_size,
1496 &nf_conntrack_vmalloc);
1497 if (!nf_conntrack_hash) {
1498 printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
1499 goto err_out;
1500 }
1501
1502 ret = nf_conntrack_register_cache(NF_CT_F_BASIC, "nf_conntrack:basic",
1503 sizeof(struct nf_conn), NULL);
1504 if (ret < 0) {
1505 printk(KERN_ERR "Unable to create nf_conn slab cache\n");
1506 goto err_free_hash;
1507 }
1508
1509 nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect",
1510 sizeof(struct nf_conntrack_expect),
1511 0, 0, NULL, NULL);
1512 if (!nf_conntrack_expect_cachep) {
1513 printk(KERN_ERR "Unable to create nf_expect slab cache\n");
1514 goto err_free_conntrack_slab;
1515 }
1516
1517 /* Don't NEED lock here, but good form anyway. */
1518 write_lock_bh(&nf_conntrack_lock);
1519 for (i = 0; i < PF_MAX; i++)
1520 nf_ct_l3protos[i] = &nf_conntrack_generic_l3proto;
1521 write_unlock_bh(&nf_conntrack_lock);
1522
1523 /* Set up fake conntrack:
1524 - to never be deleted, not in any hashes */
1525 atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
1526 /* - and make it look like a confirmed connection */
1527 set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
1528
1529 return ret;
1530
1531err_free_conntrack_slab:
1532 nf_conntrack_unregister_cache(NF_CT_F_BASIC);
1533err_free_hash:
1534 free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1535 nf_conntrack_htable_size);
1536err_out:
1537 return -ENOMEM;
1538}