/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
 *	- new API and handling of conntrack/nat helpers
 *	- now capable of multiple expectations for one master
 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
 *	- add usage/reference counts to ip_conntrack_expect
 *	- export ip_conntrack[_expect]_{find_get,put} functions
 * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
 *	- generalize L3 protocol dependent part.
 * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
 *	- add support for various sizes of conntrack structures.
 * 26 Jan 2006: Harald Welte <laforge@netfilter.org>
 *	- restructure nf_conn (introduce nf_conn_help)
 *	- redesign 'features' how they were originally intended
 * 26 Feb 2006: Pablo Neira Ayuso <pablo@eurodev.net>
 *	- add support for L3 protocol module load on demand.
 *
 * Derived from net/ipv4/netfilter/ip_conntrack_core.c
 */

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/socket.h>

/* This rwlock protects the main hash table, protocol/helper/expected
   registrations, conntrack timers */
#define ASSERT_READ_LOCK(x)
#define ASSERT_WRITE_LOCK(x)

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_protocol.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <linux/netfilter_ipv4/listhelp.h>

#define NF_CONNTRACK_VERSION	"0.5.0"

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif

DEFINE_RWLOCK(nf_conntrack_lock);

/* nf_conntrack_standalone needs this */
atomic_t nf_conntrack_count = ATOMIC_INIT(0);

void (*nf_conntrack_destroyed)(struct nf_conn *conntrack) = NULL;
LIST_HEAD(nf_conntrack_expect_list);
struct nf_conntrack_protocol **nf_ct_protos[PF_MAX];
struct nf_conntrack_l3proto *nf_ct_l3protos[PF_MAX];
static LIST_HEAD(helpers);
unsigned int nf_conntrack_htable_size __read_mostly = 0;
int nf_conntrack_max __read_mostly;
struct list_head *nf_conntrack_hash;
static kmem_cache_t *nf_conntrack_expect_cachep;
struct nf_conn nf_conntrack_untracked;
unsigned int nf_ct_log_invalid __read_mostly;
static LIST_HEAD(unconfirmed);
static int nf_conntrack_vmalloc;

static unsigned int nf_conntrack_next_id;
static unsigned int nf_conntrack_expect_next_id;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
ATOMIC_NOTIFIER_HEAD(nf_conntrack_chain);
ATOMIC_NOTIFIER_HEAD(nf_conntrack_expect_chain);

DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);

/* deliver cached events and clear cache entry - must be called with locally
 * disabled softirqs */
static inline void
__nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
{
	DEBUGP("ecache: delivering events for %p\n", ecache->ct);
	if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
	    && ecache->events)
		atomic_notifier_call_chain(&nf_conntrack_chain, ecache->events,
					   ecache->ct);

	ecache->events = 0;
	nf_ct_put(ecache->ct);
	ecache->ct = NULL;
}

/* Deliver all cached events for a particular conntrack. This is called
 * by code prior to async packet handling for freeing the skb */
void nf_ct_deliver_cached_events(const struct nf_conn *ct)
{
	struct nf_conntrack_ecache *ecache;

	local_bh_disable();
	ecache = &__get_cpu_var(nf_conntrack_ecache);
	if (ecache->ct == ct)
		__nf_ct_deliver_cached_events(ecache);
	local_bh_enable();
}
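/*
 * Illustrative sketch (not part of the original file; the example_* name is
 * hypothetical): per-packet code records state changes with
 * nf_conntrack_event_cache(), and the per-CPU cache above is delivered once
 * per packet, so several changes to one conntrack collapse into a single
 * notification.
 */
#if 0
static void example_record_status_change(const struct sk_buff *skb)
{
	/* coalesced with any other events cached for this packet's conntrack */
	nf_conntrack_event_cache(IPCT_STATUS, skb);
}
#endif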
/* Deliver cached events for old pending events, if current conntrack != old */
void __nf_ct_event_cache_init(struct nf_conn *ct)
{
	struct nf_conntrack_ecache *ecache;

	/* take care of delivering potentially old events */
	ecache = &__get_cpu_var(nf_conntrack_ecache);
	BUG_ON(ecache->ct == ct);
	if (ecache->ct)
		__nf_ct_deliver_cached_events(ecache);
	/* initialize for this conntrack/packet */
	ecache->ct = ct;
	nf_conntrack_get(&ct->ct_general);
}

/* flush the event cache - touches other CPU's data and must not be called
 * while packets are still passing through the code */
static void nf_ct_event_cache_flush(void)
{
	struct nf_conntrack_ecache *ecache;
	int cpu;

	for_each_possible_cpu(cpu) {
		ecache = &per_cpu(nf_conntrack_ecache, cpu);
		if (ecache->ct)
			nf_ct_put(ecache->ct);
	}
}
#else
static inline void nf_ct_event_cache_flush(void) {}
#endif /* CONFIG_NF_CONNTRACK_EVENTS */

DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat);

/*
 * This scheme offers various sizes of "struct nf_conn" dependent on
 * features (helper, nat, ...)
 */

#define NF_CT_FEATURES_NAMELEN	256
static struct {
	/* name of slab cache. printed in /proc/slabinfo */
	char *name;

	/* size of slab cache */
	size_t size;

	/* slab cache pointer */
	kmem_cache_t *cachep;

	/* allocated slab cache + modules which use this slab cache */
	int use;

} nf_ct_cache[NF_CT_F_NUM];

/* protect members of nf_ct_cache except "use" */
DEFINE_RWLOCK(nf_ct_cache_lock);

/* This avoids calling kmem_cache_create() with same name simultaneously */
static DEFINE_MUTEX(nf_ct_cache_mutex);

extern struct nf_conntrack_protocol nf_conntrack_generic_protocol;
struct nf_conntrack_protocol *
__nf_ct_proto_find(u_int16_t l3proto, u_int8_t protocol)
{
	if (unlikely(l3proto >= AF_MAX || nf_ct_protos[l3proto] == NULL))
		return &nf_conntrack_generic_protocol;

	return nf_ct_protos[l3proto][protocol];
}

/* this is guaranteed to always return a valid protocol helper, since
 * it falls back to generic_protocol */
struct nf_conntrack_protocol *
nf_ct_proto_find_get(u_int16_t l3proto, u_int8_t protocol)
{
	struct nf_conntrack_protocol *p;

	preempt_disable();
	p = __nf_ct_proto_find(l3proto, protocol);
	if (!try_module_get(p->me))
		p = &nf_conntrack_generic_protocol;
	preempt_enable();

	return p;
}

void nf_ct_proto_put(struct nf_conntrack_protocol *p)
{
	module_put(p->me);
}

struct nf_conntrack_l3proto *
nf_ct_l3proto_find_get(u_int16_t l3proto)
{
	struct nf_conntrack_l3proto *p;

	preempt_disable();
	p = __nf_ct_l3proto_find(l3proto);
	if (!try_module_get(p->me))
		p = &nf_conntrack_generic_l3proto;
	preempt_enable();

	return p;
}

void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p)
{
	module_put(p->me);
}

int
nf_ct_l3proto_try_module_get(unsigned short l3proto)
{
	int ret;
	struct nf_conntrack_l3proto *p;

retry:	p = nf_ct_l3proto_find_get(l3proto);
	if (p == &nf_conntrack_generic_l3proto) {
		ret = request_module("nf_conntrack-%d", l3proto);
		if (!ret)
			goto retry;

		return -EPROTOTYPE;
	}

	return 0;
}

void nf_ct_l3proto_module_put(unsigned short l3proto)
{
	struct nf_conntrack_l3proto *p;

	preempt_disable();
	p = __nf_ct_l3proto_find(l3proto);
	preempt_enable();

	module_put(p->me);
}
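/*
 * Illustrative sketch (not part of the original file; example_* is a
 * hypothetical name): a user that needs an L3 tracker loaded pins it around
 * its critical section with the try_module_get/module_put pair above.
 */
#if 0
static int example_pin_ipv4_l3proto(void)
{
	int err;

	err = nf_ct_l3proto_try_module_get(PF_INET);
	if (err < 0)
		return err;	/* module could not be loaded: -EPROTOTYPE */
	/* ... the PF_INET l3proto cannot be unloaded in this section ... */
	nf_ct_l3proto_module_put(PF_INET);
	return 0;
}
#endif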
static int nf_conntrack_hash_rnd_initted;
static unsigned int nf_conntrack_hash_rnd;

static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
				  unsigned int size, unsigned int rnd)
{
	unsigned int a, b;
	a = jhash((void *)tuple->src.u3.all, sizeof(tuple->src.u3.all),
		  ((tuple->src.l3num) << 16) | tuple->dst.protonum);
	b = jhash((void *)tuple->dst.u3.all, sizeof(tuple->dst.u3.all),
		  (tuple->src.u.all << 16) | tuple->dst.u.all);

	return jhash_2words(a, b, rnd) % size;
}

static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
{
	return __hash_conntrack(tuple, nf_conntrack_htable_size,
				nf_conntrack_hash_rnd);
}

int nf_conntrack_register_cache(u_int32_t features, const char *name,
				size_t size)
{
	int ret = 0;
	char *cache_name;
	kmem_cache_t *cachep;

	DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n",
	       features, name, size);

	if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) {
		DEBUGP("nf_conntrack_register_cache: invalid features: 0x%x\n",
		       features);
		return -EINVAL;
	}

	mutex_lock(&nf_ct_cache_mutex);

	write_lock_bh(&nf_ct_cache_lock);
	/* e.g: multiple helpers are loaded */
	if (nf_ct_cache[features].use > 0) {
		DEBUGP("nf_conntrack_register_cache: already registered.\n");
		if ((!strncmp(nf_ct_cache[features].name, name,
			      NF_CT_FEATURES_NAMELEN))
		    && nf_ct_cache[features].size == size) {
			DEBUGP("nf_conntrack_register_cache: reusing.\n");
			nf_ct_cache[features].use++;
			ret = 0;
		} else
			ret = -EBUSY;

		write_unlock_bh(&nf_ct_cache_lock);
		mutex_unlock(&nf_ct_cache_mutex);
		return ret;
	}
	write_unlock_bh(&nf_ct_cache_lock);

	/*
	 * The memory space for the name of the slab cache must stay alive
	 * until the cache is destroyed.
	 */
	cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_ATOMIC);
	if (cache_name == NULL) {
		DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n");
		ret = -ENOMEM;
		goto out_up_mutex;
	}

	if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN)
						>= NF_CT_FEATURES_NAMELEN) {
		printk("nf_conntrack_register_cache: name too long\n");
		ret = -EINVAL;
		goto out_free_name;
	}

	cachep = kmem_cache_create(cache_name, size, 0, 0,
				   NULL, NULL);
	if (!cachep) {
		printk("nf_conntrack_register_cache: Can't create slab cache "
		       "for the features = 0x%x\n", features);
		ret = -ENOMEM;
		goto out_free_name;
	}

	write_lock_bh(&nf_ct_cache_lock);
	nf_ct_cache[features].use = 1;
	nf_ct_cache[features].size = size;
	nf_ct_cache[features].cachep = cachep;
	nf_ct_cache[features].name = cache_name;
	write_unlock_bh(&nf_ct_cache_lock);

	goto out_up_mutex;

out_free_name:
	kfree(cache_name);
out_up_mutex:
	mutex_unlock(&nf_ct_cache_mutex);
	return ret;
}
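/*
 * Illustrative sketch (not part of the original file; example_* is a
 * hypothetical name): a feature that needs room behind the base conntrack
 * registers a larger cache for its feature bit, exactly as
 * nf_conntrack_helper_register() does further down for NF_CT_F_HELP; a
 * second registration with the same name and size just bumps the use count.
 */
#if 0
static int example_register_help_cache(void)
{
	return nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help",
					   sizeof(struct nf_conn)
					   + sizeof(struct nf_conn_help)
					   + __alignof__(struct nf_conn_help));
}
#endif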
/* FIXME: At the moment, only nf_conntrack_cleanup() can call this function. */
void nf_conntrack_unregister_cache(u_int32_t features)
{
	kmem_cache_t *cachep;
	char *name;

	/*
	 * This assures that kmem_cache_create() isn't called before destroying
	 * slab cache.
	 */
	DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features);
	mutex_lock(&nf_ct_cache_mutex);

	write_lock_bh(&nf_ct_cache_lock);
	if (--nf_ct_cache[features].use > 0) {
		write_unlock_bh(&nf_ct_cache_lock);
		mutex_unlock(&nf_ct_cache_mutex);
		return;
	}
	cachep = nf_ct_cache[features].cachep;
	name = nf_ct_cache[features].name;
	nf_ct_cache[features].cachep = NULL;
	nf_ct_cache[features].name = NULL;
	nf_ct_cache[features].size = 0;
	write_unlock_bh(&nf_ct_cache_lock);

	synchronize_net();

	kmem_cache_destroy(cachep);
	kfree(name);

	mutex_unlock(&nf_ct_cache_mutex);
}

int
nf_ct_get_tuple(const struct sk_buff *skb,
		unsigned int nhoff,
		unsigned int dataoff,
		u_int16_t l3num,
		u_int8_t protonum,
		struct nf_conntrack_tuple *tuple,
		const struct nf_conntrack_l3proto *l3proto,
		const struct nf_conntrack_protocol *protocol)
{
	NF_CT_TUPLE_U_BLANK(tuple);

	tuple->src.l3num = l3num;
	if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
		return 0;

	tuple->dst.protonum = protonum;
	tuple->dst.dir = IP_CT_DIR_ORIGINAL;

	return protocol->pkt_to_tuple(skb, dataoff, tuple);
}

int
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
		   const struct nf_conntrack_tuple *orig,
		   const struct nf_conntrack_l3proto *l3proto,
		   const struct nf_conntrack_protocol *protocol)
{
	NF_CT_TUPLE_U_BLANK(inverse);

	inverse->src.l3num = orig->src.l3num;
	if (l3proto->invert_tuple(inverse, orig) == 0)
		return 0;

	inverse->dst.dir = !orig->dst.dir;

	inverse->dst.protonum = orig->dst.protonum;
	return protocol->invert_tuple(inverse, orig);
}

/* nf_conntrack_expect helper functions */
void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
{
	struct nf_conn_help *master_help = nfct_help(exp->master);

	NF_CT_ASSERT(master_help);
	ASSERT_WRITE_LOCK(&nf_conntrack_lock);
	NF_CT_ASSERT(!timer_pending(&exp->timeout));

	list_del(&exp->list);
	NF_CT_STAT_INC(expect_delete);
	master_help->expecting--;
	nf_conntrack_expect_put(exp);
}

static void expectation_timed_out(unsigned long ul_expect)
{
	struct nf_conntrack_expect *exp = (void *)ul_expect;

	write_lock_bh(&nf_conntrack_lock);
	nf_ct_unlink_expect(exp);
	write_unlock_bh(&nf_conntrack_lock);
	nf_conntrack_expect_put(exp);
}

struct nf_conntrack_expect *
__nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_expect *i;

	list_for_each_entry(i, &nf_conntrack_expect_list, list) {
		if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
			atomic_inc(&i->use);
			return i;
		}
	}
	return NULL;
}

/* Just find an expectation corresponding to a tuple. */
struct nf_conntrack_expect *
nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_expect *i;

	read_lock_bh(&nf_conntrack_lock);
	i = __nf_conntrack_expect_find(tuple);
	read_unlock_bh(&nf_conntrack_lock);

	return i;
}

/* If an expectation for this connection is found, it gets deleted from
 * the global list and then returned. */
static struct nf_conntrack_expect *
find_expectation(const struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_expect *i;

	list_for_each_entry(i, &nf_conntrack_expect_list, list) {
		/* If master is not in hash table yet (ie. packet hasn't left
		   this machine yet), how can other end know about expected?
		   Hence these are not the droids you are looking for (if
		   master ct never got confirmed, we'd hold a reference to it
		   and weird things would happen to future packets). */
		if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
		    && nf_ct_is_confirmed(i->master)) {
			if (i->flags & NF_CT_EXPECT_PERMANENT) {
				atomic_inc(&i->use);
				return i;
			} else if (del_timer(&i->timeout)) {
				nf_ct_unlink_expect(i);
				return i;
			}
		}
	}
	return NULL;
}

/* delete all expectations for this conntrack */
void nf_ct_remove_expectations(struct nf_conn *ct)
{
	struct nf_conntrack_expect *i, *tmp;
	struct nf_conn_help *help = nfct_help(ct);

	/* Optimization: most connections never expect any others. */
	if (!help || help->expecting == 0)
		return;

	list_for_each_entry_safe(i, tmp, &nf_conntrack_expect_list, list) {
		if (i->master == ct && del_timer(&i->timeout)) {
			nf_ct_unlink_expect(i);
			nf_conntrack_expect_put(i);
		}
	}
}

static void
clean_from_lists(struct nf_conn *ct)
{
	unsigned int ho, hr;

	DEBUGP("clean_from_lists(%p)\n", ct);
	ASSERT_WRITE_LOCK(&nf_conntrack_lock);

	ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
	LIST_DELETE(&nf_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
	LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);

	/* Destroy all pending expectations */
	nf_ct_remove_expectations(ct);
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
	struct nf_conn *ct = (struct nf_conn *)nfct;
	struct nf_conntrack_l3proto *l3proto;
	struct nf_conntrack_protocol *proto;

	DEBUGP("destroy_conntrack(%p)\n", ct);
	NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
	NF_CT_ASSERT(!timer_pending(&ct->timeout));

	nf_conntrack_event(IPCT_DESTROY, ct);
	set_bit(IPS_DYING_BIT, &ct->status);

	/* To make sure we don't get any weird locking issues here:
	 * destroy_conntrack() MUST NOT be called with a write lock
	 * to nf_conntrack_lock!!! -HW */
	l3proto = __nf_ct_l3proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num);
	if (l3proto && l3proto->destroy)
		l3proto->destroy(ct);

	proto = __nf_ct_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num,
				   ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
	if (proto && proto->destroy)
		proto->destroy(ct);

	if (nf_conntrack_destroyed)
		nf_conntrack_destroyed(ct);

	write_lock_bh(&nf_conntrack_lock);
	/* Expectations will have been removed in clean_from_lists,
	 * except TFTP can create an expectation on the first packet,
	 * before connection is in the list, so we need to clean here,
	 * too. */
	nf_ct_remove_expectations(ct);

	/* We overload first tuple to link into unconfirmed list. */
	if (!nf_ct_is_confirmed(ct)) {
		BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
	}

	NF_CT_STAT_INC(delete);
	write_unlock_bh(&nf_conntrack_lock);

	if (ct->master)
		nf_ct_put(ct->master);

	DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
	nf_conntrack_free(ct);
}

static void death_by_timeout(unsigned long ul_conntrack)
{
	struct nf_conn *ct = (void *)ul_conntrack;

	write_lock_bh(&nf_conntrack_lock);
	/* Inside lock so preempt is disabled on module removal path.
	 * Otherwise we can get spurious warnings. */
	NF_CT_STAT_INC(delete_list);
	clean_from_lists(ct);
	write_unlock_bh(&nf_conntrack_lock);
	nf_ct_put(ct);
}

static inline int
conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i,
		    const struct nf_conntrack_tuple *tuple,
		    const struct nf_conn *ignored_conntrack)
{
	ASSERT_READ_LOCK(&nf_conntrack_lock);
	return nf_ct_tuplehash_to_ctrack(i) != ignored_conntrack
		&& nf_ct_tuple_equal(tuple, &i->tuple);
}

struct nf_conntrack_tuple_hash *
__nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
		    const struct nf_conn *ignored_conntrack)
{
	struct nf_conntrack_tuple_hash *h;
	unsigned int hash = hash_conntrack(tuple);

	ASSERT_READ_LOCK(&nf_conntrack_lock);
	list_for_each_entry(h, &nf_conntrack_hash[hash], list) {
		if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
			NF_CT_STAT_INC(found);
			return h;
		}
		NF_CT_STAT_INC(searched);
	}

	return NULL;
}

/* Find a connection corresponding to a tuple. */
struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple,
		      const struct nf_conn *ignored_conntrack)
{
	struct nf_conntrack_tuple_hash *h;

	read_lock_bh(&nf_conntrack_lock);
	h = __nf_conntrack_find(tuple, ignored_conntrack);
	if (h)
		atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
	read_unlock_bh(&nf_conntrack_lock);

	return h;
}
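/*
 * Illustrative sketch (not part of the original file; example_* is a
 * hypothetical name): nf_conntrack_find_get() returns the tuple hash entry
 * with the conntrack's refcount raised, so callers must drop the reference
 * with nf_ct_put() when they are done.
 */
#if 0
static void example_lookup(const struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_tuple_hash *h;

	h = nf_conntrack_find_get(tuple, NULL);
	if (h) {
		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

		/* ... inspect ct ... */
		nf_ct_put(ct);
	}
}
#endif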
static void __nf_conntrack_hash_insert(struct nf_conn *ct,
				       unsigned int hash,
				       unsigned int repl_hash)
{
	ct->id = ++nf_conntrack_next_id;
	list_prepend(&nf_conntrack_hash[hash],
		     &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
	list_prepend(&nf_conntrack_hash[repl_hash],
		     &ct->tuplehash[IP_CT_DIR_REPLY].list);
}

void nf_conntrack_hash_insert(struct nf_conn *ct)
{
	unsigned int hash, repl_hash;

	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	write_lock_bh(&nf_conntrack_lock);
	__nf_conntrack_hash_insert(ct, hash, repl_hash);
	write_unlock_bh(&nf_conntrack_lock);
}

/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff **pskb)
{
	unsigned int hash, repl_hash;
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;

	ct = nf_ct_get(*pskb, &ctinfo);

	/* ipt_REJECT uses nf_conntrack_attach to attach related
	   ICMP/TCP RST packets in other direction. Actual packet
	   which created connection will be IP_CT_NEW or for an
	   expected connection, IP_CT_RELATED. */
	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
		return NF_ACCEPT;

	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	/* We're not in hash table, and we refuse to set up related
	   connections for unconfirmed conns.  But packet copies and
	   REJECT will give spurious warnings here. */
	/* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

	/* No external references means no one else could have
	   confirmed us. */
	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
	DEBUGP("Confirming conntrack %p\n", ct);

	write_lock_bh(&nf_conntrack_lock);

	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost race. */
	if (!LIST_FIND(&nf_conntrack_hash[hash],
		       conntrack_tuple_cmp,
		       struct nf_conntrack_tuple_hash *,
		       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
	    && !LIST_FIND(&nf_conntrack_hash[repl_hash],
			  conntrack_tuple_cmp,
			  struct nf_conntrack_tuple_hash *,
			  &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
		struct nf_conn_help *help;
		/* Remove from unconfirmed list */
		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);

		__nf_conntrack_hash_insert(ct, hash, repl_hash);
		/* Timer relative to confirmation time, not original
		   setting time, otherwise we'd get timer wrap in
		   weird delay cases. */
		ct->timeout.expires += jiffies;
		add_timer(&ct->timeout);
		atomic_inc(&ct->ct_general.use);
		set_bit(IPS_CONFIRMED_BIT, &ct->status);
		NF_CT_STAT_INC(insert);
		write_unlock_bh(&nf_conntrack_lock);
		help = nfct_help(ct);
		if (help && help->helper)
			nf_conntrack_event_cache(IPCT_HELPER, *pskb);
#ifdef CONFIG_NF_NAT_NEEDED
		if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
		    test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
			nf_conntrack_event_cache(IPCT_NATINFO, *pskb);
#endif
		nf_conntrack_event_cache(master_ct(ct) ?
					 IPCT_RELATED : IPCT_NEW, *pskb);
		return NF_ACCEPT;
	}

	NF_CT_STAT_INC(insert_failed);
	write_unlock_bh(&nf_conntrack_lock);
	return NF_DROP;
}

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack)
{
	struct nf_conntrack_tuple_hash *h;

	read_lock_bh(&nf_conntrack_lock);
	h = __nf_conntrack_find(tuple, ignored_conntrack);
	read_unlock_bh(&nf_conntrack_lock);

	return h != NULL;
}

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static inline int unreplied(const struct nf_conntrack_tuple_hash *i)
{
	return !(test_bit(IPS_ASSURED_BIT,
			  &nf_ct_tuplehash_to_ctrack(i)->status));
}

static int early_drop(struct list_head *chain)
{
	/* Traverse backwards: gives us oldest, which is roughly LRU */
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct = NULL;
	int dropped = 0;

	read_lock_bh(&nf_conntrack_lock);
	h = LIST_FIND_B(chain, unreplied, struct nf_conntrack_tuple_hash *);
	if (h) {
		ct = nf_ct_tuplehash_to_ctrack(h);
		atomic_inc(&ct->ct_general.use);
	}
	read_unlock_bh(&nf_conntrack_lock);

	if (!ct)
		return dropped;

	if (del_timer(&ct->timeout)) {
		death_by_timeout((unsigned long)ct);
		dropped = 1;
		NF_CT_STAT_INC(early_drop);
	}
	nf_ct_put(ct);
	return dropped;
}

static inline int helper_cmp(const struct nf_conntrack_helper *i,
			     const struct nf_conntrack_tuple *rtuple)
{
	return nf_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
}

static struct nf_conntrack_helper *
__nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)
{
	return LIST_FIND(&helpers, helper_cmp,
			 struct nf_conntrack_helper *,
			 tuple);
}

struct nf_conntrack_helper *
nf_ct_helper_find_get(const struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_helper *helper;

	/* need nf_conntrack_lock to assure that helper exists until
	 * try_module_get() is called */
	read_lock_bh(&nf_conntrack_lock);

	helper = __nf_ct_helper_find(tuple);
	if (helper) {
		/* need to increase module usage count to assure helper will
		 * not go away while the caller is e.g. busy putting a
		 * conntrack in the hash that uses the helper */
		if (!try_module_get(helper->me))
			helper = NULL;
	}

	read_unlock_bh(&nf_conntrack_lock);

	return helper;
}

void nf_ct_helper_put(struct nf_conntrack_helper *helper)
{
	module_put(helper->me);
}

static struct nf_conn *
__nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
		     const struct nf_conntrack_tuple *repl,
		     const struct nf_conntrack_l3proto *l3proto)
{
	struct nf_conn *conntrack = NULL;
	u_int32_t features = 0;
	struct nf_conntrack_helper *helper;

	if (unlikely(!nf_conntrack_hash_rnd_initted)) {
		get_random_bytes(&nf_conntrack_hash_rnd, 4);
		nf_conntrack_hash_rnd_initted = 1;
	}

	if (nf_conntrack_max
	    && atomic_read(&nf_conntrack_count) >= nf_conntrack_max) {
		unsigned int hash = hash_conntrack(orig);
		/* Try dropping from this hash chain. */
		if (!early_drop(&nf_conntrack_hash[hash])) {
			if (net_ratelimit())
				printk(KERN_WARNING
				       "nf_conntrack: table full, dropping"
				       " packet.\n");
			return ERR_PTR(-ENOMEM);
		}
	}

	/* find features needed by this conntrack. */
	features = l3proto->get_features(orig);

	/* FIXME: protect helper list per RCU */
	read_lock_bh(&nf_conntrack_lock);
	helper = __nf_ct_helper_find(repl);
	if (helper)
		features |= NF_CT_F_HELP;
	read_unlock_bh(&nf_conntrack_lock);

	DEBUGP("nf_conntrack_alloc: features=0x%x\n", features);

	read_lock_bh(&nf_ct_cache_lock);

	if (unlikely(!nf_ct_cache[features].use)) {
		DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n",
		       features);
		goto out;
	}

	conntrack = kmem_cache_alloc(nf_ct_cache[features].cachep, GFP_ATOMIC);
	if (conntrack == NULL) {
		DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n");
		goto out;
	}

	memset(conntrack, 0, nf_ct_cache[features].size);
	conntrack->features = features;
	if (helper) {
		struct nf_conn_help *help = nfct_help(conntrack);
		NF_CT_ASSERT(help);
		help->helper = helper;
	}

	atomic_set(&conntrack->ct_general.use, 1);
	conntrack->ct_general.destroy = destroy_conntrack;
	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
	/* Don't set timer yet: wait for confirmation */
	init_timer(&conntrack->timeout);
	conntrack->timeout.data = (unsigned long)conntrack;
	conntrack->timeout.function = death_by_timeout;

	atomic_inc(&nf_conntrack_count);
out:
	read_unlock_bh(&nf_ct_cache_lock);
	return conntrack;
}

struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
				   const struct nf_conntrack_tuple *repl)
{
	struct nf_conntrack_l3proto *l3proto;

	l3proto = __nf_ct_l3proto_find(orig->src.l3num);
	return __nf_conntrack_alloc(orig, repl, l3proto);
}

void nf_conntrack_free(struct nf_conn *conntrack)
{
	u_int32_t features = conntrack->features;
	NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM);
	DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features,
	       conntrack);
	kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
	atomic_dec(&nf_conntrack_count);
}

/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static struct nf_conntrack_tuple_hash *
init_conntrack(const struct nf_conntrack_tuple *tuple,
	       struct nf_conntrack_l3proto *l3proto,
	       struct nf_conntrack_protocol *protocol,
	       struct sk_buff *skb,
	       unsigned int dataoff)
{
	struct nf_conn *conntrack;
	struct nf_conntrack_tuple repl_tuple;
	struct nf_conntrack_expect *exp;

	if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, protocol)) {
		DEBUGP("Can't invert tuple.\n");
		return NULL;
	}

	conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto);
	if (conntrack == NULL || IS_ERR(conntrack)) {
		DEBUGP("Can't allocate conntrack.\n");
		return (struct nf_conntrack_tuple_hash *)conntrack;
	}

	if (!protocol->new(conntrack, skb, dataoff)) {
		nf_conntrack_free(conntrack);
		DEBUGP("init conntrack: can't track with proto module\n");
		return NULL;
	}

	write_lock_bh(&nf_conntrack_lock);
	exp = find_expectation(tuple);

	if (exp) {
		DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
		       conntrack, exp);
		/* Welcome, Mr. Bond.  We've been expecting you... */
		__set_bit(IPS_EXPECTED_BIT, &conntrack->status);
		conntrack->master = exp->master;
#ifdef CONFIG_NF_CONNTRACK_MARK
		conntrack->mark = exp->master->mark;
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
		conntrack->secmark = exp->master->secmark;
#endif
		nf_conntrack_get(&conntrack->master->ct_general);
		NF_CT_STAT_INC(expect_new);
	} else
		NF_CT_STAT_INC(new);

	/* Overload tuple linked list to put us in unconfirmed list. */
	list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);

	write_unlock_bh(&nf_conntrack_lock);

	if (exp) {
		if (exp->expectfn)
			exp->expectfn(conntrack, exp);
		nf_conntrack_expect_put(exp);
	}

	return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct nf_conn *
resolve_normal_ct(struct sk_buff *skb,
		  unsigned int dataoff,
		  u_int16_t l3num,
		  u_int8_t protonum,
		  struct nf_conntrack_l3proto *l3proto,
		  struct nf_conntrack_protocol *proto,
		  int *set_reply,
		  enum ip_conntrack_info *ctinfo)
{
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	if (!nf_ct_get_tuple(skb, (unsigned int)(skb->nh.raw - skb->data),
			     dataoff, l3num, protonum, &tuple, l3proto,
			     proto)) {
		DEBUGP("resolve_normal_ct: Can't get tuple\n");
		return NULL;
	}

	/* look for tuple match */
	h = nf_conntrack_find_get(&tuple, NULL);
	if (!h) {
		h = init_conntrack(&tuple, l3proto, proto, skb, dataoff);
		if (!h)
			return NULL;
		if (IS_ERR(h))
			return (void *)h;
	}
	ct = nf_ct_tuplehash_to_ctrack(h);

	/* It exists; we have (non-exclusive) reference. */
	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
		*ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
		/* Please set reply bit if this packet OK */
		*set_reply = 1;
	} else {
		/* Once we've had two way comms, always ESTABLISHED. */
		if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
			DEBUGP("nf_conntrack_in: normal packet for %p\n", ct);
			*ctinfo = IP_CT_ESTABLISHED;
		} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
			DEBUGP("nf_conntrack_in: related packet for %p\n", ct);
			*ctinfo = IP_CT_RELATED;
		} else {
			DEBUGP("nf_conntrack_in: new packet for %p\n", ct);
			*ctinfo = IP_CT_NEW;
		}
		*set_reply = 0;
	}
	skb->nfct = &ct->ct_general;
	skb->nfctinfo = *ctinfo;
	return ct;
}

unsigned int
nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb)
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	struct nf_conntrack_l3proto *l3proto;
	struct nf_conntrack_protocol *proto;
	unsigned int dataoff;
	u_int8_t protonum;
	int set_reply = 0;
	int ret;

	/* Previously seen (loopback or untracked)?  Ignore. */
	if ((*pskb)->nfct) {
		NF_CT_STAT_INC(ignore);
		return NF_ACCEPT;
	}

	l3proto = __nf_ct_l3proto_find((u_int16_t)pf);
	if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) {
		DEBUGP("not prepared to track yet or error occurred\n");
		return -ret;
	}

	proto = __nf_ct_proto_find((u_int16_t)pf, protonum);

	/* It may be a special packet, error, unclean...
	 * inverse of the return code tells the netfilter
	 * core what to do with the packet. */
	if (proto->error != NULL &&
	    (ret = proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) {
		NF_CT_STAT_INC(error);
		NF_CT_STAT_INC(invalid);
		return -ret;
	}

	ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, proto,
			       &set_reply, &ctinfo);
	if (!ct) {
		/* Not valid part of a connection */
		NF_CT_STAT_INC(invalid);
		return NF_ACCEPT;
	}

	if (IS_ERR(ct)) {
		/* Too stressed to deal. */
		NF_CT_STAT_INC(drop);
		return NF_DROP;
	}

	NF_CT_ASSERT((*pskb)->nfct);

	ret = proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum);
	if (ret < 0) {
		/* Invalid: inverse of the return code tells
		 * the netfilter core what to do */
		DEBUGP("nf_conntrack_in: Can't track with proto module\n");
		nf_conntrack_put((*pskb)->nfct);
		(*pskb)->nfct = NULL;
		NF_CT_STAT_INC(invalid);
		return -ret;
	}

	if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
		nf_conntrack_event_cache(IPCT_STATUS, *pskb);

	return ret;
}
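/*
 * Illustrative sketch (not part of the original file; example_* is a
 * hypothetical name): an L3 family module (e.g. the IPv4 glue) registers
 * netfilter hooks whose hook function simply hands the packet to
 * nf_conntrack_in().
 */
#if 0
static unsigned int example_conntrack_hook(unsigned int hooknum,
					   struct sk_buff **pskb,
					   const struct net_device *in,
					   const struct net_device *out,
					   int (*okfn)(struct sk_buff *))
{
	return nf_conntrack_in(PF_INET, hooknum, pskb);
}
#endif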
int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
			 const struct nf_conntrack_tuple *orig)
{
	return nf_ct_invert_tuple(inverse, orig,
				  __nf_ct_l3proto_find(orig->src.l3num),
				  __nf_ct_proto_find(orig->src.l3num,
						     orig->dst.protonum));
}

/* Would two expected things clash? */
static inline int expect_clash(const struct nf_conntrack_expect *a,
			       const struct nf_conntrack_expect *b)
{
	/* Part covered by intersection of masks must be unequal,
	   otherwise they clash */
	struct nf_conntrack_tuple intersect_mask;
	int count;

	intersect_mask.src.l3num = a->mask.src.l3num & b->mask.src.l3num;
	intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;
	intersect_mask.dst.u.all = a->mask.dst.u.all & b->mask.dst.u.all;
	intersect_mask.dst.protonum = a->mask.dst.protonum
					& b->mask.dst.protonum;

	for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++) {
		intersect_mask.src.u3.all[count] =
			a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
	}

	for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++) {
		intersect_mask.dst.u3.all[count] =
			a->mask.dst.u3.all[count] & b->mask.dst.u3.all[count];
	}

	return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
}

static inline int expect_matches(const struct nf_conntrack_expect *a,
				 const struct nf_conntrack_expect *b)
{
	return a->master == b->master
		&& nf_ct_tuple_equal(&a->tuple, &b->tuple)
		&& nf_ct_tuple_equal(&a->mask, &b->mask);
}

/* Generally a bad idea to call this: could have matched already. */
void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp)
{
	struct nf_conntrack_expect *i;

	write_lock_bh(&nf_conntrack_lock);
	/* choose the oldest expectation to evict */
	list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
		if (expect_matches(i, exp) && del_timer(&i->timeout)) {
			nf_ct_unlink_expect(i);
			write_unlock_bh(&nf_conntrack_lock);
			nf_conntrack_expect_put(i);
			return;
		}
	}
	write_unlock_bh(&nf_conntrack_lock);
}

/* We don't increase the master conntrack refcount for non-fulfilled
 * conntracks. During the conntrack destruction, the expectations are
 * always killed before the conntrack itself */
struct nf_conntrack_expect *nf_conntrack_expect_alloc(struct nf_conn *me)
{
	struct nf_conntrack_expect *new;

	new = kmem_cache_alloc(nf_conntrack_expect_cachep, GFP_ATOMIC);
	if (!new) {
		DEBUGP("expect_related: OOM allocating expect\n");
		return NULL;
	}
	new->master = me;
	atomic_set(&new->use, 1);
	return new;
}

void nf_conntrack_expect_put(struct nf_conntrack_expect *exp)
{
	if (atomic_dec_and_test(&exp->use))
		kmem_cache_free(nf_conntrack_expect_cachep, exp);
}

static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp)
{
	struct nf_conn_help *master_help = nfct_help(exp->master);

	atomic_inc(&exp->use);
	master_help->expecting++;
	list_add(&exp->list, &nf_conntrack_expect_list);

	init_timer(&exp->timeout);
	exp->timeout.data = (unsigned long)exp;
	exp->timeout.function = expectation_timed_out;
	exp->timeout.expires = jiffies + master_help->helper->timeout * HZ;
	add_timer(&exp->timeout);

	exp->id = ++nf_conntrack_expect_next_id;
	atomic_inc(&exp->use);
	NF_CT_STAT_INC(expect_create);
}

/* Race with expectations being used means we could have none to find; OK. */
static void evict_oldest_expect(struct nf_conn *master)
{
	struct nf_conntrack_expect *i;

	list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
		if (i->master == master) {
			if (del_timer(&i->timeout)) {
				nf_ct_unlink_expect(i);
				nf_conntrack_expect_put(i);
			}
			break;
		}
	}
}

static inline int refresh_timer(struct nf_conntrack_expect *i)
{
	struct nf_conn_help *master_help = nfct_help(i->master);

	if (!del_timer(&i->timeout))
		return 0;

	i->timeout.expires = jiffies + master_help->helper->timeout*HZ;
	add_timer(&i->timeout);
	return 1;
}

int nf_conntrack_expect_related(struct nf_conntrack_expect *expect)
{
	struct nf_conntrack_expect *i;
	struct nf_conn *master = expect->master;
	struct nf_conn_help *master_help = nfct_help(master);
	int ret;

	NF_CT_ASSERT(master_help);

	DEBUGP("nf_conntrack_expect_related %p\n", expect);
	DEBUGP("tuple: "); NF_CT_DUMP_TUPLE(&expect->tuple);
	DEBUGP("mask:  "); NF_CT_DUMP_TUPLE(&expect->mask);

	write_lock_bh(&nf_conntrack_lock);
	list_for_each_entry(i, &nf_conntrack_expect_list, list) {
		if (expect_matches(i, expect)) {
			/* Refresh timer: if it's dying, ignore.. */
			if (refresh_timer(i)) {
				ret = 0;
				goto out;
			}
		} else if (expect_clash(i, expect)) {
			ret = -EBUSY;
			goto out;
		}
	}
	/* Will be over limit? */
	if (master_help->helper->max_expected &&
	    master_help->expecting >= master_help->helper->max_expected)
		evict_oldest_expect(master);

	nf_conntrack_expect_insert(expect);
	nf_conntrack_expect_event(IPEXP_NEW, expect);
	ret = 0;
out:
	write_unlock_bh(&nf_conntrack_lock);
	return ret;
}
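/*
 * Illustrative sketch (not part of the original file; example_* is a
 * hypothetical name): the usual helper pattern is to allocate an expectation
 * for the master conntrack, fill in tuple and mask, register it, and drop
 * the local reference again.
 */
#if 0
static int example_expect(struct nf_conn *master)
{
	struct nf_conntrack_expect *exp;
	int ret;

	exp = nf_conntrack_expect_alloc(master);
	if (exp == NULL)
		return -ENOMEM;
	/* ... fill in exp->tuple, exp->mask, exp->expectfn, exp->flags ... */
	ret = nf_conntrack_expect_related(exp);
	nf_conntrack_expect_put(exp);
	return ret;
}
#endif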
int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
{
	int ret;
	BUG_ON(me->timeout == 0);

	ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help",
					  sizeof(struct nf_conn)
					  + sizeof(struct nf_conn_help)
					  + __alignof__(struct nf_conn_help));
	if (ret < 0) {
		printk(KERN_ERR "nf_conntrack_helper_register: Unable to create slab cache for conntracks\n");
		return ret;
	}
	write_lock_bh(&nf_conntrack_lock);
	list_prepend(&helpers, me);
	write_unlock_bh(&nf_conntrack_lock);

	return 0;
}

struct nf_conntrack_helper *
__nf_conntrack_helper_find_byname(const char *name)
{
	struct nf_conntrack_helper *h;

	list_for_each_entry(h, &helpers, list) {
		if (!strcmp(h->name, name))
			return h;
	}

	return NULL;
}

static inline int unhelp(struct nf_conntrack_tuple_hash *i,
			 const struct nf_conntrack_helper *me)
{
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);
	struct nf_conn_help *help = nfct_help(ct);

	if (help && help->helper == me) {
		nf_conntrack_event(IPCT_HELPER, ct);
		help->helper = NULL;
	}
	return 0;
}

void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
{
	unsigned int i;
	struct nf_conntrack_expect *exp, *tmp;

	/* Need write lock here, to delete helper. */
	write_lock_bh(&nf_conntrack_lock);
	LIST_DELETE(&helpers, me);

	/* Get rid of expectations */
	list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) {
		struct nf_conn_help *help = nfct_help(exp->master);
		if (help->helper == me && del_timer(&exp->timeout)) {
			nf_ct_unlink_expect(exp);
			nf_conntrack_expect_put(exp);
		}
	}

	/* Get rid of expecteds, set helpers to NULL. */
	LIST_FIND_W(&unconfirmed, unhelp, struct nf_conntrack_tuple_hash*, me);
	for (i = 0; i < nf_conntrack_htable_size; i++)
		LIST_FIND_W(&nf_conntrack_hash[i], unhelp,
			    struct nf_conntrack_tuple_hash *, me);
	write_unlock_bh(&nf_conntrack_lock);

	/* Someone could be still looking at the helper in a bh. */
	synchronize_net();
}

/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
void __nf_ct_refresh_acct(struct nf_conn *ct,
			  enum ip_conntrack_info ctinfo,
			  const struct sk_buff *skb,
			  unsigned long extra_jiffies,
			  int do_acct)
{
	int event = 0;

	NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
	NF_CT_ASSERT(skb);

	write_lock_bh(&nf_conntrack_lock);

	/* Only update if this is not a fixed timeout */
	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
		write_unlock_bh(&nf_conntrack_lock);
		return;
	}

	/* If not in hash table, timer will not be active yet */
	if (!nf_ct_is_confirmed(ct)) {
		ct->timeout.expires = extra_jiffies;
		event = IPCT_REFRESH;
	} else {
		/* Need del_timer for race avoidance (may already be dying). */
		if (del_timer(&ct->timeout)) {
			ct->timeout.expires = jiffies + extra_jiffies;
			add_timer(&ct->timeout);
			event = IPCT_REFRESH;
		}
	}

#ifdef CONFIG_NF_CT_ACCT
	if (do_acct) {
		ct->counters[CTINFO2DIR(ctinfo)].packets++;
		ct->counters[CTINFO2DIR(ctinfo)].bytes +=
			skb->len - (unsigned int)(skb->nh.raw - skb->data);
		if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
		    || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
			event |= IPCT_COUNTER_FILLING;
	}
#endif

	write_unlock_bh(&nf_conntrack_lock);

	/* must be unlocked when calling event cache */
	if (event)
		nf_conntrack_event_cache(event, skb);
}
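/*
 * Illustrative sketch (not part of the original file; example_* and the
 * 30-second timeout are hypothetical): a protocol tracker typically bumps
 * the timeout and accounts the packet in one call from its ->packet()
 * handler; do_acct only has an effect with CONFIG_NF_CT_ACCT.
 */
#if 0
static void example_refresh(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
			    const struct sk_buff *skb)
{
	__nf_ct_refresh_acct(ct, ctinfo, skb, 30 * HZ, 1);
}
#endif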
#if defined(CONFIG_NF_CT_NETLINK) || \
    defined(CONFIG_NF_CT_NETLINK_MODULE)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
#include <linux/mutex.h>

/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
 * in ip_conntrack_core, since we don't want the protocols to autoload
 * or depend on ctnetlink */
int nf_ct_port_tuple_to_nfattr(struct sk_buff *skb,
			       const struct nf_conntrack_tuple *tuple)
{
	NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
		&tuple->src.u.tcp.port);
	NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
		&tuple->dst.u.tcp.port);
	return 0;

nfattr_failure:
	return -1;
}

static const size_t cta_min_proto[CTA_PROTO_MAX] = {
	[CTA_PROTO_SRC_PORT-1]	= sizeof(u_int16_t),
	[CTA_PROTO_DST_PORT-1]	= sizeof(u_int16_t)
};

int nf_ct_port_nfattr_to_tuple(struct nfattr *tb[],
			       struct nf_conntrack_tuple *t)
{
	if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
		return -EINVAL;

	if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
		return -EINVAL;

	t->src.u.tcp.port =
		*(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
	t->dst.u.tcp.port =
		*(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);

	return 0;
}
#endif

/* Used by ipt_REJECT and ip6t_REJECT. */
void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;

	/* This ICMP is in reverse direction to the packet which caused it */
	ct = nf_ct_get(skb, &ctinfo);
	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
		ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
	else
		ctinfo = IP_CT_RELATED;

	/* Attach to new skbuff, and increment count */
	nskb->nfct = &ct->ct_general;
	nskb->nfctinfo = ctinfo;
	nf_conntrack_get(nskb->nfct);
}

static inline int
do_iter(const struct nf_conntrack_tuple_hash *i,
	int (*iter)(struct nf_conn *i, void *data),
	void *data)
{
	return iter(nf_ct_tuplehash_to_ctrack(i), data);
}

/* Bring out ya dead! */
static struct nf_conntrack_tuple_hash *
get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
		void *data, unsigned int *bucket)
{
	struct nf_conntrack_tuple_hash *h = NULL;

	write_lock_bh(&nf_conntrack_lock);
	for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
		h = LIST_FIND_W(&nf_conntrack_hash[*bucket], do_iter,
				struct nf_conntrack_tuple_hash *, iter, data);
		if (h)
			break;
	}
	if (!h)
		h = LIST_FIND_W(&unconfirmed, do_iter,
				struct nf_conntrack_tuple_hash *, iter, data);
	if (h)
		atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
	write_unlock_bh(&nf_conntrack_lock);

	return h;
}

void
nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data)
{
	struct nf_conntrack_tuple_hash *h;
	unsigned int bucket = 0;

	while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
		/* Time to push up daisies... */
		if (del_timer(&ct->timeout))
			death_by_timeout((unsigned long)ct);
		/* ... else the timer will get him soon. */

		nf_ct_put(ct);
	}
}

static int kill_all(struct nf_conn *i, void *data)
{
	return 1;
}

static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
{
	if (vmalloced)
		vfree(hash);
	else
		free_pages((unsigned long)hash,
			   get_order(sizeof(struct list_head) * size));
}

void nf_conntrack_flush(void)
{
	nf_ct_iterate_cleanup(kill_all, NULL);
}

/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void nf_conntrack_cleanup(void)
{
	int i;

	ip_ct_attach = NULL;

	/* This makes sure all current packets have passed through
	   netfilter framework.  Roll on, two-stage module
	   delete... */
	synchronize_net();

	nf_ct_event_cache_flush();
 i_see_dead_people:
	nf_conntrack_flush();
	if (atomic_read(&nf_conntrack_count) != 0) {
		schedule();
		goto i_see_dead_people;
	}
	/* wait until all references to nf_conntrack_untracked are dropped */
	while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
		schedule();

	for (i = 0; i < NF_CT_F_NUM; i++) {
		if (nf_ct_cache[i].use == 0)
			continue;

		NF_CT_ASSERT(nf_ct_cache[i].use == 1);
		nf_ct_cache[i].use = 1;
		nf_conntrack_unregister_cache(i);
	}
	kmem_cache_destroy(nf_conntrack_expect_cachep);
	free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
			    nf_conntrack_htable_size);

	/* free l3proto protocol tables */
	for (i = 0; i < PF_MAX; i++)
		if (nf_ct_protos[i]) {
			kfree(nf_ct_protos[i]);
			nf_ct_protos[i] = NULL;
		}
}

static struct list_head *alloc_hashtable(int size, int *vmalloced)
{
	struct list_head *hash;
	unsigned int i;

	*vmalloced = 0;
	hash = (void*)__get_free_pages(GFP_KERNEL,
				       get_order(sizeof(struct list_head)
						 * size));
	if (!hash) {
		*vmalloced = 1;
		printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
		hash = vmalloc(sizeof(struct list_head) * size);
	}

	if (hash)
		for (i = 0; i < size; i++)
			INIT_LIST_HEAD(&hash[i]);

	return hash;
}

int set_hashsize(const char *val, struct kernel_param *kp)
{
	int i, bucket, hashsize, vmalloced;
	int old_vmalloced, old_size;
	int rnd;
	struct list_head *hash, *old_hash;
	struct nf_conntrack_tuple_hash *h;

	/* On boot, we can set this without any fancy locking. */
	if (!nf_conntrack_htable_size)
		return param_set_uint(val, kp);

	hashsize = simple_strtol(val, NULL, 0);
	if (!hashsize)
		return -EINVAL;

	hash = alloc_hashtable(hashsize, &vmalloced);
	if (!hash)
		return -ENOMEM;

	/* We have to rehash for the new table anyway, so we also can
	 * use a new random seed */
	get_random_bytes(&rnd, 4);

	write_lock_bh(&nf_conntrack_lock);
	for (i = 0; i < nf_conntrack_htable_size; i++) {
		while (!list_empty(&nf_conntrack_hash[i])) {
			h = list_entry(nf_conntrack_hash[i].next,
				       struct nf_conntrack_tuple_hash, list);
			list_del(&h->list);
			bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
			list_add_tail(&h->list, &hash[bucket]);
		}
	}
	old_size = nf_conntrack_htable_size;
	old_vmalloced = nf_conntrack_vmalloc;
	old_hash = nf_conntrack_hash;

	nf_conntrack_htable_size = hashsize;
	nf_conntrack_vmalloc = vmalloced;
	nf_conntrack_hash = hash;
	nf_conntrack_hash_rnd = rnd;
	write_unlock_bh(&nf_conntrack_lock);

	free_conntrack_hash(old_hash, old_vmalloced, old_size);
	return 0;
}

module_param_call(hashsize, set_hashsize, param_get_uint,
		  &nf_conntrack_htable_size, 0600);
1690{
1691 unsigned int i;
1692 int ret;
1693
1694 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1695 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
1696 if (!nf_conntrack_htable_size) {
1697 nf_conntrack_htable_size
1698 = (((num_physpages << PAGE_SHIFT) / 16384)
1699 / sizeof(struct list_head));
1700 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1701 nf_conntrack_htable_size = 8192;
1702 if (nf_conntrack_htable_size < 16)
1703 nf_conntrack_htable_size = 16;
1704 }
1705 nf_conntrack_max = 8 * nf_conntrack_htable_size;
1706
1707 printk("nf_conntrack version %s (%u buckets, %d max)\n",
1708 NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
1709 nf_conntrack_max);
1710
1711 nf_conntrack_hash = alloc_hashtable(nf_conntrack_htable_size,
1712 &nf_conntrack_vmalloc);
1713 if (!nf_conntrack_hash) {
1714 printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
1715 goto err_out;
1716 }
1717
1718 ret = nf_conntrack_register_cache(NF_CT_F_BASIC, "nf_conntrack:basic",
dc808fe2 1719 sizeof(struct nf_conn));
9fb9cbb1
YK
1720 if (ret < 0) {
1721 printk(KERN_ERR "Unable to create nf_conn slab cache\n");
1722 goto err_free_hash;
1723 }
1724
1725 nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect",
1726 sizeof(struct nf_conntrack_expect),
1727 0, 0, NULL, NULL);
1728 if (!nf_conntrack_expect_cachep) {
1729 printk(KERN_ERR "Unable to create nf_expect slab cache\n");
1730 goto err_free_conntrack_slab;
1731 }
1732
1733 /* Don't NEED lock here, but good form anyway. */
1734 write_lock_bh(&nf_conntrack_lock);
1735 for (i = 0; i < PF_MAX; i++)
1736 nf_ct_l3protos[i] = &nf_conntrack_generic_l3proto;
1737 write_unlock_bh(&nf_conntrack_lock);
1738
7d3cdc6b
YK
1739 /* For use by REJECT target */
1740 ip_ct_attach = __nf_conntrack_attach;
1741
9fb9cbb1
YK
1742 /* Set up fake conntrack:
1743 - to never be deleted, not in any hashes */
1744 atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
1745 /* - and look it like as a confirmed connection */
1746 set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
1747
1748 return ret;
1749
1750err_free_conntrack_slab:
1751 nf_conntrack_unregister_cache(NF_CT_F_BASIC);
1752err_free_hash:
1753 free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1754 nf_conntrack_htable_size);
1755err_out:
1756 return -ENOMEM;
1757}