]> bbs.cooldavid.org Git - net-next-2.6.git/blob - net/core/filter.c
pkt_sched: ingress socket filter by mark
[net-next-2.6.git] / net / core / filter.c
1 /*
2  * Linux Socket Filter - Kernel level socket filtering
3  *
4  * Author:
5  *     Jay Schulist <jschlst@samba.org>
6  *
7  * Based on the design of:
8  *     - The Berkeley Packet Filter
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public License
12  * as published by the Free Software Foundation; either version
13  * 2 of the License, or (at your option) any later version.
14  *
15  * Andi Kleen - Fix a few bad bugs and races.
16  * Kris Katterjohn - Added many additional checks in sk_chk_filter()
17  */
18
19 #include <linux/module.h>
20 #include <linux/types.h>
21 #include <linux/mm.h>
22 #include <linux/fcntl.h>
23 #include <linux/socket.h>
24 #include <linux/in.h>
25 #include <linux/inet.h>
26 #include <linux/netdevice.h>
27 #include <linux/if_packet.h>
28 #include <net/ip.h>
29 #include <net/protocol.h>
30 #include <net/netlink.h>
31 #include <linux/skbuff.h>
32 #include <net/sock.h>
33 #include <linux/errno.h>
34 #include <linux/timer.h>
35 #include <asm/system.h>
36 #include <asm/uaccess.h>
37 #include <asm/unaligned.h>
38 #include <linux/filter.h>
39
40 /* No hurry in this branch */
41 static void *__load_pointer(struct sk_buff *skb, int k)
42 {
43         u8 *ptr = NULL;
44
45         if (k >= SKF_NET_OFF)
46                 ptr = skb_network_header(skb) + k - SKF_NET_OFF;
47         else if (k >= SKF_LL_OFF)
48                 ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
49
50         if (ptr >= skb->head && ptr < skb_tail_pointer(skb))
51                 return ptr;
52         return NULL;
53 }
54
55 static inline void *load_pointer(struct sk_buff *skb, int k,
56                                  unsigned int size, void *buffer)
57 {
58         if (k >= 0)
59                 return skb_header_pointer(skb, k, size, buffer);
60         else {
61                 if (k >= SKF_AD_OFF)
62                         return NULL;
63                 return __load_pointer(skb, k);
64         }
65 }
66
67 /**
68  *      sk_filter - run a packet through a socket filter
69  *      @sk: sock associated with &sk_buff
70  *      @skb: buffer to filter
71  *
72  * Run the filter code and then cut skb->data to correct size returned by
73  * sk_run_filter. If pkt_len is 0 we toss packet. If skb->len is smaller
74  * than pkt_len we keep whole skb->data. This is the socket level
75  * wrapper to sk_run_filter. It returns 0 if the packet should
76  * be accepted or -EPERM if the packet should be tossed.
77  *
78  */
79 int sk_filter(struct sock *sk, struct sk_buff *skb)
80 {
81         int err;
82         struct sk_filter *filter;
83
84         err = security_sock_rcv_skb(sk, skb);
85         if (err)
86                 return err;
87
88         rcu_read_lock_bh();
89         filter = rcu_dereference(sk->sk_filter);
90         if (filter) {
91                 unsigned int pkt_len = sk_run_filter(skb, filter->insns,
92                                 filter->len);
93                 err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
94         }
95         rcu_read_unlock_bh();
96
97         return err;
98 }
99 EXPORT_SYMBOL(sk_filter);
100
101 /**
102  *      sk_run_filter - run a filter on a socket
103  *      @skb: buffer to run the filter on
104  *      @filter: filter to apply
105  *      @flen: length of filter
106  *
107  * Decode and apply filter instructions to the skb->data.
108  * Return length to keep, 0 for none. skb is the data we are
109  * filtering, filter is the array of filter instructions, and
110  * len is the number of filter blocks in the array.
111  */
112 unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
113 {
114         struct sock_filter *fentry;     /* We walk down these */
115         void *ptr;
116         u32 A = 0;                      /* Accumulator */
117         u32 X = 0;                      /* Index Register */
118         u32 mem[BPF_MEMWORDS];          /* Scratch Memory Store */
119         u32 tmp;
120         int k;
121         int pc;
122
123         /*
124          * Process array of filter instructions.
125          */
126         for (pc = 0; pc < flen; pc++) {
127                 fentry = &filter[pc];
128
129                 switch (fentry->code) {
130                 case BPF_ALU|BPF_ADD|BPF_X:
131                         A += X;
132                         continue;
133                 case BPF_ALU|BPF_ADD|BPF_K:
134                         A += fentry->k;
135                         continue;
136                 case BPF_ALU|BPF_SUB|BPF_X:
137                         A -= X;
138                         continue;
139                 case BPF_ALU|BPF_SUB|BPF_K:
140                         A -= fentry->k;
141                         continue;
142                 case BPF_ALU|BPF_MUL|BPF_X:
143                         A *= X;
144                         continue;
145                 case BPF_ALU|BPF_MUL|BPF_K:
146                         A *= fentry->k;
147                         continue;
148                 case BPF_ALU|BPF_DIV|BPF_X:
149                         if (X == 0)
150                                 return 0;
151                         A /= X;
152                         continue;
153                 case BPF_ALU|BPF_DIV|BPF_K:
154                         A /= fentry->k;
155                         continue;
156                 case BPF_ALU|BPF_AND|BPF_X:
157                         A &= X;
158                         continue;
159                 case BPF_ALU|BPF_AND|BPF_K:
160                         A &= fentry->k;
161                         continue;
162                 case BPF_ALU|BPF_OR|BPF_X:
163                         A |= X;
164                         continue;
165                 case BPF_ALU|BPF_OR|BPF_K:
166                         A |= fentry->k;
167                         continue;
168                 case BPF_ALU|BPF_LSH|BPF_X:
169                         A <<= X;
170                         continue;
171                 case BPF_ALU|BPF_LSH|BPF_K:
172                         A <<= fentry->k;
173                         continue;
174                 case BPF_ALU|BPF_RSH|BPF_X:
175                         A >>= X;
176                         continue;
177                 case BPF_ALU|BPF_RSH|BPF_K:
178                         A >>= fentry->k;
179                         continue;
180                 case BPF_ALU|BPF_NEG:
181                         A = -A;
182                         continue;
183                 case BPF_JMP|BPF_JA:
184                         pc += fentry->k;
185                         continue;
186                 case BPF_JMP|BPF_JGT|BPF_K:
187                         pc += (A > fentry->k) ? fentry->jt : fentry->jf;
188                         continue;
189                 case BPF_JMP|BPF_JGE|BPF_K:
190                         pc += (A >= fentry->k) ? fentry->jt : fentry->jf;
191                         continue;
192                 case BPF_JMP|BPF_JEQ|BPF_K:
193                         pc += (A == fentry->k) ? fentry->jt : fentry->jf;
194                         continue;
195                 case BPF_JMP|BPF_JSET|BPF_K:
196                         pc += (A & fentry->k) ? fentry->jt : fentry->jf;
197                         continue;
198                 case BPF_JMP|BPF_JGT|BPF_X:
199                         pc += (A > X) ? fentry->jt : fentry->jf;
200                         continue;
201                 case BPF_JMP|BPF_JGE|BPF_X:
202                         pc += (A >= X) ? fentry->jt : fentry->jf;
203                         continue;
204                 case BPF_JMP|BPF_JEQ|BPF_X:
205                         pc += (A == X) ? fentry->jt : fentry->jf;
206                         continue;
207                 case BPF_JMP|BPF_JSET|BPF_X:
208                         pc += (A & X) ? fentry->jt : fentry->jf;
209                         continue;
210                 case BPF_LD|BPF_W|BPF_ABS:
211                         k = fentry->k;
212 load_w:
213                         ptr = load_pointer(skb, k, 4, &tmp);
214                         if (ptr != NULL) {
215                                 A = get_unaligned_be32(ptr);
216                                 continue;
217                         }
218                         break;
219                 case BPF_LD|BPF_H|BPF_ABS:
220                         k = fentry->k;
221 load_h:
222                         ptr = load_pointer(skb, k, 2, &tmp);
223                         if (ptr != NULL) {
224                                 A = get_unaligned_be16(ptr);
225                                 continue;
226                         }
227                         break;
228                 case BPF_LD|BPF_B|BPF_ABS:
229                         k = fentry->k;
230 load_b:
231                         ptr = load_pointer(skb, k, 1, &tmp);
232                         if (ptr != NULL) {
233                                 A = *(u8 *)ptr;
234                                 continue;
235                         }
236                         break;
237                 case BPF_LD|BPF_W|BPF_LEN:
238                         A = skb->len;
239                         continue;
240                 case BPF_LDX|BPF_W|BPF_LEN:
241                         X = skb->len;
242                         continue;
243                 case BPF_LD|BPF_W|BPF_IND:
244                         k = X + fentry->k;
245                         goto load_w;
246                 case BPF_LD|BPF_H|BPF_IND:
247                         k = X + fentry->k;
248                         goto load_h;
249                 case BPF_LD|BPF_B|BPF_IND:
250                         k = X + fentry->k;
251                         goto load_b;
252                 case BPF_LDX|BPF_B|BPF_MSH:
253                         ptr = load_pointer(skb, fentry->k, 1, &tmp);
254                         if (ptr != NULL) {
255                                 X = (*(u8 *)ptr & 0xf) << 2;
256                                 continue;
257                         }
258                         return 0;
259                 case BPF_LD|BPF_IMM:
260                         A = fentry->k;
261                         continue;
262                 case BPF_LDX|BPF_IMM:
263                         X = fentry->k;
264                         continue;
265                 case BPF_LD|BPF_MEM:
266                         A = mem[fentry->k];
267                         continue;
268                 case BPF_LDX|BPF_MEM:
269                         X = mem[fentry->k];
270                         continue;
271                 case BPF_MISC|BPF_TAX:
272                         X = A;
273                         continue;
274                 case BPF_MISC|BPF_TXA:
275                         A = X;
276                         continue;
277                 case BPF_RET|BPF_K:
278                         return fentry->k;
279                 case BPF_RET|BPF_A:
280                         return A;
281                 case BPF_ST:
282                         mem[fentry->k] = A;
283                         continue;
284                 case BPF_STX:
285                         mem[fentry->k] = X;
286                         continue;
287                 default:
288                         WARN_ON(1);
289                         return 0;
290                 }
291
292                 /*
293                  * Handle ancillary data, which are impossible
294                  * (or very difficult) to get parsing packet contents.
295                  */
296                 switch (k-SKF_AD_OFF) {
297                 case SKF_AD_PROTOCOL:
298                         A = ntohs(skb->protocol);
299                         continue;
300                 case SKF_AD_PKTTYPE:
301                         A = skb->pkt_type;
302                         continue;
303                 case SKF_AD_IFINDEX:
304                         A = skb->dev->ifindex;
305                         continue;
306                 case SKF_AD_MARK:
307                         A = skb->mark;
308                         continue;
309                 case SKF_AD_NLATTR: {
310                         struct nlattr *nla;
311
312                         if (skb_is_nonlinear(skb))
313                                 return 0;
314                         if (A > skb->len - sizeof(struct nlattr))
315                                 return 0;
316
317                         nla = nla_find((struct nlattr *)&skb->data[A],
318                                        skb->len - A, X);
319                         if (nla)
320                                 A = (void *)nla - (void *)skb->data;
321                         else
322                                 A = 0;
323                         continue;
324                 }
325                 case SKF_AD_NLATTR_NEST: {
326                         struct nlattr *nla;
327
328                         if (skb_is_nonlinear(skb))
329                                 return 0;
330                         if (A > skb->len - sizeof(struct nlattr))
331                                 return 0;
332
333                         nla = (struct nlattr *)&skb->data[A];
334                         if (nla->nla_len > A - skb->len)
335                                 return 0;
336
337                         nla = nla_find_nested(nla, X);
338                         if (nla)
339                                 A = (void *)nla - (void *)skb->data;
340                         else
341                                 A = 0;
342                         continue;
343                 }
344                 default:
345                         return 0;
346                 }
347         }
348
349         return 0;
350 }
351 EXPORT_SYMBOL(sk_run_filter);
352
353 /**
354  *      sk_chk_filter - verify socket filter code
355  *      @filter: filter to verify
356  *      @flen: length of filter
357  *
358  * Check the user's filter code. If we let some ugly
359  * filter code slip through kaboom! The filter must contain
360  * no references or jumps that are out of range, no illegal
361  * instructions, and must end with a RET instruction.
362  *
363  * All jumps are forward as they are not signed.
364  *
365  * Returns 0 if the rule set is legal or -EINVAL if not.
366  */
367 int sk_chk_filter(struct sock_filter *filter, int flen)
368 {
369         struct sock_filter *ftest;
370         int pc;
371
372         if (flen == 0 || flen > BPF_MAXINSNS)
373                 return -EINVAL;
374
375         /* check the filter code now */
376         for (pc = 0; pc < flen; pc++) {
377                 ftest = &filter[pc];
378
379                 /* Only allow valid instructions */
380                 switch (ftest->code) {
381                 case BPF_ALU|BPF_ADD|BPF_K:
382                 case BPF_ALU|BPF_ADD|BPF_X:
383                 case BPF_ALU|BPF_SUB|BPF_K:
384                 case BPF_ALU|BPF_SUB|BPF_X:
385                 case BPF_ALU|BPF_MUL|BPF_K:
386                 case BPF_ALU|BPF_MUL|BPF_X:
387                 case BPF_ALU|BPF_DIV|BPF_X:
388                 case BPF_ALU|BPF_AND|BPF_K:
389                 case BPF_ALU|BPF_AND|BPF_X:
390                 case BPF_ALU|BPF_OR|BPF_K:
391                 case BPF_ALU|BPF_OR|BPF_X:
392                 case BPF_ALU|BPF_LSH|BPF_K:
393                 case BPF_ALU|BPF_LSH|BPF_X:
394                 case BPF_ALU|BPF_RSH|BPF_K:
395                 case BPF_ALU|BPF_RSH|BPF_X:
396                 case BPF_ALU|BPF_NEG:
397                 case BPF_LD|BPF_W|BPF_ABS:
398                 case BPF_LD|BPF_H|BPF_ABS:
399                 case BPF_LD|BPF_B|BPF_ABS:
400                 case BPF_LD|BPF_W|BPF_LEN:
401                 case BPF_LD|BPF_W|BPF_IND:
402                 case BPF_LD|BPF_H|BPF_IND:
403                 case BPF_LD|BPF_B|BPF_IND:
404                 case BPF_LD|BPF_IMM:
405                 case BPF_LDX|BPF_W|BPF_LEN:
406                 case BPF_LDX|BPF_B|BPF_MSH:
407                 case BPF_LDX|BPF_IMM:
408                 case BPF_MISC|BPF_TAX:
409                 case BPF_MISC|BPF_TXA:
410                 case BPF_RET|BPF_K:
411                 case BPF_RET|BPF_A:
412                         break;
413
414                 /* Some instructions need special checks */
415
416                 case BPF_ALU|BPF_DIV|BPF_K:
417                         /* check for division by zero */
418                         if (ftest->k == 0)
419                                 return -EINVAL;
420                         break;
421
422                 case BPF_LD|BPF_MEM:
423                 case BPF_LDX|BPF_MEM:
424                 case BPF_ST:
425                 case BPF_STX:
426                         /* check for invalid memory addresses */
427                         if (ftest->k >= BPF_MEMWORDS)
428                                 return -EINVAL;
429                         break;
430
431                 case BPF_JMP|BPF_JA:
432                         /*
433                          * Note, the large ftest->k might cause loops.
434                          * Compare this with conditional jumps below,
435                          * where offsets are limited. --ANK (981016)
436                          */
437                         if (ftest->k >= (unsigned)(flen-pc-1))
438                                 return -EINVAL;
439                         break;
440
441                 case BPF_JMP|BPF_JEQ|BPF_K:
442                 case BPF_JMP|BPF_JEQ|BPF_X:
443                 case BPF_JMP|BPF_JGE|BPF_K:
444                 case BPF_JMP|BPF_JGE|BPF_X:
445                 case BPF_JMP|BPF_JGT|BPF_K:
446                 case BPF_JMP|BPF_JGT|BPF_X:
447                 case BPF_JMP|BPF_JSET|BPF_K:
448                 case BPF_JMP|BPF_JSET|BPF_X:
449                         /* for conditionals both must be safe */
450                         if (pc + ftest->jt + 1 >= flen ||
451                             pc + ftest->jf + 1 >= flen)
452                                 return -EINVAL;
453                         break;
454
455                 default:
456                         return -EINVAL;
457                 }
458         }
459
460         return (BPF_CLASS(filter[flen - 1].code) == BPF_RET) ? 0 : -EINVAL;
461 }
462 EXPORT_SYMBOL(sk_chk_filter);
463
464 /**
465  *      sk_filter_rcu_release: Release a socket filter by rcu_head
466  *      @rcu: rcu_head that contains the sk_filter to free
467  */
468 static void sk_filter_rcu_release(struct rcu_head *rcu)
469 {
470         struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
471
472         sk_filter_release(fp);
473 }
474
475 static void sk_filter_delayed_uncharge(struct sock *sk, struct sk_filter *fp)
476 {
477         unsigned int size = sk_filter_len(fp);
478
479         atomic_sub(size, &sk->sk_omem_alloc);
480         call_rcu_bh(&fp->rcu, sk_filter_rcu_release);
481 }
482
/**
 *	sk_attach_filter - attach a socket filter
 *	@fprog: the filter program
 *	@sk: the socket to use
 *
 * Attach the user's filter code. We first run some sanity checks on
 * it to make sure it does not explode on us later. If an error
 * occurs or there is insufficient memory for the filter a negative
 * errno code is returned. On success the return is zero.
 */
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
	struct sk_filter *fp, *old_fp;
	unsigned int fsize = sizeof(struct sock_filter) * fprog->len;
	int err;

	/* Make sure new filter is there and in the right amounts. */
	if (fprog->filter == NULL)
		return -EINVAL;

	/* One allocation holds the sk_filter header plus the insns array,
	 * charged against the socket's option memory. */
	fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL);
	if (!fp)
		return -ENOMEM;
	/* refcnt is not initialized yet, so on failure free the raw
	 * allocation directly rather than via sk_filter_uncharge(). */
	if (copy_from_user(fp->insns, fprog->filter, fsize)) {
		sock_kfree_s(sk, fp, fsize+sizeof(*fp));
		return -EFAULT;
	}

	atomic_set(&fp->refcnt, 1);
	fp->len = fprog->len;

	err = sk_chk_filter(fp->insns, fp->len);
	if (err) {
		/* refcnt is valid now: drop our single reference. */
		sk_filter_uncharge(sk, fp);
		return err;
	}

	/* Publish the new filter; concurrent readers run under
	 * rcu_read_lock_bh(), so the old filter may only be released
	 * after a grace period (sk_filter_delayed_uncharge). */
	rcu_read_lock_bh();
	old_fp = rcu_dereference(sk->sk_filter);
	rcu_assign_pointer(sk->sk_filter, fp);
	rcu_read_unlock_bh();

	if (old_fp)
		sk_filter_delayed_uncharge(sk, old_fp);
	return 0;
}
529
530 int sk_detach_filter(struct sock *sk)
531 {
532         int ret = -ENOENT;
533         struct sk_filter *filter;
534
535         rcu_read_lock_bh();
536         filter = rcu_dereference(sk->sk_filter);
537         if (filter) {
538                 rcu_assign_pointer(sk->sk_filter, NULL);
539                 sk_filter_delayed_uncharge(sk, filter);
540                 ret = 0;
541         }
542         rcu_read_unlock_bh();
543         return ret;
544 }