/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Version:	$Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case the packet is not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen	:	Fix broken PMTU recovery and remove
 *					some redundant tests.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/config.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl = IPDEFTTL;

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	newskb->mac.raw = newskb->data;
	__skb_pull(newskb, newskb->nh.raw - newskb->data);
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	BUG_TRAP(newskb->dst);
	netif_rx(newskb);
	return 0;
}

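/*
 * Choose the TTL for an outgoing packet: the socket's unicast TTL if it
 * has been set, otherwise the route's hop-limit metric.
 */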
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = dst_metric(dst, RTAX_HOPLIMIT);
	return ttl;
}

/*
 *		Add an ip header to a skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  u32 saddr, u32 daddr, struct ip_options *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = (struct rtable *)skb->dst;
	struct iphdr *iph;

	/* Build the IP header. */
	if (opt)
		iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr) + opt->optlen);
	else
		iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr));

	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	if (ip_dont_fragment(sk, &rt->u.dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
	iph->daddr    = rt->rt_dst;
	iph->saddr    = rt->rt_src;
	iph->protocol = sk->sk_protocol;
	iph->tot_len  = htons(skb->len);
	ip_select_ident(iph, &rt->u.dst, sk);
	skb->nh.iph   = iph;

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, daddr, rt, 0);
	}
	ip_send_check(iph);

	skb->priority = sk->sk_priority;

	/* Send it out. */
	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		       dst_output);
}

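/*
 * Final transmit step: make sure the skb has enough headroom for the
 * link-layer header, then use the cached hardware header (hh) if one
 * exists, or fall back to the neighbour output function.
 */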
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct hh_cache *hh = dst->hh;
	struct net_device *dev = dst->dev;
	int hh_len = LL_RESERVED_SPACE(dev);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	if (hh) {
		int hh_alen;

		read_lock_bh(&hh->hh_lock);
		hh_alen = HH_DATA_ALIGN(hh->hh_len);
		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
		read_unlock_bh(&hh->hh_lock);
		skb_push(skb, hh->hh_len);
		return hh->hh_output(skb);
	} else if (dst->neighbour)
		return dst->neighbour->output(skb);

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}

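/*
 * Attach the output device and protocol, then pass the packet through
 * the POST_ROUTING netfilter hook on its way to ip_finish_output2().
 */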
static int ip_finish_output(struct sk_buff *skb)
{
	struct net_device *dev = skb->dst->dev;

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
		       ip_finish_output2);
}

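/*
 * Output path for multicast and broadcast packets: loop copies back to
 * local listeners where required, then transmit on the wire.
 */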
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = (struct rtable *)skb->dst;
	struct net_device *dev = rt->u.dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users.
	 */

	if (rt->rt_flags & RTCF_MULTICAST) {
		if ((!sk || inet_sk(sk)->mc_loop)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames
		   that came back after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    && ((rt->rt_flags & RTCF_LOCAL) || !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
					newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host. */

		if (skb->nh.iph->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags & RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
				newskb->dev, ip_dev_loopback_xmit);
	}

	if (skb->len > dst_mtu(&rt->u.dst))
		return ip_fragment(skb, ip_finish_output);
	else
		return ip_finish_output(skb);
}

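/*
 * Standard output path for unicast packets: fragment if the packet is
 * larger than the path MTU and TSO is not in use, otherwise send as is.
 */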
int ip_output(struct sk_buff *skb)
{
	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

	if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size)
		return ip_fragment(skb, ip_finish_output);
	else
		return ip_finish_output(skb);
}

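/*
 * Queue a packet for transmission on a connected socket (e.g. TCP):
 * route it if it is not already routed, build the IP header, and hand
 * it to the LOCAL_OUT netfilter hook and dst_output().
 */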
int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options *opt = inet->opt;
	struct rtable *rt;
	struct iphdr *iph;

	/* Skip all of this if the packet is already routed,
	 * e.g. by something like SCTP.
	 */
	rt = (struct rtable *)skb->dst;
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		u32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->daddr;
		if (opt && opt->srr)
			daddr = opt->faddr;

		{
			struct flowi fl = { .oif = sk->sk_bound_dev_if,
					    .nl_u = { .ip4_u =
						      { .daddr = daddr,
							.saddr = inet->saddr,
							.tos = RT_CONN_FLAGS(sk) } },
					    .proto = sk->sk_protocol,
					    .uli_u = { .ports =
						       { .sport = inet->sport,
							 .dport = inet->dport } } };

			/* If this fails, the retransmit mechanism of the transport
			 * layer will keep trying until the route appears or the
			 * connection times itself out.
			 */
			if (ip_route_output_flow(&rt, &fl, sk, 0))
				goto no_route;
		}
		sk_setup_caps(sk, &rt->u.dst);
	}
	skb->dst = dst_clone(&rt->u.dst);

packet_routed:
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
	*((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	iph->tot_len = htons(skb->len);
	if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
	iph->protocol = sk->sk_protocol;
	iph->saddr    = rt->rt_src;
	iph->daddr    = rt->rt_dst;
	skb->nh.iph   = iph;
	/* The transport layer sets skb->h.foo itself. */

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, inet->daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);

	/* Add an IP checksum. */
	ip_send_check(iph);

	skb->priority = sk->sk_priority;

	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		       dst_output);

no_route:
	IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}

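/*
 * Copy the per-packet metadata that every fragment must inherit from
 * the original skb (priority, device, netfilter and traffic-control state).
 */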
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
	to->nfmark = from->nfmark;
	/* Connection association is same as pre-frag packet */
	nf_conntrack_put(to->nfct);
	to->nfct = from->nfct;
	nf_conntrack_get(to->nfct);
	to->nfctinfo = from->nfctinfo;
#ifdef CONFIG_BRIDGE_NETFILTER
	nf_bridge_put(to->nf_bridge);
	to->nf_bridge = from->nf_bridge;
	nf_bridge_get(to->nf_bridge);
#endif
#endif
}

/*
 *	This IP datagram is too large to be sent in one piece.  Break it up
 *	into smaller pieces (each of size equal to the IP header plus a block
 *	of the data of the original IP data part) that will still fit in a
 *	single device frame, and queue such a frame for sending.
 */

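/*
 * As a worked example of the slow path below: fragmenting a 4000-byte
 * datagram with a 20-byte header over a 1500-byte MTU leaves 1480 bytes
 * of data space per fragment, so the pieces carry 1480, 1480 and 1020
 * bytes of data at 8-byte-unit offsets 0, 185 and 370, with MF set on
 * every fragment except the last.
 */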
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int raw = 0;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	int not_last_frag;
	struct rtable *rt = (struct rtable *)skb->dst;
	int err = 0;

	dev = rt->u.dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = skb->nh.iph;

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(dst_mtu(&rt->u.dst)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->u.dst) - hlen;	/* Size of data space */

	/* When frag_list is given, use it.  First, check its validity:
	 * some transformers could create a wrong frag_list or break an
	 * existing one; that is not prohibited.  In this case fall back
	 * to copying.
	 *
	 * LATER: this step can be merged into the real generation of
	 * fragments; we can switch to copying when we see the first bad
	 * fragment.
	 */
	if (skb_shinfo(skb)->frag_list) {
		struct sk_buff *frag;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				skb->truesize -= frag->truesize;
			}
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				frag->h.raw = frag->data;
				frag->nh.raw = __skb_push(frag, hlen);
				memcpy(frag->nh.raw, iph, hlen);
				iph = frag->nh.iph;
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset >> 3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = raw + hlen;		/* Where to start from */

#ifdef CONFIG_BRIDGE_NETFILTER
	/* for bridged IP traffic encapsulated inside e.g. a vlan header,
	 * we need to make room for the encapsulating header */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
	mtu -= nf_bridge_pad(skb);
#else
	ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
#endif
	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb2->nh.raw = skb2->data;
		skb2->h.raw = skb2->data + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		memcpy(skb2->nh.raw, skb->data, hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = skb2->nh.iph;
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC: if we are fragmenting a fragment that's not
		 *	the last fragment then keep MF set on each fragment.
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */

		IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);

		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;
	}
	kfree_skb(skb);
	IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	return err;
}

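/*
 * getfrag() callback for ip_append_data(): copies iovec data into the
 * skb, accumulating the checksum on the fly unless the hardware will
 * checksum the packet (CHECKSUM_HW).
 */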
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_HW) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		unsigned int csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}

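/* Checksum a region of a page that is about to be attached as a fragment. */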
static inline unsigned int
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	unsigned int csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data.  Each piece is held on the socket until
 *	ip_push_pending_frames() is called.  Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP; other transport protocols - e.g. raw sockets -
 *	can potentially use this interface.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable *rt,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = NULL;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;

	if (flags & MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking.
		 */
		opt = ipc->opt;
		if (opt) {
			if (inet->cork.opt == NULL) {
				inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
				if (unlikely(inet->cork.opt == NULL))
					return -ENOBUFS;
			}
			memcpy(inet->cork.opt, opt, sizeof(struct ip_options) + opt->optlen);
			inet->cork.flags |= IPCORK_OPT;
			inet->cork.addr = ipc->addr;
		}
		dst_hold(&rt->u.dst);
		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
		inet->cork.rt = rt;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		if ((exthdrlen = rt->u.dst.header_len) != 0) {
			length += exthdrlen;
			transhdrlen += exthdrlen;
		}
	} else {
		rt = inet->cork.rt;
		if (inet->cork.flags & IPCORK_OPT)
			opt = inet->cork.opt;

		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}
	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu - exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it not to be fragmented later.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->u.dst.dev->features & (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
	    !exthdrlen)
		csummode = CHECKSUM_HW;

	inet->cork.length += length;

	/* So, what's going on in the loop below?
	 *
	 * We use the calculated fragment length to generate a chained skb;
	 * each of its segments is an IP fragment ready for sending to the
	 * network after adding an appropriate IP header.
	 */

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea which fragment will be
			 * the last.
			 */
			if (datalen == length)
				alloclen += rt->u.dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen);
			skb->nh.raw = data + exthdrlen;
			data += fragheaderlen;
			skb->h.raw = data + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				skb_trim(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
				skb->truesize += PAGE_SIZE;
				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page) + frag->page_offset + frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	inet->cork.length -= length;
	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}

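/*
 * A typical caller of this interface (a minimal, UDP-style sketch that
 * assumes 'rt' and 'ipc' have already been set up by the caller) corks
 * data onto the socket and then pushes or flushes it:
 *
 *	err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len,
 *			     sizeof(struct udphdr), &ipc, rt, msg->msg_flags);
 *	if (err)
 *		ip_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip_push_pending_frames(sk);
 */
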
ssize_t ip_append_page(struct sock *sk, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags & MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	rt = inet->cork.rt;
	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	if (!(rt->u.dst.dev->features & NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
	mtu = inet->cork.fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	inet->cork.length += size;

	while (size > 0) {
		int i;

		/* Check if the remaining data fits into current packet. */
		len = mtu - skb->len;
		if (len < size)
			len = maxfraglen - skb->len;
		if (len <= 0) {
			struct sk_buff *skb_prev;
			char *data;
			struct iphdr *iph;
			int alloclen;

			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fragheaderlen + fraggap);
			skb->nh.iph = iph = (struct iphdr *)data;
			data += fragheaderlen;
			skb->h.raw = data;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				skb_trim(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_shinfo(skb)->frags[i-1].size += len;
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			unsigned int csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		offset += len;
		size -= len;
	}
	return 0;

error:
	inet->cork.length -= size;
	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}

/*
 *	Combine all pending IP fragments on the socket into one IP datagram
 *	and push them out.
 */
int ip_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = inet->cork.rt;
	struct iphdr *iph;
	int df = 0;
	__u8 ttl;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb->nh.raw)
		__skb_pull(skb, skb->nh.raw - skb->data);
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we
	 * allow fragmenting the frame generated here.  No matter how
	 * transforms change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc != IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow this frame to be
	 * fragmented locally. */
	if (inet->pmtudisc == IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->u.dst) &&
	     ip_dont_fragment(sk, &rt->u.dst)))
		df = htons(IP_DF);

	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->u.dst);

	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	if (opt) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, inet->cork.addr, rt, 0);
	}
	iph->tos = inet->tos;
	iph->tot_len = htons(skb->len);
	iph->frag_off = df;
	if (!df) {
		__ip_select_ident(iph, &rt->u.dst, 0);
	} else {
		iph->id = htons(inet->id++);
	}
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	iph->saddr = rt->rt_src;
	iph->daddr = rt->rt_dst;
	ip_send_check(iph);

	skb->priority = sk->sk_priority;
	skb->dst = dst_clone(&rt->u.dst);

	/* Netfilter gets the whole, unfragmented skb. */
	err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
		      skb->dst->dev, dst_output);
	if (err) {
		if (err > 0)
			err = inet->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	inet->cork.flags &= ~IPCORK_OPT;
	if (inet->cork.opt) {
		kfree(inet->cork.opt);
		inet->cork.opt = NULL;
	}
	if (inet->cork.rt) {
		ip_rt_put(inet->cork.rt);
		inet->cork.rt = NULL;
	}
	return err;

error:
	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	goto out;
}

/*
 *	Throw away all pending data on the socket.
 */
void ip_flush_pending_frames(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
		kfree_skb(skb);

	inet->cork.flags &= ~IPCORK_OPT;
	if (inet->cork.opt) {
		kfree(inet->cork.opt);
		inet->cork.opt = NULL;
	}
	if (inet->cork.rt) {
		ip_rt_put(inet->cork.rt);
		inet->cork.rt = NULL;
	}
}

/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	unsigned int csum;

	csum = csum_partial_copy_nocheck(dptr + offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}

/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send TCP resets so far.  ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 *
 *	LATER: switch from ip_build_xmit to ip_append_*
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
		   unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct {
		struct ip_options	opt;
		char			data[40];
	} replyopts;
	struct ipcm_cookie ipc;
	u32 daddr;
	struct rtable *rt = (struct rtable *)skb->dst;

	if (ip_options_echo(&replyopts.opt, skb))
		return;

	daddr = ipc.addr = rt->rt_src;
	ipc.opt = NULL;

	if (replyopts.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (ipc.opt->srr)
			daddr = replyopts.opt.faddr;
	}

	{
		struct flowi fl = { .nl_u = { .ip4_u =
					      { .daddr = daddr,
						.saddr = rt->rt_spec_dst,
						.tos = RT_TOS(skb->nh.iph->tos) } },
				    /* Not quite clean, but right. */
				    .uli_u = { .ports =
					       { .sport = skb->h.th->dest,
						 .dport = skb->h.th->source } },
				    .proto = sk->sk_protocol };
		if (ip_route_output_key(&rt, &fl))
			return;
	}

	/* And let IP do all the hard work.
	 *
	 * This chunk is not reentrant, hence the spinlock.  Note that it
	 * relies on the fact that this function is called with BH locally
	 * disabled and that sk cannot already be spinlocked.
	 */
	bh_lock_sock(sk);
	inet->tos = skb->nh.iph->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = skb->nh.iph->protocol;
	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk);
	}

	bh_unlock_sock(sk);

	ip_rt_put(rt);
}

void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}

EXPORT_SYMBOL(ip_fragment);
EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);