/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetics in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

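/*
 * Pick a fragment identification value. A single global counter is
 * shared by all flows; the spinlock keeps the read-and-increment
 * atomic and the counter skips zero, so an ID of 0 never goes out
 * on the wire.
 */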
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
        static u32 ipv6_fragmentation_id = 1;
        static DEFINE_SPINLOCK(ip6_id_lock);

        spin_lock_bh(&ip6_id_lock);
        fhdr->identification = htonl(ipv6_fragmentation_id);
        if (++ipv6_fragmentation_id == 0)
                ipv6_fragmentation_id = 1;
        spin_unlock_bh(&ip6_id_lock);
}

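/*
 * Final output step: use the cached hardware header if the dst has
 * one, otherwise hand the skb to the neighbour's output function.
 * With neither available there is nowhere to send the packet.
 */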
static inline int ip6_output_finish(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct hh_cache *hh = dst->hh;

        if (hh) {
                int hh_alen;

                read_lock_bh(&hh->hh_lock);
                hh_alen = HH_DATA_ALIGN(hh->hh_len);
                memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
                read_unlock_bh(&hh->hh_lock);
                skb_push(skb, hh->hh_len);
                return hh->hh_output(skb);
        } else if (dst->neighbour)
                return dst->neighbour->output(skb);

        IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
        newskb->mac.raw = newskb->data;
        __skb_pull(newskb, newskb->nh.raw - newskb->data);
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        BUG_TRAP(newskb->dst);

        netif_rx(newskb);
        return 0;
}

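/*
 * Post-routing output. Multicast packets destined to a group the
 * local machine has joined are looped back via a clone before the
 * original is passed to the POST_ROUTING netfilter hook.
 */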
static int ip6_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct net_device *dev = dst->dev;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
                struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

                if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
                    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
                                &skb->nh.ipv6h->saddr)) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
                                        newskb->dev,
                                        ip6_dev_loopback_xmit);

                        if (skb->nh.ipv6h->hop_limit == 0) {
                                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
        }

        return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
                       ip6_output_finish);
}

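/* Fragment when the packet exceeds the path MTU (and is not UFO),
 * or when the route demands fragmentation of every packet. */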
int ip6_output(struct sk_buff *skb)
{
        if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) ||
            dst_allfrag(skb->dst))
                return ip6_fragment(skb, ip6_output2);
        else
                return ip6_output2(skb);
}

/*
 *      xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
             struct ipv6_txoptions *opt, int ipfragok)
{
        struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
        struct in6_addr *first_hop = &fl->fl6_dst;
        struct dst_entry *dst = skb->dst;
        struct ipv6hdr *hdr;
        u8  proto = fl->proto;
        int seg_len = skb->len;
        int hlimit, tclass;
        u32 mtu;

        if (opt) {
                int head_room;

                /* First: exthdrs may take lots of space (~8K for now);
                   MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        kfree_skb(skb);
                        skb = skb2;
                        if (skb == NULL) {
                                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                                return -ENOBUFS;
                        }
                        if (sk)
                                skb_set_owner_w(skb, sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
        }

        hdr = skb->nh.ipv6h = (struct ipv6hdr *)skb_push(skb, sizeof(struct ipv6hdr));

        /*
         *      Fill in the IPv6 header
         */

        hlimit = -1;
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = dst_metric(dst, RTAX_HOPLIMIT);
        if (hlimit < 0)
                hlimit = ipv6_get_hoplimit(dst->dev);

        tclass = -1;
        if (np)
                tclass = np->tclass;
        if (tclass < 0)
                tclass = 0;

        *(u32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, first_hop);

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || ipfragok) {
                IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
                return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
                               dst_output);
        }

        if (net_ratelimit())
                printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
        skb->dev = dst->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}

/*
 *      To avoid extra problems ND packets are sent through this
 *      routine. It's code duplication but I really want to avoid
 *      extra checks since ipv6_build_header is used by TCP (which
 *      is performance critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
               struct in6_addr *saddr, struct in6_addr *daddr,
               int proto, int len)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;
        int totlen;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        totlen = len + sizeof(struct ipv6hdr);

        hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
        skb->nh.ipv6h = hdr;

        *(u32 *)hdr = htonl(0x60000000);

        hdr->payload_len = htons(len);
        hdr->nexthdr = proto;
        hdr->hop_limit = np->hop_limit;

        ipv6_addr_copy(&hdr->saddr, saddr);
        ipv6_addr_copy(&hdr->daddr, daddr);

        return 0;
}

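/*
 * Deliver a packet carrying a Router Alert option to every raw
 * socket registered for that alert value. Returns 1 if the skb was
 * consumed by a listener, 0 otherwise.
 */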
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
        return dst_output(skb);
}

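/*
 * Forwarding path: policy and hop limit checks, optional redirect
 * generation, path MTU enforcement, then the FORWARD netfilter hook.
 */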
int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct ipv6hdr *hdr = skb->nh.ipv6h;
        struct inet6_skb_parm *opt = IP6CB(skb);

        if (ipv6_devconf.forwarding == 0)
                goto error;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb->ip_summed = CHECKSUM_NONE;

        /*
         *      We do not process RA packets; they are pushed to user
         *      level AS IS without any warranty that the application
         *      will be able to interpret them. The reason is that we
         *      cannot make anything clever here.
         *
         *      We are not an end node, so if the packet contains
         *      AH/ESP we cannot do anything either.
         *      Defragmentation would also be a mistake; RA packets
         *      cannot be fragmented, because there is no warranty
         *      that different fragments will go along one path. --ANK
         */
        if (opt->ra) {
                u8 *ptr = skb->nh.raw + opt->ra;
                if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
                        return 0;
        }

        /*
         *      check and decrement hop limit
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
                            0, skb->dev);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        if (!xfrm6_route_forward(skb)) {
                IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb->dst;

        /* The IPv6 specs say nothing about it, but it is clear that we
           cannot send redirects for source routed frames.
         */
        if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
                struct in6_addr *target = NULL;
                struct rt6_info *rt;
                struct neighbour *n = dst->neighbour;

                /*
                 *      incoming and outgoing devices are the same
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if ((rt->rt6i_flags & RTF_GATEWAY))
                        target = (struct in6_addr *)&n->primary_key;
                else
                        target = &hdr->daddr;

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect)
                 */
                if (xrlim_allow(dst, 1*HZ))
                        ndisc_send_redirect(skb, n, target);
        } else if (ipv6_addr_type(&hdr->saddr) & (IPV6_ADDR_MULTICAST |
                                                  IPV6_ADDR_LOOPBACK |
                                                  IPV6_ADDR_LINKLOCAL)) {
                /* This check is security critical. */
                goto error;
        }

        if (skb->len > dst_mtu(dst)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
                IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
                IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = skb->nh.ipv6h;

        /* Mangling the hop limit is delayed to this point, after skb COW */

        hdr->hop_limit--;

        IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
        return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

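/* Propagate per-packet metadata (dst, priority, netfilter and
 * traffic-control state) from the original skb to a fragment. */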
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        dst_release(to->dst);
        to->dst = dst_clone(from->dst);
        to->dev = from->dev;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
        to->nfmark = from->nfmark;
        /* Connection association is same as pre-frag packet */
        nf_conntrack_put(to->nfct);
        to->nfct = from->nfct;
        nf_conntrack_get(to->nfct);
        to->nfctinfo = from->nfctinfo;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        nf_conntrack_put_reasm(to->nfct_reasm);
        to->nfct_reasm = from->nfct_reasm;
        nf_conntrack_get_reasm(to->nfct_reasm);
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
        nf_bridge_put(to->nf_bridge);
        to->nf_bridge = from->nf_bridge;
        nf_bridge_get(to->nf_bridge);
#endif
#endif
}

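/*
 * Locate the end of the unfragmentable part of the packet: skip
 * hop-by-hop, routing and the destination options that precede a
 * routing header. Returns the offset at which a fragment header
 * may be inserted and points *nexthdr at the preceding "next
 * header" field.
 */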
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
        u16 offset = sizeof(struct ipv6hdr);
        struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr *)(skb->nh.ipv6h + 1);
        unsigned int packet_len = skb->tail - skb->nh.raw;
        int found_rhdr = 0;
        *nexthdr = &skb->nh.ipv6h->nexthdr;

        while (offset + 1 <= packet_len) {

                switch (**nexthdr) {

                case NEXTHDR_HOP:
                case NEXTHDR_ROUTING:
                case NEXTHDR_DEST:
                        if (**nexthdr == NEXTHDR_ROUTING)
                                found_rhdr = 1;
                        if (**nexthdr == NEXTHDR_DEST && found_rhdr)
                                return offset;
                        offset += ipv6_optlen(exthdr);
                        *nexthdr = &exthdr->nexthdr;
                        exthdr = (struct ipv6_opt_hdr *)(skb->nh.raw + offset);
                        break;
                default:
                        return offset;
                }
        }

        return offset;
}

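/*
 * Split an oversized skb into fragments. The fast path reuses an
 * existing frag_list when the geometry already fits; otherwise the
 * slow path copies the payload into freshly allocated fragments.
 */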
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
        struct net_device *dev;
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb->dst;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        u32 frag_id = 0;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;

        dev = rt->u.dst.dev;
        hlen = ip6_find_1stfragopt(skb, &prevhdr);
        nexthdr = *prevhdr;

        mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);

        if (skb_shinfo(skb)->frag_list) {
                int first_len = skb_pagelen(skb);

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb))
                        goto slow_path;

                for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                sock_hold(skb->sk);
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                                skb->truesize -= frag->truesize;
                        }
                }

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_shinfo(skb)->frag_list = NULL;
                /* BUILD HEADER */

                tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                        return -ENOMEM;
                }

                *prevhdr = NEXTHDR_FRAGMENT;
                memcpy(tmp_hdr, skb->nh.raw, hlen);
                __skb_pull(skb, hlen);
                fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
                skb->nh.raw = __skb_push(skb, hlen);
                memcpy(skb->nh.raw, tmp_hdr, hlen);

                ipv6_select_ident(skb, fh);
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                frag_id = fh->identification;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));

                for (;;) {
                        /* Prepare header of the next frame,
                         * before previous one went down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                frag->h.raw = frag->data;
                                fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
                                frag->nh.raw = __skb_push(frag, hlen);
                                memcpy(frag->nh.raw, tmp_hdr, hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next != NULL)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(skb);
                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }

                IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                return err;
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        *prevhdr = NEXTHDR_FRAGMENT;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *      Allocate buffer.
                 */

                if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
                                      LL_RESERVED_SPACE(rt->u.dst.dev),
                                      GFP_ATOMIC)) == NULL) {
                        NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
                        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                frag->nh.raw = frag->data;
                fh = (struct frag_hdr *)(frag->data + hlen);
                frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                memcpy(frag->nh.raw, skb->data, hlen);

                /*
                 *      Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                if (!frag_id) {
                        ipv6_select_ident(skb, fh);
                        frag_id = fh->identification;
                } else
                        fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, frag->h.raw, len))
                        BUG();
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */

                IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);

                err = output(frag);
                if (err)
                        goto fail;
        }
        kfree_skb(skb);
        IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
        return err;

fail:
        kfree_skb(skb);
        IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        return err;
}

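/*
 * Resolve the route for a flow. A cached socket dst is reused when
 * it is still valid for this destination and interface; otherwise a
 * fresh routing lookup is done and, if needed, a source address is
 * selected.
 */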
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
        int err = 0;

        *dst = NULL;
        if (sk) {
                struct ipv6_pinfo *np = inet6_sk(sk);

                *dst = sk_dst_check(sk, np->dst_cookie);
                if (*dst) {
                        struct rt6_info *rt = (struct rt6_info *)*dst;

                        /* Yes, checking route validity in the unconnected
                           case is not very simple. Take into account that
                           we do not support routing by source, TOS, and
                           MSG_DONTROUTE            --ANK (980726)

                           1. If route was host route, check that
                              cached destination is current.
                              If it is network route, we still may
                              check its validity using saved pointer
                              to the last used address: daddr_cache.
                              We do not want to save whole address now,
                              (because the main consumer of this service
                               is tcp, which does not have this problem),
                              so that the last trick works only on connected
                              sockets.
                           2. oif also should be the same.
                         */

                        if (((rt->rt6i_dst.plen != 128 ||
                              !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
                             && (np->daddr_cache == NULL ||
                                 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
                            || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
                                dst_release(*dst);
                                *dst = NULL;
                        }
                }
        }

        if (*dst == NULL)
                *dst = ip6_route_output(sk, fl);

        if ((err = (*dst)->error))
                goto out_err_release;

        if (ipv6_addr_any(&fl->fl6_src)) {
                err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);

                if (err)
                        goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;
        return err;
}
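
/*
 * Queue data for a UFO-capable device: build one large skb holding
 * the whole UDP datagram and let the hardware segment it, recording
 * the fragment size and fragment ID in the shared info block.
 */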
inline int ip6_ufo_append_data(struct sock *sk,
                        int getfrag(void *from, char *to, int offset, int len,
                        int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int transhdrlen, int mtu, unsigned int flags)
{
        struct sk_buff *skb;
        int err;

        /* There is support for UDP large send offload by the network
         * device, so create one single skb packet containing the
         * complete udp datagram
         */
        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);
                if (skb == NULL)
                        return -ENOMEM;

                /* reserve space for Hardware header */
                skb_reserve(skb, hh_len);

                /* create space for UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize network header pointer */
                skb->nh.raw = skb->data;

                /* initialize protocol header pointer */
                skb->h.raw = skb->data + fragheaderlen;

                skb->ip_summed = CHECKSUM_HW;
                skb->csum = 0;
                sk->sk_sndmsg_off = 0;
        }

        err = skb_append_datato_frags(sk, skb, getfrag, from,
                                      (length - transhdrlen));
        if (!err) {
                struct frag_hdr fhdr;

                /* specify the length of each IP datagram fragment */
                skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) -
                                                sizeof(struct frag_hdr);
                ipv6_select_ident(skb, &fhdr);
                skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
                __skb_queue_tail(&sk->sk_write_queue, skb);

                return 0;
        }
        /* There is not enough support to do UDP LSO,
         * so follow the normal path
         */
        kfree_skb(skb);

        return err;
}

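/*
 * Append data to the socket's pending write queue, corking the
 * options, route and flow on the first call. Packets are laid out
 * along maxfraglen boundaries so that ip6_push_pending_frames()
 * can fragment them later without copying.
 */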
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
        int offset, int len, int odd, struct sk_buff *skb),
        void *from, int length, int transhdrlen,
        int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
        struct rt6_info *rt, unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;
        unsigned int maxfraglen, fragheaderlen;
        int exthdrlen;
        int hh_len;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        int csummode = CHECKSUM_NONE;

        if (flags & MSG_PROBE)
                return 0;
        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking
                 */
                if (opt) {
                        if (np->cork.opt == NULL) {
                                np->cork.opt = kmalloc(opt->tot_len,
                                                       sk->sk_allocation);
                                if (unlikely(np->cork.opt == NULL))
                                        return -ENOBUFS;
                        } else if (np->cork.opt->tot_len < opt->tot_len) {
                                printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
                                return -EINVAL;
                        }
                        memcpy(np->cork.opt, opt, opt->tot_len);
                        inet->cork.flags |= IPCORK_OPT;
                        /* need source address above miyazawa*/
                }
                dst_hold(&rt->u.dst);
                np->cork.rt = rt;
                inet->cork.fl = *fl;
                np->cork.hop_limit = hlimit;
                np->cork.tclass = tclass;
                inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
                if (dst_allfrag(rt->u.dst.path))
                        inet->cork.flags |= IPCORK_ALLFRAG;
                inet->cork.length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
                length += exthdrlen;
                transhdrlen += exthdrlen;
        } else {
                rt = np->cork.rt;
                fl = &inet->cork.fl;
                if (inet->cork.flags & IPCORK_OPT)
                        opt = np->cork.opt;
                transhdrlen = 0;
                exthdrlen = 0;
                mtu = inet->cork.fragsize;
        }

        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

        if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
                if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
                        ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);
                        return -EMSGSIZE;
                }
        }

        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */

        inet->cork.length += length;
        if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
            (rt->u.dst.dev->features & NETIF_F_UFO)) {

                if (ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
                                fragheaderlen, transhdrlen, mtu, flags))
                        goto error;

                return 0;
        }

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;

                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
alloc_new_skb:
                        skb_prev = skb;

                        /* There's no room in the current skb */
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;

                        fraglen = datalen + fragheaderlen;
                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features & NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;

                        /*
                         * The last fragment gets additional space at tail.
                         * Note: we overallocate on fragments with MSG_MORE
                         * because we have no idea if we're the last one.
                         */
                        if (datalen == length + fraggap)
                                alloclen += rt->u.dst.trailer_len;

                        /*
                         * We just reserve space for fragment header.
                         * Note: this may be overallocation if the message
                         * (without MSG_MORE) fits into the MTU.
                         */
                        alloclen += sizeof(struct frag_hdr);

                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                        }
                        if (skb == NULL)
                                goto error;
                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        /* reserve for fragmentation */
                        skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

                        /*
                         *      Find where to start putting bytes
                         */
                        data = skb_put(skb, fraglen);
                        skb->nh.raw = data + exthdrlen;
                        data += fragheaderlen;
                        skb->h.raw = data + exthdrlen;

                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                skb_trim(skb_prev, maxfraglen);
                        }
                        copy = datalen - transhdrlen - fraggap;
                        if (copy < 0) {
                                err = -EINVAL;
                                kfree_skb(skb);
                                goto error;
                        } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /*
                         * Put the packet on the pending queue
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                                offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = sk->sk_sndmsg_page;
                        int off = sk->sk_sndmsg_off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL) {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                sk->sk_sndmsg_page = page;
                                sk->sk_sndmsg_off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                                skb->truesize += PAGE_SIZE;
                                atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from, page_address(frag->page) + frag->page_offset + frag->size, offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                }
                offset += copy;
                length -= copy;
        }
        return 0;
error:
        inet->cork.length -= length;
        IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}

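/*
 * Flush the corked queue: merge the queued skbs into one packet via
 * frag_list, push the extension headers and IPv6 header, then send
 * the result through the LOCAL_OUT netfilter hook.
 */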
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;
        struct ipv6_txoptions *opt = np->cork.opt;
        struct rt6_info *rt = np->cork.rt;
        struct flowi *fl = &inet->cork.fl;
        unsigned char proto = fl->proto;
        int err = 0;

        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to ip header from ext header */
        if (skb->data < skb->nh.raw)
                __skb_pull(skb, skb->nh.raw - skb->data);
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                __sock_put(tmp_skb->sk);
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        ipv6_addr_copy(final_dst, &fl->fl6_dst);
        __skb_pull(skb, skb->h.raw - skb->nh.raw);
        if (opt && opt->opt_flen)
                ipv6_push_frag_opts(skb, opt, &proto);
        if (opt && opt->opt_nflen)
                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

        skb->nh.ipv6h = hdr = (struct ipv6hdr *) skb_push(skb, sizeof(struct ipv6hdr));

        *(u32 *)hdr = fl->fl6_flowlabel |
                     htonl(0x60000000 | ((int)np->cork.tclass << 20));

        if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
                hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
        else
                hdr->payload_len = 0;
        hdr->hop_limit = np->cork.hop_limit;
        hdr->nexthdr = proto;
        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, final_dst);

        skb->dst = dst_clone(&rt->u.dst);
        IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
        err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
        if (err) {
                if (err > 0)
                        err = np->recverr ? net_xmit_errno(err) : 0;
                if (err)
                        goto error;
        }

out:
        inet->cork.flags &= ~IPCORK_OPT;
        kfree(np->cork.opt);
        np->cork.opt = NULL;
        if (np->cork.rt) {
                dst_release(&np->cork.rt->u.dst);
                np->cork.rt = NULL;
                inet->cork.flags &= ~IPCORK_ALLFRAG;
        }
        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
        return err;
error:
        goto out;
}

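/* Drop everything queued by ip6_append_data() and uncork the socket. */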
void ip6_flush_pending_frames(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
                IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
        }

        inet->cork.flags &= ~IPCORK_OPT;

        kfree(np->cork.opt);
        np->cork.opt = NULL;
        if (np->cork.rt) {
                dst_release(&np->cork.rt->u.dst);
                np->cork.rt = NULL;
                inet->cork.flags &= ~IPCORK_ALLFRAG;
        }
        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}