ipv6: remove some useless RCU read lock
net/ipv6/ip6_output.c (uclinux-h8/linux.git)
/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

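/* Set the IPv6 payload length from skb->len and run the packet through
 * the NF_INET_LOCAL_OUT netfilter hook.  A return value of 1 from the
 * hook means "continue"; ip6_local_out() below turns that into a call
 * to dst_output().
 */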
int __ip6_local_out(struct sk_buff *skb)
{
        int len;

        len = skb->len - sizeof(struct ipv6hdr);
        if (len > IPV6_MAXPLEN)
                len = 0;
        ipv6_hdr(skb)->payload_len = htons(len);

        return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
                       skb_dst(skb)->dev, dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
        int err;

        err = __ip6_local_out(skb);
        if (likely(err == 1))
                err = dst_output(skb);

        return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

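/* Last step of the output path: loop multicast packets back to local
 * listeners when required, account outgoing multicast traffic, and
 * hand the packet to the neighbour layer for actual transmission.
 */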
static int ip6_finish_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct neighbour *neigh;
        struct rt6_info *rt;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
                    ((mroute6_socket(dev_net(dev), skb) &&
                     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
                                         &ipv6_hdr(skb)->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                         * is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        newskb, NULL, newskb->dev,
                                        dev_loopback_xmit);

                        if (ipv6_hdr(skb)->hop_limit == 0) {
                                IP6_INC_STATS(dev_net(dev), idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
                                skb->len);
        }

        rt = (struct rt6_info *) dst;
        neigh = rt->n;
        if (neigh)
                return dst_neigh_output(dst, neigh, skb);

        IP6_INC_STATS_BH(dev_net(dst->dev),
                         ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

static int ip6_finish_output(struct sk_buff *skb)
{
        if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)))
                return ip6_fragment(skb, ip6_finish_output2);
        else
                return ip6_finish_output2(skb);
}

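/* Entry point for locally generated packets after routing: drop
 * everything if IPv6 is administratively disabled on the device,
 * otherwise run the POST_ROUTING hook (unless the packet was already
 * rerouted) and finish with fragmentation or direct output.
 */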
int ip6_output(struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

        if (unlikely(idev->cnf.disable_ipv6)) {
                IP6_INC_STATS(dev_net(dev), idev,
                              IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *      xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             struct ipv6_txoptions *opt, int tclass)
{
        struct net *net = sock_net(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl6->daddr;
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr;
        u8  proto = fl6->flowi6_proto;
        int seg_len = skb->len;
        int hlimit = -1;
        u32 mtu;

        if (opt) {
                unsigned int head_room;

                /* First: exthdrs may take lots of space (~8K for now);
                 * MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        if (skb2 == NULL) {
                                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return -ENOBUFS;
                        }
                        consume_skb(skb);
                        skb = skb2;
                        skb_set_owner_w(skb, sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         *      Fill in the IPv6 header
         */
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        hdr->saddr = fl6->saddr;
        hdr->daddr = *first_hop;

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_OUT, skb->len);
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
                               dst->dev, dst_output);
        }

        net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
        skb->dev = dst->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

/*
 *      To avoid extra problems ND packets are sent through this
 *      routine. It's code duplication but I really want to avoid
 *      extra checks since ipv6_build_header is used by TCP (which
 *      is performance critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
               const struct in6_addr *saddr, const struct in6_addr *daddr,
               int proto, int len)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        skb_reset_network_header(skb);
        skb_put(skb, sizeof(struct ipv6hdr));
        hdr = ipv6_hdr(skb);

        *(__be32 *)hdr = htonl(0x60000000);

        hdr->payload_len = htons(len);
        hdr->nexthdr = proto;
        hdr->hop_limit = np->hop_limit;

        hdr->saddr = *saddr;
        hdr->daddr = *daddr;

        return 0;
}

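/* Deliver a packet carrying a Router Alert option to all raw sockets
 * registered on ip6_ra_chain whose selector matches.  Returns 1 when
 * the packet has been consumed by at least one listener.
 */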
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

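/* Classify a packet whose destination address is proxied: returns 1
 * for NDISC messages that must go to the local input path, 0 when the
 * packet may be forwarded, and -1 (after signalling link failure) for
 * link-local destinations that must be discarded.
 */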
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        __be16 frag_off;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* For reactions involving unicast neighbor discovery
                         * messages destined to the proxied address, pass them
                         * to the input function.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
        return dst_output(skb);
}

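/* Forwarding path proper: validate the packet (policy, hop limit,
 * source address class), honour Router Alert and proxy NDP, emit
 * redirects and Packet Too Big errors where appropriate, then
 * decrement the hop limit and run the NF_INET_FORWARD hook.
 */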
int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        u32 mtu;

        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        skb_forward_csum(skb);

        /*
         *      We do NOT process RA packets; we push them to user level
         *      AS IS, without any warranty that the application will be
         *      able to interpret them. The reason is that we
         *      cannot do anything clever here.
         *
         *      We are not the end node, so if the packet contains
         *      AH/ESP we cannot do anything.
         *      Defragmentation would also be a mistake: RA packets
         *      cannot be fragmented, because there is no warranty
         *      that different fragments will go along one path. --ANK
         */
        if (opt->ra) {
                u8 *ptr = skb_network_header(skb) + opt->ra;
                if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
                        return 0;
        }

        /*
         *      check and decrement hop limit
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                IP6_INC_STATS_BH(net,
                                 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0)
                        return ip6_input(skb);
                else if (proxied < 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(dst),
                                      IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb_dst(skb);

        /* IPv6 specs say nothing about it, but it is clear that we cannot
         * send redirects to source routed frames.
         * We don't send redirects to frames decapsulated from IPsec.
         */
        if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct inet_peer *peer;
                struct rt6_info *rt;

                /*
                 *      incoming and outgoing devices are the same:
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if (rt->rt6i_flags & RTF_GATEWAY)
                        target = &rt->rt6i_gateway;
                else
                        target = &hdr->daddr;

                peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);

                /* Limit redirects both by destination (here)
                 * and by source (inside ndisc_send_redirect)
                 */
                if (inet_peer_xrlim_allow(peer, 1*HZ))
                        ndisc_send_redirect(skb, target);
                if (peer)
                        inet_putpeer(peer);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        mtu = dst_mtu(dst);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
            (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                IP6_INC_STATS_BH(net,
                                 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
                IP6_INC_STATS_BH(net,
                                 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Mangling hops number delayed to point after skb COW */

        hdr->hop_limit--;

        IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

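/* Propagate to a freshly built fragment the per-packet state it must
 * share with the original skb: packet type, priority, protocol, dst,
 * device, mark, traffic-control index, netfilter and security marks.
 */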
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
        to->nf_trace = from->nf_trace;
#endif
        skb_copy_secmark(to, from);
}

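/* Walk the extension header chain and return the offset at which the
 * Fragment header has to be inserted, i.e. the length of the
 * unfragmentable part.  *nexthdr is left pointing at the nexthdr byte
 * that the caller will rewrite to NEXTHDR_FRAGMENT.
 */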
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
        u16 offset = sizeof(struct ipv6hdr);
        struct ipv6_opt_hdr *exthdr =
                                (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
        unsigned int packet_len = skb->tail - skb->network_header;
        int found_rhdr = 0;
        *nexthdr = &ipv6_hdr(skb)->nexthdr;

        while (offset + 1 <= packet_len) {

                switch (**nexthdr) {

                case NEXTHDR_HOP:
                        break;
                case NEXTHDR_ROUTING:
                        found_rhdr = 1;
                        break;
                case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
                        if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
                                break;
#endif
                        if (found_rhdr)
                                return offset;
                        break;
                default:
                        return offset;
                }

                offset += ipv6_optlen(exthdr);
                *nexthdr = &exthdr->nexthdr;
                exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
                                                 offset);
        }

        return offset;
}

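/* Pick the fragment identification value: use the inetpeer associated
 * with the route's destination when one is available, otherwise fall
 * back to a global counter that skips the value zero.
 */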
void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
{
        static atomic_t ipv6_fragmentation_id;
        int old, new;

        if (rt && !(rt->dst.flags & DST_NOPEER)) {
                struct inet_peer *peer;
                struct net *net;

                net = dev_net(rt->dst.dev);
                peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
                if (peer) {
                        fhdr->identification = htonl(inet_getid(peer, 0));
                        inet_putpeer(peer);
                        return;
                }
        }
        do {
                old = atomic_read(&ipv6_fragmentation_id);
                new = old + 1;
                if (!new)
                        new = 1;
        } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
        fhdr->identification = htonl(new);
}

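/* Split a packet that exceeds the path MTU into fragments and feed
 * each of them to @output.  The fast path reuses an existing frag
 * list when its geometry already matches; the slow path copies the
 * data into freshly allocated skbs.  When fragmentation is forbidden
 * an ICMPV6_PKT_TOOBIG error is sent instead.
 */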
int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        int hroom, troom;
        __be32 frag_id = 0;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;
        struct net *net = dev_net(skb_dst(skb)->dev);

        hlen = ip6_find_1stfragopt(skb, &prevhdr);
        nexthdr = *prevhdr;

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb is not generated by a local socket.
         */
        if (unlikely(!skb->local_df && skb->len > mtu) ||
                     (IP6CB(skb)->frag_max_size &&
                      IP6CB(skb)->frag_max_size > mtu)) {
                if (skb->sk && dst_allfrag(skb_dst(skb)))
                        sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

                skb->dev = skb_dst(skb)->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        mtu -= hlen + sizeof(struct frag_hdr);

        if (skb_has_frag_list(skb)) {
                int first_len = skb_pagelen(skb);
                struct sk_buff *frag2;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);
                /* BUILD HEADER */

                *prevhdr = NEXTHDR_FRAGMENT;
                tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_FRAGFAILS);
                        return -ENOMEM;
                }

                __skb_pull(skb, hlen);
                fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
                __skb_push(skb, hlen);
                skb_reset_network_header(skb);
                memcpy(skb_network_header(skb), tmp_hdr, hlen);

                ipv6_select_ident(fh, rt);
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                frag_id = fh->identification;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                ipv6_hdr(skb)->payload_len = htons(first_len -
                                                   sizeof(struct ipv6hdr));

                dst_hold(&rt->dst);

                for (;;) {
                        /* Prepare header of the next frame,
                         * before the previous one goes down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), tmp_hdr,
                                       hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next != NULL)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                ipv6_hdr(frag)->payload_len =
                                                htons(frag->len -
                                                      sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        dst_release(&rt->dst);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                dst_release(&rt->dst);
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
            skb_checksum_help(skb))
                goto fail;

        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        *prevhdr = NEXTHDR_FRAGMENT;
        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        troom = rt->dst.dev->needed_tailroom;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                 * then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *      Allocate buffer.
                 */

                if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
                                      hroom + troom, GFP_ATOMIC)) == NULL) {
                        NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, hroom);
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                skb_reset_network_header(frag);
                fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
                frag->transport_header = (frag->network_header + hlen +
                                          sizeof(struct frag_hdr));

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

                /*
                 *      Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                if (!frag_id) {
                        ipv6_select_ident(fh, rt);
                        frag_id = fh->identification;
                } else
                        fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
                        BUG();
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                ipv6_hdr(frag)->payload_len = htons(frag->len -
                                                    sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                err = output(frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        consume_skb(skb);
        return err;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}

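/* Helpers for validating a socket's cached route: ip6_rt_check()
 * returns true when the cached route key no longer matches the flow
 * address, and ip6_sk_dst_check() releases the cached dst when it can
 * no longer be used for the flow described by @fl6.
 */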
static inline int ip6_rt_check(const struct rt6key *rt_key,
                               const struct in6_addr *fl_addr,
                               const struct in6_addr *addr_cache)
{
        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
                (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          const struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt = (struct rt6_info *)dst;

        if (!dst)
                goto out;

        /* Yes, checking route validity in the not connected
         * case is not very simple. Take into account
         * that we do not support routing by source, TOS,
         * and MSG_DONTROUTE            --ANK (980726)
         *
         * 1. ip6_rt_check(): If route was host route,
         *    check that cached destination is current.
         *    If it is network route, we still may
         *    check its validity using saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save whole address now,
         *    (because main consumer of this service
         *    is tcp, which does not have this problem),
         *    so that the last trick works only on connected
         *    sockets.
         * 2. oif also should be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
            (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}

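/* Common tail of the dst lookup helpers below: perform the route
 * lookup if none was supplied, select a source address, and (with
 * optimistic DAD) fall back to the default router's dst entry while
 * our own source address is still tentative.
 */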
static int ip6_dst_lookup_tail(struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
        struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;

        if (*dst == NULL)
                *dst = ip6_route_output(net, sk, fl6);

        if ((err = (*dst)->error))
                goto out_err_release;

        if (ipv6_addr_any(&fl6->saddr)) {
                struct rt6_info *rt = (struct rt6_info *) *dst;
                err = ip6_route_get_saddr(net, rt, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                if (err)
                        goto out_err_release;
        }

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router.
         */
        rt = (struct rt6_info *) *dst;
        n = rt->n;
        if (n && !(n->nud_state & NUD_VALID)) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        if ((err = (*dst)->error))
                                goto out_err_release;
                }
        }
#endif

        return 0;

out_err_release:
        if (err == -ENETUNREACH)
                IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        dst_release(*dst);
        *dst = NULL;
        return err;
}

/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl6: flow to lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *      @sk: socket which provides route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *      @can_sleep: we are in a sleepable context
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns a valid dst pointer on success, or a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst,
                                      bool can_sleep)
{
        struct dst_entry *dst = NULL;
        int err;

        err = ip6_dst_lookup_tail(sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;
        if (can_sleep)
                fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

        return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *      @sk: socket which provides the dst cache and route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *      @can_sleep: we are in a sleepable context
 *
 *      This function performs a route lookup on the given flow with the
 *      possibility of using the cached route in the socket if it is valid.
 *      It will take the socket dst lock when operating on the dst cache.
 *      As a result, this function can only be used in process context.
 *
 *      It returns a valid dst pointer on success, or a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst,
                                         bool can_sleep)
{
        struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
        int err;

        dst = ip6_sk_dst_check(sk, dst, fl6);

        err = ip6_dst_lookup_tail(sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;
        if (can_sleep)
                fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

        return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

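/* Queue a datagram as a single large skb for UDP fragmentation
 * offload: the device (or the GSO layer) later emits the on-wire
 * fragments, using the identification chosen by ipv6_select_ident().
 */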
static inline int ip6_ufo_append_data(struct sock *sk,
                        int getfrag(void *from, char *to, int offset, int len,
                        int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int transhdrlen, int mtu, unsigned int flags,
                        struct rt6_info *rt)

{
        struct sk_buff *skb;
        int err;

        /* There is support for UDP large send offload by the network
         * device, so create one single skb packet containing the complete
         * UDP datagram.
         */
        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);
                if (skb == NULL)
                        return err;

                /* reserve space for Hardware header */
                skb_reserve(skb, hh_len);

                /* create space for UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize network header pointer */
                skb_reset_network_header(skb);

                /* initialize protocol header pointer */
                skb->transport_header = skb->network_header + fragheaderlen;

                skb->ip_summed = CHECKSUM_PARTIAL;
                skb->csum = 0;
        }

        err = skb_append_datato_frags(sk, skb, getfrag, from,
                                      (length - transhdrlen));
        if (!err) {
                struct frag_hdr fhdr;

                /* Specify the length of each IPv6 datagram fragment.
                 * It has to be a multiple of 8.
                 */
                skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
                                             sizeof(struct frag_hdr)) & ~7;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
                ipv6_select_ident(&fhdr, rt);
                skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
                __skb_queue_tail(&sk->sk_write_queue, skb);

                return 0;
        }
        /* There is not enough support to do UDP LSO,
         * so follow the normal path.
         */
        kfree_skb(skb);

        return err;
}

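/* Duplicate an extension header (or routing header) for the cork
 * state; hdrlen counts 8-octet units beyond the first 8 octets.
 */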
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

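/* Refresh mtu and maxfraglen while appending: outside of an XFRM
 * tunnel route, the first fragment must reserve dst->header_len,
 * while later fragments can use the full path MTU.
 */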
static void ip6_append_data_mtu(int *mtu,
                                int *maxfraglen,
                                unsigned int fragheaderlen,
                                struct sk_buff *skb,
                                struct rt6_info *rt)
{
        if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
                if (skb == NULL) {
                        /* first fragment, reserve header_len */
                        *mtu = *mtu - rt->dst.header_len;

                } else {
                        /*
                         * this fragment is not the first; the header
                         * space is regarded as data space.
                         */
                        *mtu = dst_mtu(rt->dst.path);
                }
                *maxfraglen = ((*mtu - fragheaderlen) & ~7)
                              + fragheaderlen - sizeof(struct frag_hdr);
        }
}

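/* Append data to the pending (corked) packet queue of the socket.
 * The first call sets up the cork state: it duplicates the tx
 * options, pins the route, and fixes the fragment size; subsequent
 * calls fill the tail skb and allocate new ones at maxfraglen
 * boundaries, leaving room for the fragment header.
 */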
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
        int offset, int len, int odd, struct sk_buff *skb),
        void *from, int length, int transhdrlen,
        int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
        struct rt6_info *rt, unsigned int flags, int dontfrag)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct inet_cork *cork;
        struct sk_buff *skb, *skb_prev = NULL;
        unsigned int maxfraglen, fragheaderlen;
        int exthdrlen;
        int dst_exthdrlen;
        int hh_len;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        __u8 tx_flags = 0;

        if (flags & MSG_PROBE)
                return 0;
        cork = &inet->cork.base;
        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking
                 */
                if (opt) {
                        if (WARN_ON(np->cork.opt))
                                return -EINVAL;

                        np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
                        if (unlikely(np->cork.opt == NULL))
                                return -ENOBUFS;

                        np->cork.opt->tot_len = opt->tot_len;
                        np->cork.opt->opt_flen = opt->opt_flen;
                        np->cork.opt->opt_nflen = opt->opt_nflen;

                        np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
                                                            sk->sk_allocation);
                        if (opt->dst0opt && !np->cork.opt->dst0opt)
                                return -ENOBUFS;

                        np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
                                                            sk->sk_allocation);
                        if (opt->dst1opt && !np->cork.opt->dst1opt)
                                return -ENOBUFS;

                        np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
                                                           sk->sk_allocation);
                        if (opt->hopopt && !np->cork.opt->hopopt)
                                return -ENOBUFS;

                        np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
                                                            sk->sk_allocation);
                        if (opt->srcrt && !np->cork.opt->srcrt)
                                return -ENOBUFS;

                        /* need source address above miyazawa */
                }
                dst_hold(&rt->dst);
                cork->dst = &rt->dst;
                inet->cork.fl.u.ip6 = *fl6;
                np->cork.hop_limit = hlimit;
                np->cork.tclass = tclass;
                if (rt->dst.flags & DST_XFRM_TUNNEL)
                        mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
                              rt->dst.dev->mtu : dst_mtu(&rt->dst);
                else
                        mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
                              rt->dst.dev->mtu : dst_mtu(rt->dst.path);
                if (np->frag_size < mtu) {
                        if (np->frag_size)
                                mtu = np->frag_size;
                }
                cork->fragsize = mtu;
                if (dst_allfrag(rt->dst.path))
                        cork->flags |= IPCORK_ALLFRAG;
                cork->length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
                length += exthdrlen;
                transhdrlen += exthdrlen;
                dst_exthdrlen = rt->dst.header_len;
        } else {
                rt = (struct rt6_info *)cork->dst;
                fl6 = &inet->cork.fl.u.ip6;
                opt = np->cork.opt;
                transhdrlen = 0;
                exthdrlen = 0;
                dst_exthdrlen = 0;
                mtu = cork->fragsize;
        }

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
                        (opt ? opt->opt_nflen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

        if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
                if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
                        ipv6_local_error(sk, EMSGSIZE, fl6, mtu - exthdrlen);
                        return -EMSGSIZE;
                }
        }

        /* For UDP, check if TX timestamp is enabled */
        if (sk->sk_type == SOCK_DGRAM) {
                err = sock_tx_timestamp(sk, &tx_flags);
                if (err)
                        goto error;
        }

        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */
1333
1334         cork->length += length;
1335         if (length > mtu) {
1336                 int proto = sk->sk_protocol;
1337                 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1338                         ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1339                         return -EMSGSIZE;
1340                 }
1341
1342                 if (proto == IPPROTO_UDP &&
1343                     (rt->dst.dev->features & NETIF_F_UFO)) {
1344
1345                         err = ip6_ufo_append_data(sk, getfrag, from, length,
1346                                                   hh_len, fragheaderlen,
1347                                                   transhdrlen, mtu, flags, rt);
1348                         if (err)
1349                                 goto error;
1350                         return 0;
1351                 }
1352         }
1353
1354         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1355                 goto alloc_new_skb;
1356
1357         while (length > 0) {
1358                 /* Check if the remaining data fits into current packet. */
1359                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1360                 if (copy < length)
1361                         copy = maxfraglen - skb->len;
1362
1363                 if (copy <= 0) {
1364                         char *data;
1365                         unsigned int datalen;
1366                         unsigned int fraglen;
1367                         unsigned int fraggap;
1368                         unsigned int alloclen;
1369 alloc_new_skb:
1370                         /* There's no room in the current skb */
1371                         if (skb)
1372                                 fraggap = skb->len - maxfraglen;
1373                         else
1374                                 fraggap = 0;
1375                         /* update mtu and maxfraglen if necessary */
1376                         if (skb == NULL || skb_prev == NULL)
1377                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1378                                                     fragheaderlen, skb, rt);
1379
1380                         skb_prev = skb;
1381
1382                         /*
1383                          * If remaining data exceeds the mtu,
1384                          * we know we need more fragment(s).
1385                          */
1386                         datalen = length + fraggap;
1387
1388                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1389                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1390                         if ((flags & MSG_MORE) &&
1391                             !(rt->dst.dev->features&NETIF_F_SG))
1392                                 alloclen = mtu;
1393                         else
1394                                 alloclen = datalen + fragheaderlen;
1395
1396                         alloclen += dst_exthdrlen;
1397
1398                         if (datalen != length + fraggap) {
1399                                 /*
1400                                  * this is not the last fragment, the trailer
1401                                  * space is regarded as data space.
1402                                  */
1403                                 datalen += rt->dst.trailer_len;
1404                         }
1405
1406                         alloclen += rt->dst.trailer_len;
1407                         fraglen = datalen + fragheaderlen;
1408
1409                         /*
1410                          * We just reserve space for fragment header.
1411                          * Note: this may be overallocation if the message
1412                          * (without MSG_MORE) fits into the MTU.
1413                          */
1414                         alloclen += sizeof(struct frag_hdr);
1415
1416                         if (transhdrlen) {
1417                                 skb = sock_alloc_send_skb(sk,
1418                                                 alloclen + hh_len,
1419                                                 (flags & MSG_DONTWAIT), &err);
1420                         } else {
1421                                 skb = NULL;
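                                     /* Follow-up fragment, no transport
                                      * header: charge it to the socket, but
                                      * allow up to 2 * sk_sndbuf so that a
                                      * datagram started near the sndbuf
                                      * limit can still be completed. */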
1422                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1423                                     2 * sk->sk_sndbuf)
1424                                         skb = sock_wmalloc(sk,
1425                                                            alloclen + hh_len, 1,
1426                                                            sk->sk_allocation);
1427                                 if (unlikely(skb == NULL))
1428                                         err = -ENOBUFS;
1429                                 else {
1430                                         /* Only the initial fragment
1431                                          * is time stamped.
1432                                          */
1433                                         tx_flags = 0;
1434                                 }
1435                         }
1436                         if (skb == NULL)
1437                                 goto error;
1438                         /*
1439                          *      Fill in the control structures
1440                          */
1441                         skb->ip_summed = CHECKSUM_NONE;
1442                         skb->csum = 0;
1443                         /* reserve for fragmentation and ipsec header */
1444                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1445                                     dst_exthdrlen);
1446
1447                         if (sk->sk_type == SOCK_DGRAM)
1448                                 skb_shinfo(skb)->tx_flags = tx_flags;
1449
1450                         /*
1451                          *      Find where to start putting bytes
1452                          */
1453                         data = skb_put(skb, fraglen);
1454                         skb_set_network_header(skb, exthdrlen);
1455                         data += fragheaderlen;
1456                         skb->transport_header = (skb->network_header +
1457                                                  fragheaderlen);
1458                         if (fraggap) {
1459                                 skb->csum = skb_copy_and_csum_bits(
1460                                         skb_prev, maxfraglen,
1461                                         data + transhdrlen, fraggap, 0);
1462                                 skb_prev->csum = csum_sub(skb_prev->csum,
1463                                                           skb->csum);
1464                                 data += fraggap;
1465                                 pskb_trim_unique(skb_prev, maxfraglen);
1466                         }
1467                         copy = datalen - transhdrlen - fraggap;
1468
1469                         if (copy < 0) {
1470                                 err = -EINVAL;
1471                                 kfree_skb(skb);
1472                                 goto error;
1473                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1474                                 err = -EFAULT;
1475                                 kfree_skb(skb);
1476                                 goto error;
1477                         }
1478
1479                         offset += copy;
1480                         length -= datalen - fraggap;
1481                         transhdrlen = 0;
1482                         exthdrlen = 0;
1483                         dst_exthdrlen = 0;
1484
1485                         /*
1486                          * Put the packet on the pending queue
1487                          */
1488                         __skb_queue_tail(&sk->sk_write_queue, skb);
1489                         continue;
1490                 }
1491
1492                 if (copy > length)
1493                         copy = length;
1494
1495                 if (!(rt->dst.dev->features & NETIF_F_SG)) {
1496                         unsigned int off;
1497
1498                         off = skb->len;
1499                         if (getfrag(from, skb_put(skb, copy),
1500                                                 offset, copy, off, skb) < 0) {
1501                                 __skb_trim(skb, off);
1502                                 err = -EFAULT;
1503                                 goto error;
1504                         }
1505                 } else {
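                             /* Scatter-gather device: append into page
                              * fragments, reusing the socket's cached,
                              * partially filled page (sk_sndmsg_page)
                              * before allocating a fresh one. */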
1506                         int i = skb_shinfo(skb)->nr_frags;
1507                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1508                         struct page *page = sk->sk_sndmsg_page;
1509                         int off = sk->sk_sndmsg_off;
1510                         unsigned int left;
1511
1512                         if (page && (left = PAGE_SIZE - off) > 0) {
1513                                 if (copy >= left)
1514                                         copy = left;
1515                                 if (page != skb_frag_page(frag)) {
1516                                         if (i == MAX_SKB_FRAGS) {
1517                                                 err = -EMSGSIZE;
1518                                                 goto error;
1519                                         }
1520                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1521                                         skb_frag_ref(skb, i);
1522                                         frag = &skb_shinfo(skb)->frags[i];
1523                                 }
1524                         } else if (i < MAX_SKB_FRAGS) {
1525                                 if (copy > PAGE_SIZE)
1526                                         copy = PAGE_SIZE;
1527                                 page = alloc_pages(sk->sk_allocation, 0);
1528                                 if (page == NULL) {
1529                                         err = -ENOMEM;
1530                                         goto error;
1531                                 }
1532                                 sk->sk_sndmsg_page = page;
1533                                 sk->sk_sndmsg_off = 0;
1534
1535                                 skb_fill_page_desc(skb, i, page, 0, 0);
1536                                 frag = &skb_shinfo(skb)->frags[i];
1537                         } else {
1538                                 err = -EMSGSIZE;
1539                                 goto error;
1540                         }
1541                         if (getfrag(from,
1542                                     skb_frag_address(frag) + skb_frag_size(frag),
1543                                     offset, copy, skb->len, skb) < 0) {
1544                                 err = -EFAULT;
1545                                 goto error;
1546                         }
1547                         sk->sk_sndmsg_off += copy;
1548                         skb_frag_size_add(frag, copy);
1549                         skb->len += copy;
1550                         skb->data_len += copy;
1551                         skb->truesize += copy;
1552                         atomic_add(copy, &sk->sk_wmem_alloc);
1553                 }
1554                 offset += copy;
1555                 length -= copy;
1556         }
1557         return 0;
1558 error:
1559         cork->length -= length;
1560         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1561         return err;
1562 }
1563 EXPORT_SYMBOL_GPL(ip6_append_data);
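     /*
      * Illustrative sketch, not from the kernel tree: roughly how a
      * datagram protocol drives this API, modeled on the rawv6_sendmsg()
      * pattern of this kernel era.  example_sendmsg is a hypothetical
      * name, ip_generic_getfrag() comes from <net/ip.h>, and route
      * lookup, cork locking and most error handling are elided.
      */
     static int example_sendmsg(struct sock *sk, struct msghdr *msg,
                                size_t len, struct flowi6 *fl6,
                                struct rt6_info *rt,
                                struct ipv6_txoptions *opt,
                                int hlimit, int tclass, int dontfrag)
     {
             int err;

             /* Queue the payload; ip_generic_getfrag() copies from the
              * iovec and accumulates a partial checksum as it copies. */
             err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov,
                                   len, 0, hlimit, tclass, opt, fl6, rt,
                                   msg->msg_flags, dontfrag);
             if (err)
                     /* Drop whatever was queued; nothing may leak. */
                     ip6_flush_pending_frames(sk);
             else if (!(msg->msg_flags & MSG_MORE))
                     /* Build the IPv6 header and hand off for output. */
                     err = ip6_push_pending_frames(sk);
             return err;
     }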
1564
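     /*
      * Release everything the cork holds.  Each extension header in
      * np->cork.opt was duplicated into its own allocation when the cork
      * was set up earlier in ip6_append_data(), so every sub-option must
      * be freed individually before the containing structure.
      */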
1565 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1566 {
1567         if (np->cork.opt) {
1568                 kfree(np->cork.opt->dst0opt);
1569                 kfree(np->cork.opt->dst1opt);
1570                 kfree(np->cork.opt->hopopt);
1571                 kfree(np->cork.opt->srcrt);
1572                 kfree(np->cork.opt);
1573                 np->cork.opt = NULL;
1574         }
1575
1576         if (inet->cork.base.dst) {
1577                 dst_release(inet->cork.base.dst);
1578                 inet->cork.base.dst = NULL;
1579                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1580         }
1581         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1582 }
1583
1584 int ip6_push_pending_frames(struct sock *sk)
1585 {
1586         struct sk_buff *skb, *tmp_skb;
1587         struct sk_buff **tail_skb;
1588         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1589         struct inet_sock *inet = inet_sk(sk);
1590         struct ipv6_pinfo *np = inet6_sk(sk);
1591         struct net *net = sock_net(sk);
1592         struct ipv6hdr *hdr;
1593         struct ipv6_txoptions *opt = np->cork.opt;
1594         struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1595         struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1596         unsigned char proto = fl6->flowi6_proto;
1597         int err = 0;
1598
1599         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1600                 goto out;
1601         tail_skb = &(skb_shinfo(skb)->frag_list);
1602
1603         /* Move skb->data from the reserved ext-header space up to the IP header. */
1604         if (skb->data < skb_network_header(skb))
1605                 __skb_pull(skb, skb_network_offset(skb));
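             /* Splice the remaining queued skbs onto the head skb's
              * frag_list, summing len/data_len/truesize so the head skb
              * describes the whole datagram; ip6_fragment() will split
              * it again on output if it exceeds the path MTU. */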
1606         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1607                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1608                 *tail_skb = tmp_skb;
1609                 tail_skb = &(tmp_skb->next);
1610                 skb->len += tmp_skb->len;
1611                 skb->data_len += tmp_skb->len;
1612                 skb->truesize += tmp_skb->truesize;
1613                 tmp_skb->destructor = NULL;
1614                 tmp_skb->sk = NULL;
1615         }
1616
1617         /* Allow local fragmentation. */
1618         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1619                 skb->local_df = 1;
1620
1621         *final_dst = fl6->daddr;
1622         __skb_pull(skb, skb_network_header_len(skb));
1623         if (opt && opt->opt_flen)
1624                 ipv6_push_frag_opts(skb, opt, &proto);
1625         if (opt && opt->opt_nflen)
1626                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1627
1628         skb_push(skb, sizeof(struct ipv6hdr));
1629         skb_reset_network_header(skb);
1630         hdr = ipv6_hdr(skb);
1631
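             /* First 32 bits of the IPv6 header: version 6 in the top
              * nibble, the traffic class in bits 27-20 and the 20-bit
              * flow label below that.  fl6->flowlabel is already in
              * network byte order, hence htonl() only on the
              * version/tclass half. */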
1632         *(__be32 *)hdr = fl6->flowlabel |
1633                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1634
1635         hdr->hop_limit = np->cork.hop_limit;
1636         hdr->nexthdr = proto;
1637         hdr->saddr = fl6->saddr;
1638         hdr->daddr = *final_dst;
1639
1640         skb->priority = sk->sk_priority;
1641         skb->mark = sk->sk_mark;
1642
1643         skb_dst_set(skb, dst_clone(&rt->dst));
1644         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1645         if (proto == IPPROTO_ICMPV6) {
1646                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1647
1648                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1649                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1650         }
1651
1652         err = ip6_local_out(skb);
1653         if (err) {
1654                 if (err > 0)
1655                         err = net_xmit_errno(err);
1656                 if (err)
1657                         goto error;
1658         }
1659
1660 out:
1661         ip6_cork_release(inet, np);
1662         return err;
1663 error:
1664         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1665         goto out;
1666 }
1667 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
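     /*
      * Illustrative sketch, not from the kernel tree: a protocol that
      * reserved transhdrlen bytes through ip6_append_data() fills in its
      * transport header just before pushing, as
      * udp_v6_push_pending_frames() does.  example_push and its ulen
      * parameter are hypothetical, <linux/udp.h> is assumed, and
      * checksum generation is elided.
      */
     static int example_push(struct sock *sk, __be16 sport, __be16 dport,
                             int ulen)
     {
             struct sk_buff *skb = skb_peek(&sk->sk_write_queue);
             struct udphdr *uh;

             if (skb == NULL)
                     return 0;

             /* The reserved header space sits at the transport offset of
              * the first queued skb. */
             uh = udp_hdr(skb);
             uh->source = sport;
             uh->dest = dport;
             uh->len = htons(ulen);
             uh->check = 0;          /* real protocols checksum here */

             return ip6_push_pending_frames(sk);
     }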
1668
1669 void ip6_flush_pending_frames(struct sock *sk)
1670 {
1671         struct sk_buff *skb;
1672
1673         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1674                 if (skb_dst(skb))
1675                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1676                                       IPSTATS_MIB_OUTDISCARDS);
1677                 kfree_skb(skb);
1678         }
1679
1680         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1681 }
1682 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
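     /*
      * Taken together, these exports form the corked-send API:
      * ip6_append_data() queues payload on sk_write_queue (growing or
      * fragmenting skbs as it goes), ip6_push_pending_frames() prepends
      * the IPv6 header and transmits through ip6_local_out(), and
      * ip6_flush_pending_frames() discards the queue on error so neither
      * skbs nor cork state are leaked.
      */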