package/kernel/shortcut-fe/src/sfe_cm.c
/*
 * sfe-cm.c
 *	Shortcut forwarding engine connection manager.
 *
 * Copyright (c) 2013-2018 The Linux Foundation. All rights reserved.
 * Permission to use, copy, modify, and/or distribute this software for
 * any purpose with or without fee is hereby granted, provided that the
 * above copyright notice and this permission notice appear in all copies.
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <linux/module.h>
#include <linux/sysfs.h>
#include <linux/skbuff.h>
#include <net/route.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/dsfield.h>
#include <linux/inetdevice.h>
#include <linux/netfilter_bridge.h>
#include <linux/netfilter_ipv6.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <linux/netfilter/xt_dscp.h>
#include <linux/if_bridge.h>
#include <linux/version.h>

#include "sfe.h"
#include "sfe_cm.h"
#include "sfe_backport.h"

typedef enum sfe_cm_exception {
        SFE_CM_EXCEPTION_PACKET_BROADCAST,
        SFE_CM_EXCEPTION_PACKET_MULTICAST,
        SFE_CM_EXCEPTION_NO_IIF,
        SFE_CM_EXCEPTION_NO_CT,
        SFE_CM_EXCEPTION_CT_NO_TRACK,
        SFE_CM_EXCEPTION_CT_NO_CONFIRM,
        SFE_CM_EXCEPTION_CT_IS_ALG,
        SFE_CM_EXCEPTION_IS_IPV4_MCAST,
        SFE_CM_EXCEPTION_IS_IPV6_MCAST,
        SFE_CM_EXCEPTION_TCP_NOT_ASSURED,
        SFE_CM_EXCEPTION_TCP_NOT_ESTABLISHED,
        SFE_CM_EXCEPTION_UNKNOW_PROTOCOL,
        SFE_CM_EXCEPTION_NO_SRC_DEV,
        SFE_CM_EXCEPTION_NO_SRC_XLATE_DEV,
        SFE_CM_EXCEPTION_NO_DEST_DEV,
        SFE_CM_EXCEPTION_NO_DEST_XLATE_DEV,
        SFE_CM_EXCEPTION_NO_BRIDGE,
        SFE_CM_EXCEPTION_LOCAL_OUT,
        SFE_CM_EXCEPTION_MAX
} sfe_cm_exception_t;

static char *sfe_cm_exception_events_string[SFE_CM_EXCEPTION_MAX] = {
        "PACKET_BROADCAST",
        "PACKET_MULTICAST",
        "NO_IIF",
        "NO_CT",
        "CT_NO_TRACK",
        "CT_NO_CONFIRM",
        "CT_IS_ALG",
        "IS_IPV4_MCAST",
        "IS_IPV6_MCAST",
        "TCP_NOT_ASSURED",
        "TCP_NOT_ESTABLISHED",
        "UNKNOW_PROTOCOL",
        "NO_SRC_DEV",
        "NO_SRC_XLATE_DEV",
        "NO_DEST_DEV",
        "NO_DEST_XLATE_DEV",
        "NO_BRIDGE",
        "LOCAL_OUT"
};
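
/*
 * The strings above give human-readable names for sfe_cm_exception_t values;
 * they are printed by the "exceptions" sysfs attribute registered in
 * sfe_cm_init().
 */
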
/*
 * Per-module structure.
 */
struct sfe_cm {
        spinlock_t lock;                /* Lock for SMP correctness */

        /*
         * Control state.
         */
        struct kobject *sys_sfe_cm;     /* sysfs linkage */

        /*
         * Callback notifiers.
         */
        struct notifier_block dev_notifier;     /* Device notifier */
        struct notifier_block inet_notifier;    /* IPv4 notifier */
        struct notifier_block inet6_notifier;   /* IPv6 notifier */
        u32 exceptions[SFE_CM_EXCEPTION_MAX];
};

static struct sfe_cm __sc;

/*
 * sfe_cm_incr_exceptions()
 *      increase an exception counter.
 */
static inline void sfe_cm_incr_exceptions(sfe_cm_exception_t except)
{
        struct sfe_cm *sc = &__sc;

        spin_lock_bh(&sc->lock);
        sc->exceptions[except]++;
        spin_unlock_bh(&sc->lock);
}

/*
 * sfe_cm_recv()
 *      Handle packet receives.
 *
 * Returns 1 if the packet is forwarded or 0 if it isn't.
 */
int sfe_cm_recv(struct sk_buff *skb)
{
        struct net_device *dev;

        /*
         * We know that for the vast majority of packets we need the transport
         * layer header so we may as well start to fetch it now!
         */
        prefetch(skb->data + 32);
        barrier();

        dev = skb->dev;

        /*
         * We're only interested in IPv4 and IPv6 packets.
         */
        if (likely(htons(ETH_P_IP) == skb->protocol)) {
                struct in_device *in_dev;

                /*
                 * Does our input device support IP processing?
                 */
                in_dev = (struct in_device *)dev->ip_ptr;
                if (unlikely(!in_dev)) {
                        DEBUG_TRACE("no IP processing for device: %s\n", dev->name);
                        return 0;
                }

                /*
                 * Does it have an IP address?  If it doesn't then we can't do anything
                 * interesting here!
                 */
                if (unlikely(!in_dev->ifa_list)) {
                        DEBUG_TRACE("no IP address for device: %s\n", dev->name);
                        return 0;
                }

                return sfe_ipv4_recv(dev, skb);
        }

        if (likely(htons(ETH_P_IPV6) == skb->protocol)) {
                struct inet6_dev *in_dev;

                /*
                 * Does our input device support IPv6 processing?
                 */
                in_dev = (struct inet6_dev *)dev->ip6_ptr;
                if (unlikely(!in_dev)) {
                        DEBUG_TRACE("no IPv6 processing for device: %s\n", dev->name);
                        return 0;
                }

                /*
                 * Does it have an IPv6 address?  If it doesn't then we can't do anything
                 * interesting here!
                 */
                if (unlikely(list_empty(&in_dev->addr_list))) {
                        DEBUG_TRACE("no IPv6 address for device: %s\n", dev->name);
                        return 0;
                }

                return sfe_ipv6_recv(dev, skb);
        }

        DEBUG_TRACE("not IP packet\n");
        return 0;
}

/*
 * sfe_cm_find_dev_and_mac_addr()
 *      Find the device and MAC address for a given IPv4/IPv6 address.
 *
 * Returns true if we find the device and MAC address, otherwise false.
 *
 * We look up the rtable entry for the address and, from its neighbour
 * structure, obtain the hardware address.  This means this function also
 * works if the neighbours are routers too.
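 *
 * On success the returned net_device has been dev_hold()'d, so the caller
 * is responsible for the matching dev_put().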
 */
static bool sfe_cm_find_dev_and_mac_addr(sfe_ip_addr_t *addr, struct net_device **dev, u8 *mac_addr, int is_v4)
{
        struct neighbour *neigh;
        struct rtable *rt;
        struct rt6_info *rt6;
        struct dst_entry *dst;
        struct net_device *mac_dev;

        /*
         * Look up the rtable entry for the IP address then get the hardware
         * address from its neighbour structure.  This means this works when the
         * neighbours are routers too.
         */
        if (likely(is_v4)) {
                rt = ip_route_output(&init_net, addr->ip, 0, 0, 0);
                if (unlikely(IS_ERR(rt))) {
                        goto ret_fail;
                }

                dst = (struct dst_entry *)rt;
        } else {
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0))
                rt6 = rt6_lookup(&init_net, (struct in6_addr *)addr->ip6, 0, 0, NULL, 0);
#else
                rt6 = rt6_lookup(&init_net, (struct in6_addr *)addr->ip6, 0, 0, 0);
#endif /*KERNEL_VERSION(4, 17, 0)*/
                if (!rt6) {
                        goto ret_fail;
                }

                dst = (struct dst_entry *)rt6;
        }

        rcu_read_lock();
        neigh = sfe_dst_get_neighbour(dst, addr);
        if (unlikely(!neigh)) {
                rcu_read_unlock();
                dst_release(dst);
                goto ret_fail;
        }

        if (unlikely(!(neigh->nud_state & NUD_VALID))) {
                rcu_read_unlock();
                neigh_release(neigh);
                dst_release(dst);
                goto ret_fail;
        }

        mac_dev = neigh->dev;
        if (!mac_dev) {
                rcu_read_unlock();
                neigh_release(neigh);
                dst_release(dst);
                goto ret_fail;
        }

        memcpy(mac_addr, neigh->ha, (size_t)mac_dev->addr_len);

        dev_hold(mac_dev);
        *dev = mac_dev;
        rcu_read_unlock();
        neigh_release(neigh);
        dst_release(dst);

        return true;

ret_fail:
        if (is_v4) {
                DEBUG_TRACE("failed to find MAC address for IP: %pI4\n", &addr->ip);

        } else {
                DEBUG_TRACE("failed to find MAC address for IP: %pI6\n", addr->ip6);
        }

        return false;
}

/*
 * sfe_cm_post_routing()
 *      Called for packets about to leave the box - either locally generated or forwarded from another interface
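 *
 * The hook inspects each flow once conntrack has confirmed it, gathers the
 * addressing, port, DSCP and interface details, and then asks the SFE
 * datapath to create an acceleration rule for the connection.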
 */
static unsigned int sfe_cm_post_routing(struct sk_buff *skb, int is_v4)
{
        struct sfe_connection_create sic;
        struct net_device *in;
        struct nf_conn *ct;
        enum ip_conntrack_info ctinfo;
        struct net_device *dev;
        struct net_device *src_dev;
        struct net_device *dest_dev;
        struct net_device *src_dev_tmp;
        struct net_device *dest_dev_tmp;
        struct net_device *src_br_dev = NULL;
        struct net_device *dest_br_dev = NULL;
        struct nf_conntrack_tuple orig_tuple;
        struct nf_conntrack_tuple reply_tuple;
        SFE_NF_CONN_ACCT(acct);

        /*
         * Don't process broadcast or multicast packets.
         */
        if (unlikely(skb->pkt_type == PACKET_BROADCAST)) {
                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_PACKET_BROADCAST);
                DEBUG_TRACE("broadcast, ignoring\n");
                return NF_ACCEPT;
        }
        if (unlikely(skb->pkt_type == PACKET_MULTICAST)) {
                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_PACKET_MULTICAST);
                DEBUG_TRACE("multicast, ignoring\n");
                return NF_ACCEPT;
        }

#ifdef CONFIG_XFRM
        /*
         * Packet headed to xfrm for encapsulation; we can't process it.
         */
        if (unlikely(skb_dst(skb)->xfrm)) {
                DEBUG_TRACE("packet to xfrm, ignoring\n");
                return NF_ACCEPT;
        }
#endif

        /*
         * Don't process locally generated packets.
         */
        if (skb->sk) {
                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_LOCAL_OUT);
                DEBUG_TRACE("skip local out packet\n");
                return NF_ACCEPT;
        }

        /*
         * Don't process packets that are not being forwarded.
         */
        in = dev_get_by_index(&init_net, skb->skb_iif);
        if (!in) {
                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_IIF);
                DEBUG_TRACE("packet not forwarding\n");
                return NF_ACCEPT;
        }

        dev_put(in);

        /*
         * Don't process packets that aren't being tracked by conntrack.
         */
        ct = nf_ct_get(skb, &ctinfo);
        if (unlikely(!ct)) {
                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_CT);
                DEBUG_TRACE("no conntrack connection, ignoring\n");
                return NF_ACCEPT;
        }

#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0))
        /*
         * Don't process untracked connections.
         */
        if (unlikely(nf_ct_is_untracked(ct))) {
                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_NO_TRACK);
                DEBUG_TRACE("untracked connection\n");
                return NF_ACCEPT;
        }
#endif /*KERNEL_VERSION(4, 12, 0)*/

        /*
         * Unconfirmed connections may be dropped by Linux at the final step,
         * so we don't process unconfirmed connections.
         */
        if (!nf_ct_is_confirmed(ct)) {
                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_NO_CONFIRM);
                DEBUG_TRACE("unconfirmed connection\n");
                return NF_ACCEPT;
        }

        /*
         * Don't process connections that require support from a 'helper' (typically a NAT ALG).
         */
        if (unlikely(nfct_help(ct))) {
                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_IS_ALG);
                DEBUG_TRACE("connection has helper\n");
                return NF_ACCEPT;
        }

        /*
         * Check if the acceleration of a flow could be rejected quickly.
         */
        acct = nf_conn_acct_find(ct);
        if (acct) {
                long long packets = atomic64_read(&SFE_ACCT_COUNTER(acct)[CTINFO2DIR(ctinfo)].packets);
                if ((packets > 0xff) && (packets & 0xff)) {
                        /*
                         * The connection has hit the slow path at least 256 times, so it is
                         * unlikely that it can be accelerated.  We still give it a chance to
                         * walk through ECM every 256 packets.
                         */
                        return NF_ACCEPT;
                }
        }

        /*
         * Look up the details of our connection in conntrack.
         *
         * Note that the data we get from conntrack is for the "ORIGINAL" direction
         * but our packet may actually be in the "REPLY" direction.
         */
        orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
        reply_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
        sic.protocol = (s32)orig_tuple.dst.protonum;

        sic.flags = 0;

        /*
         * Get addressing information, non-NAT first
         */
        if (likely(is_v4)) {
                u32 dscp;

                sic.src_ip.ip = (__be32)orig_tuple.src.u3.ip;
                sic.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip;

                if (ipv4_is_multicast(sic.src_ip.ip) || ipv4_is_multicast(sic.dest_ip.ip)) {
                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_IS_IPV4_MCAST);
                        DEBUG_TRACE("multicast address\n");
                        return NF_ACCEPT;
                }

                /*
                 * NAT'ed addresses - note these are as seen from the 'reply' direction
                 * When NAT does not apply to this connection these will be identical to the above.
                 */
                sic.src_ip_xlate.ip = (__be32)reply_tuple.dst.u3.ip;
                sic.dest_ip_xlate.ip = (__be32)reply_tuple.src.u3.ip;

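                /*
                 * The DS field carries DSCP in its upper six bits and ECN in the
                 * lower two; shifting right by XT_DSCP_SHIFT leaves the bare DSCP.
                 */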
                dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT;
                if (dscp) {
                        sic.dest_dscp = dscp;
                        sic.src_dscp = sic.dest_dscp;
                        sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP;
                }
        } else {
                u32 dscp;

                sic.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6);
                sic.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6);

                if (ipv6_addr_is_multicast((struct in6_addr *)sic.src_ip.ip6) ||
                    ipv6_addr_is_multicast((struct in6_addr *)sic.dest_ip.ip6)) {
                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_IS_IPV6_MCAST);
                        DEBUG_TRACE("multicast address\n");
                        return NF_ACCEPT;
                }

                /*
                 * NAT'ed addresses - note these are as seen from the 'reply' direction
                 * When NAT does not apply to this connection these will be identical to the above.
                 */
                sic.src_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.dst.u3.in6);
                sic.dest_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.src.u3.in6);

                dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT;
                if (dscp) {
                        sic.dest_dscp = dscp;
                        sic.src_dscp = sic.dest_dscp;
                        sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP;
                }
        }

        switch (sic.protocol) {
        case IPPROTO_TCP:
                sic.src_port = orig_tuple.src.u.tcp.port;
                sic.dest_port = orig_tuple.dst.u.tcp.port;
                sic.src_port_xlate = reply_tuple.dst.u.tcp.port;
                sic.dest_port_xlate = reply_tuple.src.u.tcp.port;
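                /*
                 * Snapshot conntrack's TCP window tracking state for both directions.
                 * The SFE datapath is expected to use these to run its own sequence
                 * space checks unless SFE_CREATE_FLAG_NO_SEQ_CHECK is set below.
                 */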
                sic.src_td_window_scale = ct->proto.tcp.seen[0].td_scale;
                sic.src_td_max_window = ct->proto.tcp.seen[0].td_maxwin;
                sic.src_td_end = ct->proto.tcp.seen[0].td_end;
                sic.src_td_max_end = ct->proto.tcp.seen[0].td_maxend;
                sic.dest_td_window_scale = ct->proto.tcp.seen[1].td_scale;
                sic.dest_td_max_window = ct->proto.tcp.seen[1].td_maxwin;
                sic.dest_td_end = ct->proto.tcp.seen[1].td_end;
                sic.dest_td_max_end = ct->proto.tcp.seen[1].td_maxend;

                if (nf_ct_tcp_no_window_check
                    || (ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_BE_LIBERAL)
                    || (ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_BE_LIBERAL)) {
                        sic.flags |= SFE_CREATE_FLAG_NO_SEQ_CHECK;
                }

                /*
                 * Don't try to manage a non-established connection.
                 */
                if (!test_bit(IPS_ASSURED_BIT, &ct->status)) {
                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_TCP_NOT_ASSURED);
                        DEBUG_TRACE("non-established connection\n");
                        return NF_ACCEPT;
                }

                /*
                 * If the connection is shutting down, do not manage it.
                 * The state cannot be SYN_SENT or SYN_RECV because the connection is assured.
                 * Not managed states: FIN_WAIT, CLOSE_WAIT, LAST_ACK, TIME_WAIT, CLOSE.
                 */
                spin_lock_bh(&ct->lock);
                if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) {
                        spin_unlock_bh(&ct->lock);
                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_TCP_NOT_ESTABLISHED);
                        DEBUG_TRACE("connection in termination state: %#x, s: %pI4:%u, d: %pI4:%u\n",
                                    ct->proto.tcp.state, &sic.src_ip, ntohs(sic.src_port),
                                    &sic.dest_ip, ntohs(sic.dest_port));
                        return NF_ACCEPT;
                }
                spin_unlock_bh(&ct->lock);
                break;

        case IPPROTO_UDP:
                sic.src_port = orig_tuple.src.u.udp.port;
                sic.dest_port = orig_tuple.dst.u.udp.port;
                sic.src_port_xlate = reply_tuple.dst.u.udp.port;
                sic.dest_port_xlate = reply_tuple.src.u.udp.port;
                break;

        default:
                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_UNKNOW_PROTOCOL);
                DEBUG_TRACE("unhandled protocol %d\n", sic.protocol);
                return NF_ACCEPT;
        }

#ifdef CONFIG_XFRM
        sic.original_accel = 1;
        sic.reply_accel = 1;

        /*
         * Packets decapsulated from xfrm can still be accelerated, but only in
         * the direction in which the packet was just received.
         */
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0))
        if (unlikely(skb_ext_exist(skb, SKB_EXT_SEC_PATH))) {
#else
        if (unlikely(skb->sp)) {
#endif
                if (sic.protocol == IPPROTO_TCP &&
                    !(sic.flags & SFE_CREATE_FLAG_NO_SEQ_CHECK)) {
                        return NF_ACCEPT;
                }

                if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
                        sic.reply_accel = 0;
                } else {
                        sic.original_accel = 0;
                }
        }
#endif

        /*
         * Get QoS information
         */
        if (skb->priority) {
                sic.dest_priority = skb->priority;
                sic.src_priority = sic.dest_priority;
                sic.flags |= SFE_CREATE_FLAG_REMARK_PRIORITY;
        }

        /*
         * Get the net device and MAC addresses that correspond to the various source and
         * destination host addresses.
         */
        if (!sfe_cm_find_dev_and_mac_addr(&sic.src_ip, &src_dev_tmp, sic.src_mac, is_v4)) {
                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_SRC_DEV);
                return NF_ACCEPT;
        }
        src_dev = src_dev_tmp;

        if (!sfe_cm_find_dev_and_mac_addr(&sic.src_ip_xlate, &dev, sic.src_mac_xlate, is_v4)) {
                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_SRC_XLATE_DEV);
                goto done1;
        }
        dev_put(dev);

        if (!sfe_cm_find_dev_and_mac_addr(&sic.dest_ip, &dev, sic.dest_mac, is_v4)) {
                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_DEST_DEV);
                goto done1;
        }
        dev_put(dev);

        if (!sfe_cm_find_dev_and_mac_addr(&sic.dest_ip_xlate, &dest_dev_tmp, sic.dest_mac_xlate, is_v4)) {
                sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_DEST_XLATE_DEV);
                goto done1;
        }
        dest_dev = dest_dev_tmp;

        /*
         * Our devices may actually be part of a bridge interface.  If that's
         * the case then find the bridge interface instead.
         */
        if (src_dev->priv_flags & IFF_BRIDGE_PORT) {
                src_br_dev = sfe_dev_get_master(src_dev);
                if (!src_br_dev) {
                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE);
                        DEBUG_TRACE("no bridge found for: %s\n", src_dev->name);
                        goto done2;
                }
                src_dev = src_br_dev;
        }

        if (dest_dev->priv_flags & IFF_BRIDGE_PORT) {
                dest_br_dev = sfe_dev_get_master(dest_dev);
                if (!dest_br_dev) {
                        sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE);
                        DEBUG_TRACE("no bridge found for: %s\n", dest_dev->name);
                        goto done3;
                }
                dest_dev = dest_br_dev;
        }

        sic.src_dev = src_dev;
        sic.dest_dev = dest_dev;

        sic.src_mtu = src_dev->mtu;
        sic.dest_mtu = dest_dev->mtu;

        if (likely(is_v4)) {
                sfe_ipv4_create_rule(&sic);
        } else {
                sfe_ipv6_create_rule(&sic);
        }

        /*
         * If we had bridge ports then release them too.
         */
        if (dest_br_dev) {
                dev_put(dest_br_dev);
        }
done3:
        if (src_br_dev) {
                dev_put(src_br_dev);
        }
done2:
        dev_put(dest_dev_tmp);
done1:
        dev_put(src_dev_tmp);

        return NF_ACCEPT;
}

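/*
 * The two "functions" below are generated by helper macros, presumably
 * provided by sfe_backport.h, which expand to static hook functions named
 * __sfe_cm_ipv4_post_routing_hook and __sfe_cm_ipv6_post_routing_hook with
 * the netfilter hook prototype that matches the running kernel version.
 */
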
/*
 * sfe_cm_ipv4_post_routing_hook()
 *      Called for packets about to leave the box - either locally generated or forwarded from another interface
 */
sfe_cm_ipv4_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn)
{
        return sfe_cm_post_routing(skb, true);
}

/*
 * sfe_cm_ipv6_post_routing_hook()
 *      Called for packets about to leave the box - either locally generated or forwarded from another interface
 */
sfe_cm_ipv6_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn)
{
        return sfe_cm_post_routing(skb, false);
}

#ifdef CONFIG_NF_CONNTRACK_EVENTS
/*
 * sfe_cm_conntrack_event()
 *      Callback event invoked when a conntrack connection's state changes.
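 *
 * We only act on destroy events, using them to remove the corresponding
 * SFE rule when conntrack tears the connection down.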
 */
#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
static int sfe_cm_conntrack_event(struct notifier_block *this,
                                  unsigned long events, void *ptr)
#else
static int sfe_cm_conntrack_event(unsigned int events, struct nf_ct_event *item)
#endif
{
#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
        struct nf_ct_event *item = ptr;
#endif
        struct sfe_connection_destroy sid;
        struct nf_conn *ct = item->ct;
        struct nf_conntrack_tuple orig_tuple;

        /*
         * If we don't have a conntrack entry then we're done.
         */
        if (unlikely(!ct)) {
                DEBUG_WARN("no ct in conntrack event callback\n");
                return NOTIFY_DONE;
        }

#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0))
        /*
         * If this is an untracked connection then we can't have any state either.
         */
        if (unlikely(nf_ct_is_untracked(ct))) {
                DEBUG_TRACE("ignoring untracked conn\n");
                return NOTIFY_DONE;
        }
#endif /*KERNEL_VERSION(4, 12, 0)*/

        /*
         * We're only interested in destroy events.
         */
        if (unlikely(!(events & (1 << IPCT_DESTROY)))) {
                DEBUG_TRACE("ignoring non-destroy event\n");
                return NOTIFY_DONE;
        }

        orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
        sid.protocol = (s32)orig_tuple.dst.protonum;

        /*
         * Extract information from the conntrack connection.  We're only interested
         * in nominal connection information (i.e. we're ignoring any NAT information).
         */
        switch (sid.protocol) {
        case IPPROTO_TCP:
                sid.src_port = orig_tuple.src.u.tcp.port;
                sid.dest_port = orig_tuple.dst.u.tcp.port;
                break;

        case IPPROTO_UDP:
                sid.src_port = orig_tuple.src.u.udp.port;
                sid.dest_port = orig_tuple.dst.u.udp.port;
                break;

        default:
                DEBUG_TRACE("unhandled protocol: %d\n", sid.protocol);
                return NOTIFY_DONE;
        }

        if (likely(nf_ct_l3num(ct) == AF_INET)) {
                sid.src_ip.ip = (__be32)orig_tuple.src.u3.ip;
                sid.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip;

                sfe_ipv4_destroy_rule(&sid);
        } else if (likely(nf_ct_l3num(ct) == AF_INET6)) {
                sid.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6);
                sid.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6);

                sfe_ipv6_destroy_rule(&sid);
        } else {
                DEBUG_TRACE("ignoring non-IPv4 and non-IPv6 connection\n");
        }

        return NOTIFY_DONE;
}

/*
 * Netfilter conntrack event system to monitor connection tracking changes
 */
#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
static struct notifier_block sfe_cm_conntrack_notifier = {
        .notifier_call = sfe_cm_conntrack_event,
};
#else
static struct nf_ct_event_notifier sfe_cm_conntrack_notifier = {
        .fcn = sfe_cm_conntrack_event,
};
#endif
#endif

/*
 * Structure to establish a hook into the post routing netfilter point - this
 * will pick up local outbound and packets going from one interface to another.
 *
 * Note: see include/linux/netfilter_ipv4.h for info related to priority levels.
 * We want to examine packets after NAT translation and any ALG processing.
 */
static struct nf_hook_ops sfe_cm_ops_post_routing[] __read_mostly = {
        SFE_IPV4_NF_POST_ROUTING_HOOK(__sfe_cm_ipv4_post_routing_hook),
#ifdef SFE_SUPPORT_IPV6
        SFE_IPV6_NF_POST_ROUTING_HOOK(__sfe_cm_ipv6_post_routing_hook),
#endif
};

/*
 * sfe_cm_sync_rule()
 *      Synchronize a connection's state.
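 *
 * Called by the SFE datapath with accumulated statistics for an accelerated
 * flow; we top up the conntrack timeout and the byte/packet counters so the
 * kernel's view of the connection stays alive and roughly accurate even
 * though the traffic bypasses the normal stack.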
 */
static void sfe_cm_sync_rule(struct sfe_connection_sync *sis)
{
        struct nf_conntrack_tuple_hash *h;
        struct nf_conntrack_tuple tuple;
        struct nf_conn *ct;
        SFE_NF_CONN_ACCT(acct);

        /*
         * Create a tuple so as to be able to look up a connection
         */
        memset(&tuple, 0, sizeof(tuple));
        tuple.src.u.all = (__be16)sis->src_port;
        tuple.dst.dir = IP_CT_DIR_ORIGINAL;
        tuple.dst.protonum = (u8)sis->protocol;
        tuple.dst.u.all = (__be16)sis->dest_port;

        if (sis->is_v6) {
                tuple.src.u3.in6 = *((struct in6_addr *)sis->src_ip.ip6);
                tuple.dst.u3.in6 = *((struct in6_addr *)sis->dest_ip.ip6);
                tuple.src.l3num = AF_INET6;

                DEBUG_TRACE("update connection - p: %d, s: %pI6:%u, d: %pI6:%u\n",
                            (int)tuple.dst.protonum,
                            &tuple.src.u3.in6, (unsigned int)ntohs(tuple.src.u.all),
                            &tuple.dst.u3.in6, (unsigned int)ntohs(tuple.dst.u.all));
        } else {
                tuple.src.u3.ip = sis->src_ip.ip;
                tuple.dst.u3.ip = sis->dest_ip.ip;
                tuple.src.l3num = AF_INET;

                DEBUG_TRACE("update connection - p: %d, s: %pI4:%u, d: %pI4:%u\n",
                            (int)tuple.dst.protonum,
                            &tuple.src.u3.ip, (unsigned int)ntohs(tuple.src.u.all),
                            &tuple.dst.u3.ip, (unsigned int)ntohs(tuple.dst.u.all));
        }

        /*
         * Look up conntrack connection
         */
        h = nf_conntrack_find_get(&init_net, SFE_NF_CT_DEFAULT_ZONE, &tuple);
        if (unlikely(!h)) {
                DEBUG_TRACE("no connection found\n");
                return;
        }

        ct = nf_ct_tuplehash_to_ctrack(h);
#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 9, 0))
        NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
#endif /*KERNEL_VERSION(4, 9, 0)*/

        /*
         * Only update if this is not a fixed timeout
         */
        if (!test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
                spin_lock_bh(&ct->lock);
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0))
                ct->timeout += sis->delta_jiffies;
#else
                ct->timeout.expires += sis->delta_jiffies;
#endif /*KERNEL_VERSION(4, 9, 0)*/
                spin_unlock_bh(&ct->lock);
        }

        acct = nf_conn_acct_find(ct);
        if (acct) {
                spin_lock_bh(&ct->lock);
                atomic64_add(sis->src_new_packet_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].packets);
                atomic64_add(sis->src_new_byte_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].bytes);
                atomic64_add(sis->dest_new_packet_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].packets);
                atomic64_add(sis->dest_new_byte_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].bytes);
                spin_unlock_bh(&ct->lock);
        }

        switch (sis->protocol) {
        case IPPROTO_TCP:
                spin_lock_bh(&ct->lock);
                if (ct->proto.tcp.seen[0].td_maxwin < sis->src_td_max_window) {
                        ct->proto.tcp.seen[0].td_maxwin = sis->src_td_max_window;
                }
                if ((s32)(ct->proto.tcp.seen[0].td_end - sis->src_td_end) < 0) {
                        ct->proto.tcp.seen[0].td_end = sis->src_td_end;
                }
                if ((s32)(ct->proto.tcp.seen[0].td_maxend - sis->src_td_max_end) < 0) {
                        ct->proto.tcp.seen[0].td_maxend = sis->src_td_max_end;
                }
                if (ct->proto.tcp.seen[1].td_maxwin < sis->dest_td_max_window) {
                        ct->proto.tcp.seen[1].td_maxwin = sis->dest_td_max_window;
                }
                if ((s32)(ct->proto.tcp.seen[1].td_end - sis->dest_td_end) < 0) {
                        ct->proto.tcp.seen[1].td_end = sis->dest_td_end;
                }
                if ((s32)(ct->proto.tcp.seen[1].td_maxend - sis->dest_td_max_end) < 0) {
                        ct->proto.tcp.seen[1].td_maxend = sis->dest_td_max_end;
                }
                spin_unlock_bh(&ct->lock);
                break;
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 4, 0))
        case IPPROTO_UDP:
                /*
                 * In Linux connection tracking, a UDP flow has two timeout values:
                 * /proc/sys/net/netfilter/nf_conntrack_udp_timeout:
                 *      for a unidirectional UDP flow, normally 60 seconds
                 * /proc/sys/net/netfilter/nf_conntrack_udp_timeout_stream:
                 *      for a bidirectional UDP flow, normally 180 seconds
                 *
                 * Linux moves a UDP flow's timer to the stream timeout once it has seen
                 * packets in the reply direction.  But if the flow is accelerated by NSS
                 * or SFE, Linux won't see any packets, so we have to do the same thing
                 * here when processing the stats sync message.
                 */
                if (!test_bit(IPS_ASSURED_BIT, &ct->status) && acct) {
                        u_int64_t reply_pkts = atomic64_read(&SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].packets);

                        if (reply_pkts != 0) {
                                unsigned int *timeouts;

                                set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
                                set_bit(IPS_ASSURED_BIT, &ct->status);

#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 19, 0))
                                timeouts = nf_ct_timeout_lookup(ct);
#else
                                struct nf_conntrack_l4proto *l4proto;

                                l4proto = __nf_ct_l4proto_find((sis->is_v6 ? AF_INET6 : AF_INET), IPPROTO_UDP);
                                timeouts = nf_ct_timeout_lookup(&init_net, ct, l4proto);
#endif /*KERNEL_VERSION(4, 19, 0)*/

                                spin_lock_bh(&ct->lock);
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0))
                                ct->timeout = jiffies + timeouts[UDP_CT_REPLIED];
#else
                                ct->timeout.expires = jiffies + timeouts[UDP_CT_REPLIED];
#endif /*KERNEL_VERSION(4, 9, 0)*/
                                spin_unlock_bh(&ct->lock);
                        }
                }
                break;
#endif /*KERNEL_VERSION(3, 4, 0)*/
        }

        /*
         * Release connection
         */
        nf_ct_put(ct);
}

/*
 * sfe_cm_device_event()
 */
int sfe_cm_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
        struct net_device *dev = SFE_DEV_EVENT_PTR(ptr);

        if (dev && (event == NETDEV_DOWN)) {
                sfe_ipv4_destroy_all_rules_for_dev(dev);
                sfe_ipv6_destroy_all_rules_for_dev(dev);
        }

        return NOTIFY_DONE;
}

/*
 * sfe_cm_inet_event()
 */
static int sfe_cm_inet_event(struct notifier_block *this, unsigned long event, void *ptr)
{
        struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;

        if (dev && (event == NETDEV_DOWN)) {
                sfe_ipv4_destroy_all_rules_for_dev(dev);
        }

        return NOTIFY_DONE;
}

/*
 * sfe_cm_inet6_event()
 */
static int sfe_cm_inet6_event(struct notifier_block *this, unsigned long event, void *ptr)
{
        struct net_device *dev = ((struct inet6_ifaddr *)ptr)->idev->dev;

        if (dev && (event == NETDEV_DOWN)) {
                sfe_ipv6_destroy_all_rules_for_dev(dev);
        }

        return NOTIFY_DONE;
}

/*
 * sfe_cm_get_exceptions
 *      dump exception counters
 */
static ssize_t sfe_cm_get_exceptions(struct device *dev,
                                     struct device_attribute *attr,
                                     char *buf)
{
        int idx, len;
        struct sfe_cm *sc = &__sc;

        spin_lock_bh(&sc->lock);
        for (len = 0, idx = 0; idx < SFE_CM_EXCEPTION_MAX; idx++) {
                if (sc->exceptions[idx]) {
                        len += snprintf(buf + len, (ssize_t)(PAGE_SIZE - len), "%s = %d\n", sfe_cm_exception_events_string[idx], sc->exceptions[idx]);
                }
        }
        spin_unlock_bh(&sc->lock);

        return len;
}

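/*
 * The attribute is created at /sys/sfe_cm/exceptions by sfe_cm_init() below
 * and only non-zero counters are printed.  Example output (counter values
 * are illustrative only):
 *
 *   # cat /sys/sfe_cm/exceptions
 *   NO_CT = 12
 *   TCP_NOT_ASSURED = 3
 */
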
/*
 * sysfs attributes.
 */
static const struct device_attribute sfe_cm_exceptions_attr =
        __ATTR(exceptions, S_IRUGO, sfe_cm_get_exceptions, NULL);

/*
 * sfe_cm_init()
 */
static int __init sfe_cm_init(void)
{
        struct sfe_cm *sc = &__sc;
        int result = -1;

        DEBUG_INFO("SFE CM init\n");

        /*
         * Create sys/sfe_cm
         */
        sc->sys_sfe_cm = kobject_create_and_add("sfe_cm", NULL);
        if (!sc->sys_sfe_cm) {
                DEBUG_ERROR("failed to register sfe_cm\n");
                goto exit1;
        }

        /*
         * Create sys/sfe_cm/exceptions
         */
        result = sysfs_create_file(sc->sys_sfe_cm, &sfe_cm_exceptions_attr.attr);
        if (result) {
                DEBUG_ERROR("failed to register exceptions file: %d\n", result);
                goto exit2;
        }

        sc->dev_notifier.notifier_call = sfe_cm_device_event;
        sc->dev_notifier.priority = 1;
        register_netdevice_notifier(&sc->dev_notifier);

        sc->inet_notifier.notifier_call = sfe_cm_inet_event;
        sc->inet_notifier.priority = 1;
        register_inetaddr_notifier(&sc->inet_notifier);

        sc->inet6_notifier.notifier_call = sfe_cm_inet6_event;
        sc->inet6_notifier.priority = 1;
        register_inet6addr_notifier(&sc->inet6_notifier);

        /*
         * Register our netfilter hooks.
         */
        result = nf_register_net_hooks(&init_net, sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing));
        if (result < 0) {
                DEBUG_ERROR("can't register nf post routing hook: %d\n", result);
                goto exit3;
        }

        /*
         * Register a notifier hook to get fast notifications of expired connections.
         * Note: when CONFIG_NF_CONNTRACK_CHAIN_EVENTS is enabled, nf_conntrack_register_notifier()
         * always returns 0.
         */
#ifdef CONFIG_NF_CONNTRACK_EVENTS
#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
        (void)nf_conntrack_register_chain_notifier(&init_net, &sfe_cm_conntrack_notifier);
#else
        result = nf_conntrack_register_notifier(&init_net, &sfe_cm_conntrack_notifier);
        if (result < 0) {
                DEBUG_ERROR("can't register nf notifier hook: %d\n", result);
                goto exit4;
        }
#endif
#endif

        spin_lock_init(&sc->lock);

        /*
         * Hook the receive path in the network stack.
         */
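        /*
         * athrs_fast_nat_recv is a receive-path hook pointer assumed to be
         * added to the core network stack by the companion SFE kernel patch;
         * once set, received packets are offered to sfe_cm_recv() before
         * normal IP input processing.
         */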
        BUG_ON(athrs_fast_nat_recv);
        RCU_INIT_POINTER(athrs_fast_nat_recv, sfe_cm_recv);

        /*
         * Hook the shortcut sync callback.
         */
        sfe_ipv4_register_sync_rule_callback(sfe_cm_sync_rule);
        sfe_ipv6_register_sync_rule_callback(sfe_cm_sync_rule);
        return 0;

#ifdef CONFIG_NF_CONNTRACK_EVENTS
#ifndef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
exit4:
        nf_unregister_net_hooks(&init_net, sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing));
#endif
#endif
exit3:
        unregister_inet6addr_notifier(&sc->inet6_notifier);
        unregister_inetaddr_notifier(&sc->inet_notifier);
        unregister_netdevice_notifier(&sc->dev_notifier);
exit2:
        kobject_put(sc->sys_sfe_cm);

exit1:
        return result;
}

/*
 * sfe_cm_exit()
 */
static void __exit sfe_cm_exit(void)
{
        struct sfe_cm *sc = &__sc;

        DEBUG_INFO("SFE CM exit\n");

        /*
         * Unregister our sync callback.
         */
        sfe_ipv4_register_sync_rule_callback(NULL);
        sfe_ipv6_register_sync_rule_callback(NULL);

        /*
         * Unregister our receive callback.
         */
        RCU_INIT_POINTER(athrs_fast_nat_recv, NULL);

        /*
         * Wait for all callbacks to complete.
         */
        rcu_barrier();

        /*
         * Destroy all connections.
         */
        sfe_ipv4_destroy_all_rules_for_dev(NULL);
        sfe_ipv6_destroy_all_rules_for_dev(NULL);

#ifdef CONFIG_NF_CONNTRACK_EVENTS
#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
        nf_conntrack_unregister_chain_notifier(&init_net, &sfe_cm_conntrack_notifier);
#else
        nf_conntrack_unregister_notifier(&init_net, &sfe_cm_conntrack_notifier);
#endif
#endif
        nf_unregister_net_hooks(&init_net, sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing));

        unregister_inet6addr_notifier(&sc->inet6_notifier);
        unregister_inetaddr_notifier(&sc->inet_notifier);
        unregister_netdevice_notifier(&sc->dev_notifier);

        kobject_put(sc->sys_sfe_cm);
}

module_init(sfe_cm_init)
module_exit(sfe_cm_exit)

MODULE_DESCRIPTION("Shortcut Forwarding Engine - Connection Manager");
MODULE_LICENSE("Dual BSD/GPL");