OSDN Git Service

bpf: selftests: test skb->tstamp in redirect_neigh
author: Martin KaFai Lau <kafai@fb.com>
Wed, 2 Mar 2022 19:56:41 +0000 (11:56 -0800)
committer: David S. Miller <davem@davemloft.net>
Thu, 3 Mar 2022 14:38:49 +0000 (14:38 +0000)
This patch adds tests on forwarding the delivery_time for
the following cases
- tcp/udp + ip4/ip6 + bpf_redirect_neigh
- tcp/udp + ip4/ip6 + ip[6]_forward
- bpf_skb_set_delivery_time
- The old rcv timestamp expectation on tc-bpf@ingress

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
tools/testing/selftests/bpf/prog_tests/tc_redirect.c
tools/testing/selftests/bpf/progs/test_tc_dtime.c [new file with mode: 0644]

index 647b0a8..2b255e2 100644 (file)
@@ -17,6 +17,8 @@
 #include <linux/if_tun.h>
 #include <linux/limits.h>
 #include <linux/sysctl.h>
+#include <linux/time_types.h>
+#include <linux/net_tstamp.h>
 #include <sched.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include "test_tc_neigh_fib.skel.h"
 #include "test_tc_neigh.skel.h"
 #include "test_tc_peer.skel.h"
+#include "test_tc_dtime.skel.h"
+
+#ifndef TCP_TX_DELAY
+#define TCP_TX_DELAY 37
+#endif
 
 #define NS_SRC "ns_src"
 #define NS_FWD "ns_fwd"
@@ -61,6 +68,7 @@
 #define CHK_PROG_PIN_FILE "/sys/fs/bpf/test_tc_chk"
 
 #define TIMEOUT_MILLIS 10000
+#define NSEC_PER_SEC 1000000000ULL
 
 #define log_err(MSG, ...) \
        fprintf(stderr, "(%s:%d: errno: %s) " MSG "\n", \
@@ -440,6 +448,431 @@ static int set_forwarding(bool enable)
        return 0;
 }
 
+/* Receive one message of @s bytes from @fd, compare the payload with
+ * @expected and validate the SO_TIMESTAMPNS_NEW control message that
+ * comes with it: the timestamp must be non-zero, must not be later
+ * than the current CLOCK_REALTIME and must be less than 5 seconds old.
+ *
+ * Nothing is returned; failures are reported through ASSERT_*().
+ */
+static void rcv_tstamp(int fd, const char *expected, size_t s)
+{
+       struct __kernel_timespec pkt_ts = {};
+       char ctl[CMSG_SPACE(sizeof(pkt_ts))];
+       struct timespec now_ts;
+       struct msghdr msg = {};
+       __u64 now_ns, pkt_ns;
+       struct cmsghdr *cmsg;
+       struct iovec iov;
+       char data[32];
+       int ret;
+
+       iov.iov_base = data;
+       iov.iov_len = sizeof(data);
+       msg.msg_iov = &iov;
+       msg.msg_iovlen = 1;
+       msg.msg_control = &ctl;
+       msg.msg_controllen = sizeof(ctl);
+
+       ret = recvmsg(fd, &msg, 0);
+       if (!ASSERT_EQ(ret, s, "recvmsg"))
+               return;
+       ASSERT_STRNEQ(data, expected, s, "expected rcv data");
+
+       cmsg = CMSG_FIRSTHDR(&msg);
+       if (cmsg && cmsg->cmsg_level == SOL_SOCKET &&
+           cmsg->cmsg_type == SO_TIMESTAMPNS_NEW)
+               memcpy(&pkt_ts, CMSG_DATA(cmsg), sizeof(pkt_ts));
+
+       /* pkt_ts stays zeroed when no timestamp cmsg was received.  The
+        * range checks below are meaningless in that case, so stop at
+        * the first failure instead of piling up follow-on errors.
+        */
+       pkt_ns = pkt_ts.tv_sec * NSEC_PER_SEC + pkt_ts.tv_nsec;
+       if (!ASSERT_NEQ(pkt_ns, 0, "pkt rcv tstamp"))
+               return;
+
+       /* now_ts is not initialized if clock_gettime() failed */
+       ret = clock_gettime(CLOCK_REALTIME, &now_ts);
+       if (!ASSERT_OK(ret, "clock_gettime"))
+               return;
+       now_ns = now_ts.tv_sec * NSEC_PER_SEC + now_ts.tv_nsec;
+
+       if (ASSERT_GE(now_ns, pkt_ns, "check rcv tstamp"))
+               ASSERT_LT(now_ns - pkt_ns, 5 * NSEC_PER_SEC,
+                         "check rcv tstamp");
+}
+
+/* Send the @s bytes at @b on @fd together with an SCM_TXTIME control
+ * message that carries the current CLOCK_TAI time in nanoseconds.
+ * SO_TXTIME (clockid CLOCK_TAI) is enabled on @fd before sending.
+ *
+ * Nothing is returned; failures are reported through ASSERT_*().
+ */
+static void snd_tstamp(int fd, char *b, size_t s)
+{
+       struct sock_txtime opt = { .clockid = CLOCK_TAI };
+       char ctl[CMSG_SPACE(sizeof(__u64))];
+       struct timespec now_ts;
+       struct msghdr msg = {};
+       struct cmsghdr *cmsg;
+       struct iovec iov;
+       __u64 now_ns;
+       int ret;
+
+       /* now_ts is not initialized if clock_gettime() failed */
+       ret = clock_gettime(CLOCK_TAI, &now_ts);
+       if (!ASSERT_OK(ret, "clock_get_time(CLOCK_TAI)"))
+               return;
+       now_ns = now_ts.tv_sec * NSEC_PER_SEC + now_ts.tv_nsec;
+
+       /* Do not hand uninitialized cmsg padding to the kernel */
+       memset(ctl, 0, sizeof(ctl));
+
+       iov.iov_base = b;
+       iov.iov_len = s;
+       msg.msg_iov = &iov;
+       msg.msg_iovlen = 1;
+       msg.msg_control = &ctl;
+       msg.msg_controllen = sizeof(ctl);
+
+       cmsg = CMSG_FIRSTHDR(&msg);
+       cmsg->cmsg_level = SOL_SOCKET;
+       cmsg->cmsg_type = SCM_TXTIME;
+       cmsg->cmsg_len = CMSG_LEN(sizeof(now_ns));
+       /* memcpy() instead of a store through a cast pointer: the cmsg
+        * payload lives in a char buffer with no __u64 alignment or
+        * aliasing guarantee.
+        */
+       memcpy(CMSG_DATA(cmsg), &now_ns, sizeof(now_ns));
+
+       ret = setsockopt(fd, SOL_SOCKET, SO_TXTIME, &opt, sizeof(opt));
+       if (!ASSERT_OK(ret, "setsockopt(SO_TXTIME)"))
+               return;
+
+       ret = sendmsg(fd, &msg, 0);
+       ASSERT_EQ(ret, s, "sendmsg");
+}
+
+/* Exchange one payload between a client in NS_SRC and a server in
+ * NS_DST.
+ *
+ * @family, @type, @addr and @port are handed to start_server().  A
+ * SOCK_STREAM payload is sent with write() and received on the accepted
+ * socket; any other type is sent with snd_tstamp() and received on the
+ * server socket itself.  In both cases rcv_tstamp() validates the
+ * payload and its rcv timestamp.
+ */
+static void test_inet_dtime(int family, int type, const char *addr, __u16 port)
+{
+       int listen_fd, accept_fd = -1, client_fd = -1;
+       char payload[] = "testing testing";
+       struct nstoken *tok;
+       int on = 1;
+       int sent;
+       int err;
+
+       tok = open_netns(NS_DST);
+       if (!ASSERT_OK_PTR(tok, "setns dst"))
+               return;
+       listen_fd = start_server(family, type, addr, port, 0);
+       close_netns(tok);
+
+       if (!ASSERT_GE(listen_fd, 0, "listen"))
+               return;
+
+       /* Ask the kernel to put the (rcv) timestamp on every skb */
+       err = setsockopt(listen_fd, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
+                        &on, sizeof(on));
+       if (!ASSERT_OK(err, "setsockopt(SO_TIMESTAMPNS_NEW)"))
+               goto done;
+
+       if (type == SOCK_STREAM) {
+               /* Make the kernel set the EDT also on the rst/ack it
+                * sends from its own ctl_sk.
+                */
+               err = setsockopt(listen_fd, SOL_TCP, TCP_TX_DELAY, &on,
+                                sizeof(on));
+               if (!ASSERT_OK(err, "setsockopt(TCP_TX_DELAY)"))
+                       goto done;
+       }
+
+       tok = open_netns(NS_SRC);
+       if (!ASSERT_OK_PTR(tok, "setns src"))
+               goto done;
+       client_fd = connect_to_fd(listen_fd, TIMEOUT_MILLIS);
+       close_netns(tok);
+
+       if (!ASSERT_GE(client_fd, 0, "connect_to_fd"))
+               goto done;
+
+       if (type != SOCK_STREAM) {
+               snd_tstamp(client_fd, payload, sizeof(payload));
+               rcv_tstamp(listen_fd, payload, sizeof(payload));
+               goto done;
+       }
+
+       accept_fd = accept(listen_fd, NULL, NULL);
+       if (!ASSERT_GE(accept_fd, 0, "accept"))
+               goto done;
+
+       sent = write(client_fd, payload, sizeof(payload));
+       if (!ASSERT_EQ(sent, sizeof(payload), "send to server"))
+               goto done;
+       rcv_tstamp(accept_fd, payload, sizeof(payload));
+
+done:
+       close(listen_fd);
+       if (accept_fd != -1)
+               close(accept_fd);
+       if (client_fd != -1)
+               close(client_fd);
+}
+
+/* Pin the tc programs of @skel below /sys/fs/bpf/ and attach them with
+ * tc(8) to a clsact qdisc on veth_src (NS_SRC), veth_dst (NS_DST) and
+ * on veth_dst_fwd and veth_src_fwd (NS_FWD).
+ *
+ * Return: 0 on success, -1 on failure (already reported via ASSERT_*()).
+ *
+ * NOTE(review): SYS() is defined outside this function; presumably it
+ * also jumps to the "fail" label on error -- confirm.
+ * NOTE(review): egress_host/ingress_host are pinned at the same path in
+ * NS_SRC and NS_DST; this presumably relies on open_netns() giving each
+ * namespace its own bpffs mount -- confirm.
+ */
+static int netns_load_dtime_bpf(struct test_tc_dtime *skel)
+{
+       struct nstoken *nstoken;
+
+#define PIN_FNAME(__file) "/sys/fs/bpf/" #__file
+#define PIN(__prog) ({                                                 \
+               int err = bpf_program__pin(skel->progs.__prog, PIN_FNAME(__prog)); \
+               if (!ASSERT_OK(err, "pin " #__prog))            \
+                       goto fail;                                      \
+               })
+
+       /* setup ns_src tc progs */
+       nstoken = open_netns(NS_SRC);
+       if (!ASSERT_OK_PTR(nstoken, "setns " NS_SRC))
+               return -1;
+       PIN(egress_host);
+       PIN(ingress_host);
+       SYS("tc qdisc add dev veth_src clsact");
+       SYS("tc filter add dev veth_src ingress bpf da object-pinned "
+           PIN_FNAME(ingress_host));
+       SYS("tc filter add dev veth_src egress bpf da object-pinned "
+           PIN_FNAME(egress_host));
+       close_netns(nstoken);
+
+       /* setup ns_dst tc progs */
+       nstoken = open_netns(NS_DST);
+       if (!ASSERT_OK_PTR(nstoken, "setns " NS_DST))
+               return -1;
+       PIN(egress_host);
+       PIN(ingress_host);
+       SYS("tc qdisc add dev veth_dst clsact");
+       SYS("tc filter add dev veth_dst ingress bpf da object-pinned "
+           PIN_FNAME(ingress_host));
+       SYS("tc filter add dev veth_dst egress bpf da object-pinned "
+           PIN_FNAME(egress_host));
+       close_netns(nstoken);
+
+       /* setup ns_fwd tc progs: both fwd veths get the prio 100 and
+        * the prio 101 program on ingress and on egress
+        */
+       nstoken = open_netns(NS_FWD);
+       if (!ASSERT_OK_PTR(nstoken, "setns " NS_FWD))
+               return -1;
+       PIN(ingress_fwdns_prio100);
+       PIN(egress_fwdns_prio100);
+       PIN(ingress_fwdns_prio101);
+       PIN(egress_fwdns_prio101);
+       SYS("tc qdisc add dev veth_dst_fwd clsact");
+       SYS("tc filter add dev veth_dst_fwd ingress prio 100 bpf da object-pinned "
+           PIN_FNAME(ingress_fwdns_prio100));
+       SYS("tc filter add dev veth_dst_fwd ingress prio 101 bpf da object-pinned "
+           PIN_FNAME(ingress_fwdns_prio101));
+       SYS("tc filter add dev veth_dst_fwd egress prio 100 bpf da object-pinned "
+           PIN_FNAME(egress_fwdns_prio100));
+       SYS("tc filter add dev veth_dst_fwd egress prio 101 bpf da object-pinned "
+           PIN_FNAME(egress_fwdns_prio101));
+       SYS("tc qdisc add dev veth_src_fwd clsact");
+       SYS("tc filter add dev veth_src_fwd ingress prio 100 bpf da object-pinned "
+           PIN_FNAME(ingress_fwdns_prio100));
+       SYS("tc filter add dev veth_src_fwd ingress prio 101 bpf da object-pinned "
+           PIN_FNAME(ingress_fwdns_prio101));
+       SYS("tc filter add dev veth_src_fwd egress prio 100 bpf da object-pinned "
+           PIN_FNAME(egress_fwdns_prio100));
+       SYS("tc filter add dev veth_src_fwd egress prio 101 bpf da object-pinned "
+           PIN_FNAME(egress_fwdns_prio101));
+       close_netns(nstoken);
+
+       /* Both helper macros are local to this function */
+#undef PIN
+#undef PIN_FNAME
+
+       return 0;
+
+fail:
+       close_netns(nstoken);
+       return -1;
+}
+
+/* Counter slots: the second index of skel->bss->dtimes[][] and
+ * skel->bss->errs[][].  The same enum is declared in
+ * progs/test_tc_dtime.c and the two must stay in sync.
+ */
+enum {
+       INGRESS_FWDNS_P100,
+       INGRESS_FWDNS_P101,
+       EGRESS_FWDNS_P100,
+       EGRESS_FWDNS_P101,
+       INGRESS_ENDHOST,
+       EGRESS_ENDHOST,
+       SET_DTIME,
+       __MAX_CNT,
+};
+
+/* Printable counter names used to label assertions.  Designated
+ * initializers keep each name tied to its enum value, and the table is
+ * file-local so it cannot clash with symbols of other tests.
+ */
+static const char * const cnt_names[__MAX_CNT] = {
+       [INGRESS_FWDNS_P100]    = "ingress_fwdns_p100",
+       [INGRESS_FWDNS_P101]    = "ingress_fwdns_p101",
+       [EGRESS_FWDNS_P100]     = "egress_fwdns_p100",
+       [EGRESS_FWDNS_P101]     = "egress_fwdns_p101",
+       [INGRESS_ENDHOST]       = "ingress_endhost",
+       [EGRESS_ENDHOST]        = "egress_endhost",
+       [SET_DTIME]             = "set_dtime",
+};
+
+/* Test ids: the first index of skel->bss->dtimes[][] and
+ * skel->bss->errs[][], also written to skel->bss->test.  The same enum
+ * is declared in progs/test_tc_dtime.c and the two must stay in sync.
+ */
+enum {
+       TCP_IP6_CLEAR_DTIME,
+       TCP_IP4,
+       TCP_IP6,
+       UDP_IP4,
+       UDP_IP6,
+       TCP_IP4_RT_FWD,
+       TCP_IP6_RT_FWD,
+       UDP_IP4_RT_FWD,
+       UDP_IP6_RT_FWD,
+       UKN_TEST,
+       __NR_TESTS,
+};
+
+/* Printable test names used to label assertions.  Every test id,
+ * UKN_TEST included, has an entry so that indexing by any valid id
+ * never reads past the table.
+ */
+static const char * const test_names[__NR_TESTS] = {
+       [TCP_IP6_CLEAR_DTIME]   = "tcp ip6 clear dtime",
+       [TCP_IP4]               = "tcp ip4",
+       [TCP_IP6]               = "tcp ip6",
+       [UDP_IP4]               = "udp ip4",
+       [UDP_IP6]               = "udp ip6",
+       [TCP_IP4_RT_FWD]        = "tcp ip4 rt fwd",
+       [TCP_IP6_RT_FWD]        = "tcp ip6 rt fwd",
+       [UDP_IP4_RT_FWD]        = "udp ip4 rt fwd",
+       [UDP_IP6_RT_FWD]        = "udp ip6 rt fwd",
+       [UKN_TEST]              = "unknown test",
+};
+
+/* Build the "<test name> <counter name>" label for a counter assertion.
+ * The label lives in a static buffer that the next call overwrites.
+ */
+static const char *dtime_cnt_str(int test, int cnt)
+{
+       static char label[64];
+       const char *test_name = test_names[test];
+       const char *cnt_name = cnt_names[cnt];
+
+       snprintf(label, sizeof(label), "%s %s", test_name, cnt_name);
+       return label;
+}
+
+static const char *dtime_err_str(int test, int cnt)
+{
+       static char name[64];
+
+       snprintf(name, sizeof(name), "%s %s errs", test_names[test],
+                cnt_names[cnt]);
+
+       return name;
+}
+
+/* Run the TCP_IP6_CLEAR_DTIME test: one TCP/IPv6 connection, then check
+ * which hooks counted a delivery time and that no hook counted an error.
+ */
+static void test_tcp_clear_dtime(struct test_tc_dtime *skel)
+{
+       /* Expectations, in the order they are asserted */
+       static const struct {
+               int cnt;
+               bool nonzero;
+       } expect[] = {
+               { INGRESS_FWDNS_P100, false },
+               { INGRESS_FWDNS_P101, false },
+               { EGRESS_FWDNS_P100, true },
+               { EGRESS_FWDNS_P101, false },
+               { EGRESS_ENDHOST, true },
+               { INGRESS_ENDHOST, true },
+       };
+       const int t = TCP_IP6_CLEAR_DTIME;
+       __u32 *dtimes = skel->bss->dtimes[t];
+       __u32 *errs = skel->bss->errs[t];
+       size_t k;
+       int cnt;
+
+       skel->bss->test = t;
+       test_inet_dtime(AF_INET6, SOCK_STREAM, IP6_DST, 0);
+
+       for (k = 0; k < sizeof(expect) / sizeof(expect[0]); k++) {
+               cnt = expect[k].cnt;
+               if (expect[k].nonzero)
+                       ASSERT_GT(dtimes[cnt], 0, dtime_cnt_str(t, cnt));
+               else
+                       ASSERT_EQ(dtimes[cnt], 0, dtime_cnt_str(t, cnt));
+       }
+
+       cnt = INGRESS_FWDNS_P100;
+       while (cnt < __MAX_CNT) {
+               ASSERT_EQ(errs[cnt], 0, dtime_err_str(t, cnt));
+               cnt++;
+       }
+}
+
+/* Run one TCP test for @family.  @bpf_fwd selects the TCP_IP4/TCP_IP6
+ * test id, otherwise the matching *_RT_FWD id is used.  Afterwards the
+ * per-hook counters of that test are checked.
+ */
+static void test_tcp_dtime(struct test_tc_dtime *skel, int family, bool bpf_fwd)
+{
+       const bool is_v4 = family == AF_INET;
+       const char *addr = is_v4 ? IP4_DST : IP6_DST;
+       __u32 *dtimes, *errs;
+       int cnt, t;
+
+       if (is_v4)
+               t = bpf_fwd ? TCP_IP4 : TCP_IP4_RT_FWD;
+       else
+               t = bpf_fwd ? TCP_IP6 : TCP_IP6_RT_FWD;
+
+       dtimes = skel->bss->dtimes[t];
+       errs = skel->bss->errs[t];
+
+       skel->bss->test = t;
+       test_inet_dtime(family, SOCK_STREAM, addr, 0);
+
+       /* The fwdns_prio100 prog does not read delivery_time_type, so
+        * the kernel puts the (rcv) timestamp in __sk_buff->tstamp.
+        */
+       ASSERT_EQ(dtimes[INGRESS_FWDNS_P100], 0,
+                 dtime_cnt_str(t, INGRESS_FWDNS_P100));
+
+       cnt = INGRESS_FWDNS_P101;
+       while (cnt < SET_DTIME) {
+               ASSERT_GT(dtimes[cnt], 0, dtime_cnt_str(t, cnt));
+               cnt++;
+       }
+
+       cnt = INGRESS_FWDNS_P100;
+       while (cnt < __MAX_CNT) {
+               ASSERT_EQ(errs[cnt], 0, dtime_err_str(t, cnt));
+               cnt++;
+       }
+}
+
+/* Run one UDP test for @family.  @bpf_fwd selects the UDP_IP4/UDP_IP6
+ * test id, otherwise the matching *_RT_FWD id is used.  Afterwards the
+ * per-hook counters of that test are checked.
+ */
+static void test_udp_dtime(struct test_tc_dtime *skel, int family, bool bpf_fwd)
+{
+       __u32 *dtimes, *errs;
+       const char *addr;
+       int i, t;
+
+       if (family == AF_INET) {
+               t = bpf_fwd ? UDP_IP4 : UDP_IP4_RT_FWD;
+               addr = IP4_DST;
+       } else {
+               t = bpf_fwd ? UDP_IP6 : UDP_IP6_RT_FWD;
+               addr = IP6_DST;
+       }
+
+       dtimes = skel->bss->dtimes[t];
+       errs = skel->bss->errs[t];
+
+       skel->bss->test = t;
+       test_inet_dtime(family, SOCK_DGRAM, addr, 0);
+
+       ASSERT_EQ(dtimes[INGRESS_FWDNS_P100], 0,
+                 dtime_cnt_str(t, INGRESS_FWDNS_P100));
+       /* non mono delivery time is not forwarded; the label names the
+        * counter that is actually checked (P101, not P100)
+        */
+       ASSERT_EQ(dtimes[INGRESS_FWDNS_P101], 0,
+                 dtime_cnt_str(t, INGRESS_FWDNS_P101));
+       for (i = EGRESS_FWDNS_P100; i < SET_DTIME; i++)
+               ASSERT_GT(dtimes[i], 0, dtime_cnt_str(t, i));
+
+       for (i = INGRESS_FWDNS_P100; i < __MAX_CNT; i++)
+               ASSERT_EQ(errs[i], 0, dtime_err_str(t, i));
+}
+
+/* Body of the tc_redirect_dtime test: open and load the test_tc_dtime
+ * skeleton, attach its programs, then run every TCP/UDP test first
+ * with forwarding disabled in NS_FWD (bpf_redirect_neigh() path) and
+ * then with forwarding enabled (kernel ip[6]_forward path).
+ */
+static void test_tc_redirect_dtime(struct netns_setup_result *setup_result)
+{
+       static const int families[] = { AF_INET, AF_INET6 };
+       const size_t nr_families = sizeof(families) / sizeof(families[0]);
+       struct test_tc_dtime *skel;
+       struct nstoken *fwd_ns;
+       size_t f;
+       int err;
+
+       skel = test_tc_dtime__open();
+       if (!ASSERT_OK_PTR(skel, "test_tc_dtime__open"))
+               return;
+
+       /* The redirect targets are set between open and load */
+       skel->rodata->IFINDEX_SRC = setup_result->ifindex_veth_src_fwd;
+       skel->rodata->IFINDEX_DST = setup_result->ifindex_veth_dst_fwd;
+
+       err = test_tc_dtime__load(skel);
+       if (!ASSERT_OK(err, "test_tc_dtime__load"))
+               goto done;
+
+       if (netns_load_dtime_bpf(skel))
+               goto done;
+
+       fwd_ns = open_netns(NS_FWD);
+       if (!ASSERT_OK_PTR(fwd_ns, "setns fwd"))
+               goto done;
+       err = set_forwarding(false);
+       close_netns(fwd_ns);
+       if (!ASSERT_OK(err, "disable forwarding"))
+               goto done;
+
+       test_tcp_clear_dtime(skel);
+
+       for (f = 0; f < nr_families; f++)
+               test_tcp_dtime(skel, families[f], true);
+       for (f = 0; f < nr_families; f++)
+               test_udp_dtime(skel, families[f], true);
+
+       /* Now exercise the kernel ip[6]_forward path instead of
+        * bpf_redirect_neigh().
+        */
+       fwd_ns = open_netns(NS_FWD);
+       if (!ASSERT_OK_PTR(fwd_ns, "setns fwd"))
+               goto done;
+       err = set_forwarding(true);
+       close_netns(fwd_ns);
+       if (!ASSERT_OK(err, "enable forwarding"))
+               goto done;
+
+       for (f = 0; f < nr_families; f++)
+               test_tcp_dtime(skel, families[f], false);
+       for (f = 0; f < nr_families; f++)
+               test_udp_dtime(skel, families[f], false);
+
+done:
+       test_tc_dtime__destroy(skel);
+}
+
 static void test_tc_redirect_neigh_fib(struct netns_setup_result *setup_result)
 {
        struct nstoken *nstoken = NULL;
@@ -787,6 +1220,7 @@ static void *test_tc_redirect_run_tests(void *arg)
        RUN_TEST(tc_redirect_peer_l3);
        RUN_TEST(tc_redirect_neigh);
        RUN_TEST(tc_redirect_neigh_fib);
+       RUN_TEST(tc_redirect_dtime);
        return NULL;
 }
 
diff --git a/tools/testing/selftests/bpf/progs/test_tc_dtime.c b/tools/testing/selftests/bpf/progs/test_tc_dtime.c
new file mode 100644 (file)
index 0000000..9d9e8e1
--- /dev/null
@@ -0,0 +1,349 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2022 Meta
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <linux/bpf.h>
+#include <linux/stddef.h>
+#include <linux/pkt_cls.h>
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include <sys/socket.h>
+
+/* veth_src --- veth_src_fwd --- veth_dst_fwd --- veth_dst
+ *           |                                 |
+ *  ns_src   |              ns_fwd             |   ns_dst
+ *
+ * ns_src and ns_dst: ENDHOST namespace
+ *            ns_fwd: Forwarding namespace
+ */
+
+/* Cast a __sk_buff field such as data/data_end to a pointer.  The
+ * expansion is fully parenthesized so it is safe inside any expression.
+ */
+#define ctx_ptr(field)         ((void *)(long)(field))
+
+/* Addresses that identify the sender's namespace in skb_get_type() */
+#define ip4_src                        __bpf_htonl(0xac100164) /* 172.16.1.100 */
+#define ip4_dst                        __bpf_htonl(0xac100264) /* 172.16.2.100 */
+
+#define ip6_src                        { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
+                                 0x00, 0x01, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe }
+#define ip6_dst                        { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
+                                 0x00, 0x02, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe }
+
+/* True if the two struct in6_addr values are equal.  Both arguments
+ * are parenthesized so any expression may be passed.
+ */
+#define v6_equal(a, b)         ((a).s6_addr32[0] == (b).s6_addr32[0] && \
+                                (a).s6_addr32[1] == (b).s6_addr32[1] && \
+                                (a).s6_addr32[2] == (b).s6_addr32[2] && \
+                                (a).s6_addr32[3] == (b).s6_addr32[3])
+
+/* Redirect targets for bpf_redirect_neigh().  The userspace test sets
+ * them to the ifindex of veth_src_fwd and veth_dst_fwd.
+ */
+volatile const __u32 IFINDEX_SRC;
+volatile const __u32 IFINDEX_DST;
+
+/* Marker values a hook writes to skb->tstamp for the next hook to check */
+#define EGRESS_ENDHOST_MAGIC   0x0b9fbeef
+#define INGRESS_FWDNS_MAGIC    0x1b9fbeef
+#define EGRESS_FWDNS_MAGIC     0x2b9fbeef
+
+enum { /* counter slots: second index of dtimes[][] and errs[][] */
+       INGRESS_FWDNS_P100,
+       INGRESS_FWDNS_P101,
+       EGRESS_FWDNS_P100,
+       EGRESS_FWDNS_P101,
+       INGRESS_ENDHOST,
+       EGRESS_ENDHOST,
+       SET_DTIME,
+       __MAX_CNT,
+};
+
+enum { /* test ids: first index of dtimes[][] and errs[][], value of "test" */
+       TCP_IP6_CLEAR_DTIME,
+       TCP_IP4,
+       TCP_IP6,
+       UDP_IP4,
+       UDP_IP6,
+       TCP_IP4_RT_FWD,
+       TCP_IP6_RT_FWD,
+       UDP_IP4_RT_FWD,
+       UDP_IP6_RT_FWD,
+       UKN_TEST,
+       __NR_TESTS,
+};
+
+enum { /* sender namespace, stored in the second byte of skb_get_type() */
+       SRC_NS = 1,
+       DST_NS,
+};
+
+__u32 dtimes[__NR_TESTS][__MAX_CNT] = {}; /* bumped by inc_dtimes() */
+__u32 errs[__NR_TESTS][__MAX_CNT] = {}; /* bumped by inc_errs() */
+__u32 test = 0; /* id of the running test; selects the dtimes[]/errs[] row */
+
+/* Bump counter @idx of dtimes[][] in the row of the running test.  An
+ * out-of-range "test" value is accounted to the UKN_TEST row.
+ */
+static void inc_dtimes(__u32 idx)
+{
+       __u32 row = UKN_TEST;
+
+       if (test < __NR_TESTS)
+               row = test;
+
+       dtimes[row][idx]++;
+}
+
+/* Bump counter @idx of errs[][] in the row of the running test.  An
+ * out-of-range "test" value is accounted to the UKN_TEST row.
+ */
+static void inc_errs(__u32 idx)
+{
+       __u32 row = UKN_TEST;
+
+       if (test < __NR_TESTS)
+               row = test;
+
+       errs[row][idx]++;
+}
+
+/* Low byte of a skb_get_type() value: the inet protocol */
+static int skb_proto(int type)
+{
+       const int low_byte = 0xff;
+
+       return type & low_byte;
+}
+
+/* Second byte of a skb_get_type() value: the netns of the sender */
+static int skb_ns(int type)
+{
+       int shifted = type >> 8;
+
+       return shifted & 0xff;
+}
+
+/* True while the TCP_IP6_CLEAR_DTIME test is running */
+static bool fwdns_clear_dtime(void)
+{
+       if (test == TCP_IP6_CLEAR_DTIME)
+               return true;
+
+       return false;
+}
+
+/* True for every test id that comes before the *_RT_FWD ids */
+static bool bpf_fwd(void)
+{
+       if (test >= TCP_IP4_RT_FWD)
+               return false;
+
+       return true;
+}
+
+/* Classify @skb by the IPv4/IPv6 header that follows the ethhdr.
+ *
+ * Only TCP and UDP packets whose source address is ip4_src/ip6_src
+ * (SRC_NS) or ip4_dst/ip6_dst (DST_NS) count as testing traffic.
+ *
+ * Return value, with the action most callers take for it:
+ * -1: parse error: TC_ACT_SHOT
+ *  0: not testing traffic: TC_ACT_OK
+ * >0: first byte is the inet_proto, second byte has the netns
+ *     of the sender
+ */
+static int skb_get_type(struct __sk_buff *skb)
+{
+       void *data_end = ctx_ptr(skb->data_end);
+       void *data = ctx_ptr(skb->data);
+       __u8 inet_proto = 0, ns = 0;
+       struct ipv6hdr *ip6h;
+       struct iphdr *iph;
+
+       switch (skb->protocol) {
+       case __bpf_htons(ETH_P_IP):
+               iph = data + sizeof(struct ethhdr);
+               if (iph + 1 > data_end)
+                       return -1;
+               if (iph->saddr == ip4_src)
+                       ns = SRC_NS;
+               else if (iph->saddr == ip4_dst)
+                       ns = DST_NS;
+               inet_proto = iph->protocol;
+               break;
+       case __bpf_htons(ETH_P_IPV6):
+               ip6h = data + sizeof(struct ethhdr);
+               if (ip6h + 1 > data_end)
+                       return -1;
+               if (v6_equal(ip6h->saddr, (struct in6_addr)ip6_src))
+                       ns = SRC_NS;
+               else if (v6_equal(ip6h->saddr, (struct in6_addr)ip6_dst))
+                       ns = DST_NS;
+               inet_proto = ip6h->nexthdr;
+               break;
+       default:
+               return 0;
+       }
+
+       if ((inet_proto != IPPROTO_TCP && inet_proto != IPPROTO_UDP) || !ns)
+               return 0;
+
+       return (ns << 8 | inet_proto);
+}
+
+/* format: direction@iface@netns
+ * egress@veth_(src|dst)@ns_(src|dst)
+ *
+ * Checks the delivery time of testing traffic leaving an end host:
+ * TCP must have a non-zero tstamp of type BPF_SKB_DELIVERY_TIME_MONO,
+ * anything else (UDP) a non-zero tstamp of type
+ * BPF_SKB_DELIVERY_TIME_UNSPEC.  The result is counted in the
+ * EGRESS_ENDHOST slot of dtimes[] or errs[], then skb->tstamp is
+ * overwritten with EGRESS_ENDHOST_MAGIC.
+ *
+ * Returns TC_ACT_SHOT on a parse error, TC_ACT_OK otherwise.
+ */
+SEC("tc")
+int egress_host(struct __sk_buff *skb)
+{
+       int skb_type;
+
+       skb_type = skb_get_type(skb);
+       if (skb_type == -1)
+               return TC_ACT_SHOT;
+       if (!skb_type)
+               return TC_ACT_OK;
+
+       if (skb_proto(skb_type) == IPPROTO_TCP) {
+               if (skb->delivery_time_type == BPF_SKB_DELIVERY_TIME_MONO &&
+                   skb->tstamp)
+                       inc_dtimes(EGRESS_ENDHOST);
+               else
+                       inc_errs(EGRESS_ENDHOST);
+       } else {
+               if (skb->delivery_time_type == BPF_SKB_DELIVERY_TIME_UNSPEC &&
+                   skb->tstamp)
+                       inc_dtimes(EGRESS_ENDHOST);
+               else
+                       inc_errs(EGRESS_ENDHOST);
+       }
+
+       skb->tstamp = EGRESS_ENDHOST_MAGIC;
+
+       return TC_ACT_OK;
+}
+
+/* ingress@veth_(src|dst)@ns_(src|dst)
+ *
+ * Testing traffic arriving at an end host must carry a tstamp equal to
+ * EGRESS_FWDNS_MAGIC with type BPF_SKB_DELIVERY_TIME_MONO.  The result
+ * is counted in the INGRESS_ENDHOST slot of dtimes[] or errs[].
+ *
+ * Returns TC_ACT_SHOT on a parse error, TC_ACT_OK otherwise.
+ */
+SEC("tc")
+int ingress_host(struct __sk_buff *skb)
+{
+       int skb_type;
+
+       skb_type = skb_get_type(skb);
+       if (skb_type == -1)
+               return TC_ACT_SHOT;
+       if (!skb_type)
+               return TC_ACT_OK;
+
+       if (skb->delivery_time_type == BPF_SKB_DELIVERY_TIME_MONO &&
+           skb->tstamp == EGRESS_FWDNS_MAGIC)
+               inc_dtimes(INGRESS_ENDHOST);
+       else
+               inc_errs(INGRESS_ENDHOST);
+
+       return TC_ACT_OK;
+}
+
+/* ingress@veth_(src|dst)_fwd@ns_fwd priority 100
+ *
+ * This program never reads skb->delivery_time_type.  Seeing
+ * EGRESS_ENDHOST_MAGIC in skb->tstamp is therefore counted as an error
+ * in the INGRESS_FWDNS_P100 slot.  In the TCP_IP6_CLEAR_DTIME test it
+ * also clears skb->tstamp.
+ *
+ * Returns TC_ACT_SHOT on a parse error, TC_ACT_OK for non-testing
+ * traffic and TC_ACT_UNSPEC for testing traffic (presumably so that
+ * the priority 101 filter runs next -- confirm).
+ */
+SEC("tc")
+int ingress_fwdns_prio100(struct __sk_buff *skb)
+{
+       int skb_type;
+
+       skb_type = skb_get_type(skb);
+       if (skb_type == -1)
+               return TC_ACT_SHOT;
+       if (!skb_type)
+               return TC_ACT_OK;
+
+       /* delivery_time is only available to the ingress
+        * if the tc-bpf checks the skb->delivery_time_type.
+        */
+       if (skb->tstamp == EGRESS_ENDHOST_MAGIC)
+               inc_errs(INGRESS_FWDNS_P100);
+
+       if (fwdns_clear_dtime())
+               skb->tstamp = 0;
+
+       return TC_ACT_UNSPEC;
+}
+
+/* egress@veth_(src|dst)_fwd@ns_fwd priority 100
+ *
+ * Testing traffic must carry INGRESS_FWDNS_MAGIC in skb->tstamp.  The
+ * result is counted in the EGRESS_FWDNS_P100 slot of dtimes[] or
+ * errs[].  In the TCP_IP6_CLEAR_DTIME test it also clears skb->tstamp.
+ *
+ * Returns TC_ACT_SHOT on a parse error, TC_ACT_OK for non-testing
+ * traffic and TC_ACT_UNSPEC for testing traffic (presumably so that
+ * the priority 101 filter runs next -- confirm).
+ */
+SEC("tc")
+int egress_fwdns_prio100(struct __sk_buff *skb)
+{
+       int skb_type;
+
+       skb_type = skb_get_type(skb);
+       if (skb_type == -1)
+               return TC_ACT_SHOT;
+       if (!skb_type)
+               return TC_ACT_OK;
+
+       /* delivery_time is always available to egress, even if
+        * the tc-bpf did not use the delivery_time_type.
+        */
+       if (skb->tstamp == INGRESS_FWDNS_MAGIC)
+               inc_dtimes(EGRESS_FWDNS_P100);
+       else
+               inc_errs(EGRESS_FWDNS_P100);
+
+       if (fwdns_clear_dtime())
+               skb->tstamp = 0;
+
+       return TC_ACT_UNSPEC;
+}
+
+/* ingress@veth_(src|dst)_fwd@ns_fwd priority 101
+ *
+ * Unlike the priority 100 program this one reads
+ * skb->delivery_time_type.  When a type is set, the skb must be of type
+ * BPF_SKB_DELIVERY_TIME_MONO with tstamp == EGRESS_ENDHOST_MAGIC (0 is
+ * expected for UDP) and the TCP_IP6_CLEAR_DTIME test must not be
+ * running.  When no type is set, that is an error unless the clear
+ * test is running or no delivery time is expected (UDP).  Results go to
+ * the INGRESS_FWDNS_P101 slot.
+ *
+ * The delivery time is then set to INGRESS_FWDNS_MAGIC: by a direct
+ * store for a MONO skb, otherwise through bpf_skb_set_delivery_time(),
+ * where the MONO call must succeed and the UNSPEC call with a non-zero
+ * time must fail (else SET_DTIME errors are counted).
+ *
+ * Returns TC_ACT_SHOT for a parse error or non-testing traffic.
+ * Otherwise, if bpf_fwd() is true, the result of bpf_redirect_neigh()
+ * to the opposite fwd veth (IFINDEX_DST for traffic from SRC_NS,
+ * IFINDEX_SRC for the rest); if not, TC_ACT_OK.
+ */
+SEC("tc")
+int ingress_fwdns_prio101(struct __sk_buff *skb)
+{
+       __u64 expected_dtime = EGRESS_ENDHOST_MAGIC;
+       int skb_type;
+
+       skb_type = skb_get_type(skb);
+       if (skb_type == -1 || !skb_type)
+               /* Should have handled in prio100 */
+               return TC_ACT_SHOT;
+
+       if (skb_proto(skb_type) == IPPROTO_UDP)
+               expected_dtime = 0;
+
+       if (skb->delivery_time_type) {
+               if (fwdns_clear_dtime() ||
+                   skb->delivery_time_type != BPF_SKB_DELIVERY_TIME_MONO ||
+                   skb->tstamp != expected_dtime)
+                       inc_errs(INGRESS_FWDNS_P101);
+               else
+                       inc_dtimes(INGRESS_FWDNS_P101);
+       } else {
+               if (!fwdns_clear_dtime() && expected_dtime)
+                       inc_errs(INGRESS_FWDNS_P101);
+       }
+
+       if (skb->delivery_time_type == BPF_SKB_DELIVERY_TIME_MONO) {
+               skb->tstamp = INGRESS_FWDNS_MAGIC;
+       } else {
+               if (bpf_skb_set_delivery_time(skb, INGRESS_FWDNS_MAGIC,
+                                             BPF_SKB_DELIVERY_TIME_MONO))
+                       inc_errs(SET_DTIME);
+               if (!bpf_skb_set_delivery_time(skb, INGRESS_FWDNS_MAGIC,
+                                              BPF_SKB_DELIVERY_TIME_UNSPEC))
+                       inc_errs(SET_DTIME);
+       }
+
+       if (skb_ns(skb_type) == SRC_NS)
+               return bpf_fwd() ?
+                       bpf_redirect_neigh(IFINDEX_DST, NULL, 0, 0) : TC_ACT_OK;
+       else
+               return bpf_fwd() ?
+                       bpf_redirect_neigh(IFINDEX_SRC, NULL, 0, 0) : TC_ACT_OK;
+}
+
+/* egress@veth_(src|dst)_fwd@ns_fwd priority 101
+ *
+ * When a delivery_time_type is set, the skb must be of type
+ * BPF_SKB_DELIVERY_TIME_MONO with tstamp == INGRESS_FWDNS_MAGIC and
+ * the TCP_IP6_CLEAR_DTIME test must not be running.  When no type is
+ * set, that is an error unless the clear test is running.  Results go
+ * to the EGRESS_FWDNS_P101 slot.
+ *
+ * The delivery time is then set to EGRESS_FWDNS_MAGIC: by a direct
+ * store for a MONO skb, otherwise through bpf_skb_set_delivery_time(),
+ * where the MONO call must succeed and the UNSPEC call with a non-zero
+ * time must fail (else SET_DTIME errors are counted).
+ *
+ * Returns TC_ACT_SHOT for a parse error or non-testing traffic,
+ * TC_ACT_OK otherwise.
+ */
+SEC("tc")
+int egress_fwdns_prio101(struct __sk_buff *skb)
+{
+       int skb_type;
+
+       skb_type = skb_get_type(skb);
+       if (skb_type == -1 || !skb_type)
+               /* Should have handled in prio100 */
+               return TC_ACT_SHOT;
+
+       if (skb->delivery_time_type) {
+               if (fwdns_clear_dtime() ||
+                   skb->delivery_time_type != BPF_SKB_DELIVERY_TIME_MONO ||
+                   skb->tstamp != INGRESS_FWDNS_MAGIC)
+                       inc_errs(EGRESS_FWDNS_P101);
+               else
+                       inc_dtimes(EGRESS_FWDNS_P101);
+       } else {
+               if (!fwdns_clear_dtime())
+                       inc_errs(EGRESS_FWDNS_P101);
+       }
+
+       if (skb->delivery_time_type == BPF_SKB_DELIVERY_TIME_MONO) {
+               skb->tstamp = EGRESS_FWDNS_MAGIC;
+       } else {
+               if (bpf_skb_set_delivery_time(skb, EGRESS_FWDNS_MAGIC,
+                                             BPF_SKB_DELIVERY_TIME_MONO))
+                       inc_errs(SET_DTIME);
+               if (!bpf_skb_set_delivery_time(skb, EGRESS_FWDNS_MAGIC,
+                                              BPF_SKB_DELIVERY_TIME_UNSPEC))
+                       inc_errs(SET_DTIME);
+       }
+
+       return TC_ACT_OK;
+}
+
+char __license[] SEC("license") = "GPL";