OSDN Git Service

Merge tag 'block-5.6-2020-03-13' of git://git.kernel.dk/linux-block
[tomoyo/tomoyo-test1.git] / net / mptcp / options.c
1 // SPDX-License-Identifier: GPL-2.0
2 /* Multipath TCP
3  *
4  * Copyright (c) 2017 - 2019, Intel Corporation.
5  */
6
7 #include <linux/kernel.h>
8 #include <net/tcp.h>
9 #include <net/mptcp.h>
10 #include "protocol.h"
11
12 static bool mptcp_cap_flag_sha256(u8 flags)
13 {
14         return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256;
15 }
16
17 void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr,
18                         int opsize, struct tcp_options_received *opt_rx)
19 {
20         struct mptcp_options_received *mp_opt = &opt_rx->mptcp;
21         u8 subtype = *ptr >> 4;
22         int expected_opsize;
23         u8 version;
24         u8 flags;
25
26         switch (subtype) {
27         case MPTCPOPT_MP_CAPABLE:
28                 /* strict size checking */
29                 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
30                         if (skb->len > tcp_hdr(skb)->doff << 2)
31                                 expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA;
32                         else
33                                 expected_opsize = TCPOLEN_MPTCP_MPC_ACK;
34                 } else {
35                         if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)
36                                 expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK;
37                         else
38                                 expected_opsize = TCPOLEN_MPTCP_MPC_SYN;
39                 }
40                 if (opsize != expected_opsize)
41                         break;
42
43                 /* try to be gentle vs future versions on the initial syn */
44                 version = *ptr++ & MPTCP_VERSION_MASK;
45                 if (opsize != TCPOLEN_MPTCP_MPC_SYN) {
46                         if (version != MPTCP_SUPPORTED_VERSION)
47                                 break;
48                 } else if (version < MPTCP_SUPPORTED_VERSION) {
49                         break;
50                 }
51
52                 flags = *ptr++;
53                 if (!mptcp_cap_flag_sha256(flags) ||
54                     (flags & MPTCP_CAP_EXTENSIBILITY))
55                         break;
56
57                 /* RFC 6824, Section 3.1:
58                  * "For the Checksum Required bit (labeled "A"), if either
59                  * host requires the use of checksums, checksums MUST be used.
60                  * In other words, the only way for checksums not to be used
61                  * is if both hosts in their SYNs set A=0."
62                  *
63                  * Section 3.3.0:
64                  * "If a checksum is not present when its use has been
65                  * negotiated, the receiver MUST close the subflow with a RST as
66                  * it is considered broken."
67                  *
68                  * We don't implement DSS checksum - fall back to TCP.
69                  */
70                 if (flags & MPTCP_CAP_CHECKSUM_REQD)
71                         break;
72
73                 mp_opt->mp_capable = 1;
74                 if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) {
75                         mp_opt->sndr_key = get_unaligned_be64(ptr);
76                         ptr += 8;
77                 }
78                 if (opsize >= TCPOLEN_MPTCP_MPC_ACK) {
79                         mp_opt->rcvr_key = get_unaligned_be64(ptr);
80                         ptr += 8;
81                 }
82                 if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA) {
83                         /* Section 3.1.:
84                          * "the data parameters in a MP_CAPABLE are semantically
85                          * equivalent to those in a DSS option and can be used
86                          * interchangeably."
87                          */
88                         mp_opt->dss = 1;
89                         mp_opt->use_map = 1;
90                         mp_opt->mpc_map = 1;
91                         mp_opt->data_len = get_unaligned_be16(ptr);
92                         ptr += 2;
93                 }
94                 pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d",
95                          version, flags, opsize, mp_opt->sndr_key,
96                          mp_opt->rcvr_key, mp_opt->data_len);
97                 break;
98
99         case MPTCPOPT_DSS:
100                 pr_debug("DSS");
101                 ptr++;
102
103                 /* we must clear 'mpc_map' be able to detect MP_CAPABLE
104                  * map vs DSS map in mptcp_incoming_options(), and reconstruct
105                  * map info accordingly
106                  */
107                 mp_opt->mpc_map = 0;
108                 flags = (*ptr++) & MPTCP_DSS_FLAG_MASK;
109                 mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0;
110                 mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0;
111                 mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0;
112                 mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0;
113                 mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK);
114
115                 pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d",
116                          mp_opt->data_fin, mp_opt->dsn64,
117                          mp_opt->use_map, mp_opt->ack64,
118                          mp_opt->use_ack);
119
120                 expected_opsize = TCPOLEN_MPTCP_DSS_BASE;
121
122                 if (mp_opt->use_ack) {
123                         if (mp_opt->ack64)
124                                 expected_opsize += TCPOLEN_MPTCP_DSS_ACK64;
125                         else
126                                 expected_opsize += TCPOLEN_MPTCP_DSS_ACK32;
127                 }
128
129                 if (mp_opt->use_map) {
130                         if (mp_opt->dsn64)
131                                 expected_opsize += TCPOLEN_MPTCP_DSS_MAP64;
132                         else
133                                 expected_opsize += TCPOLEN_MPTCP_DSS_MAP32;
134                 }
135
136                 /* RFC 6824, Section 3.3:
137                  * If a checksum is present, but its use had
138                  * not been negotiated in the MP_CAPABLE handshake,
139                  * the checksum field MUST be ignored.
140                  */
141                 if (opsize != expected_opsize &&
142                     opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM)
143                         break;
144
145                 mp_opt->dss = 1;
146
147                 if (mp_opt->use_ack) {
148                         if (mp_opt->ack64) {
149                                 mp_opt->data_ack = get_unaligned_be64(ptr);
150                                 ptr += 8;
151                         } else {
152                                 mp_opt->data_ack = get_unaligned_be32(ptr);
153                                 ptr += 4;
154                         }
155
156                         pr_debug("data_ack=%llu", mp_opt->data_ack);
157                 }
158
159                 if (mp_opt->use_map) {
160                         if (mp_opt->dsn64) {
161                                 mp_opt->data_seq = get_unaligned_be64(ptr);
162                                 ptr += 8;
163                         } else {
164                                 mp_opt->data_seq = get_unaligned_be32(ptr);
165                                 ptr += 4;
166                         }
167
168                         mp_opt->subflow_seq = get_unaligned_be32(ptr);
169                         ptr += 4;
170
171                         mp_opt->data_len = get_unaligned_be16(ptr);
172                         ptr += 2;
173
174                         pr_debug("data_seq=%llu subflow_seq=%u data_len=%u",
175                                  mp_opt->data_seq, mp_opt->subflow_seq,
176                                  mp_opt->data_len);
177                 }
178
179                 break;
180
181         default:
182                 break;
183         }
184 }
185
186 void mptcp_get_options(const struct sk_buff *skb,
187                        struct tcp_options_received *opt_rx)
188 {
189         const unsigned char *ptr;
190         const struct tcphdr *th = tcp_hdr(skb);
191         int length = (th->doff * 4) - sizeof(struct tcphdr);
192
193         ptr = (const unsigned char *)(th + 1);
194
195         while (length > 0) {
196                 int opcode = *ptr++;
197                 int opsize;
198
199                 switch (opcode) {
200                 case TCPOPT_EOL:
201                         return;
202                 case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
203                         length--;
204                         continue;
205                 default:
206                         opsize = *ptr++;
207                         if (opsize < 2) /* "silly options" */
208                                 return;
209                         if (opsize > length)
210                                 return; /* don't parse partial options */
211                         if (opcode == TCPOPT_MPTCP)
212                                 mptcp_parse_option(skb, ptr, opsize, opt_rx);
213                         ptr += opsize - 2;
214                         length -= opsize;
215                 }
216         }
217 }
218
219 bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
220                        unsigned int *size, struct mptcp_out_options *opts)
221 {
222         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
223
224         /* we will use snd_isn to detect first pkt [re]transmission
225          * in mptcp_established_options_mp()
226          */
227         subflow->snd_isn = TCP_SKB_CB(skb)->end_seq;
228         if (subflow->request_mptcp) {
229                 pr_debug("local_key=%llu", subflow->local_key);
230                 opts->suboptions = OPTION_MPTCP_MPC_SYN;
231                 opts->sndr_key = subflow->local_key;
232                 *size = TCPOLEN_MPTCP_MPC_SYN;
233                 return true;
234         }
235         return false;
236 }
237
238 void mptcp_rcv_synsent(struct sock *sk)
239 {
240         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
241         struct tcp_sock *tp = tcp_sk(sk);
242
243         pr_debug("subflow=%p", subflow);
244         if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) {
245                 subflow->mp_capable = 1;
246                 subflow->can_ack = 1;
247                 subflow->remote_key = tp->rx_opt.mptcp.sndr_key;
248         } else {
249                 tcp_sk(sk)->is_mptcp = 0;
250         }
251 }
252
253 static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
254                                          unsigned int *size,
255                                          unsigned int remaining,
256                                          struct mptcp_out_options *opts)
257 {
258         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
259         struct mptcp_ext *mpext;
260         unsigned int data_len;
261
262         pr_debug("subflow=%p fourth_ack=%d seq=%x:%x remaining=%d", subflow,
263                  subflow->fourth_ack, subflow->snd_isn,
264                  skb ? TCP_SKB_CB(skb)->seq : 0, remaining);
265
266         if (subflow->mp_capable && !subflow->fourth_ack && skb &&
267             subflow->snd_isn == TCP_SKB_CB(skb)->seq) {
268                 /* When skb is not available, we better over-estimate the
269                  * emitted options len. A full DSS option is longer than
270                  * TCPOLEN_MPTCP_MPC_ACK_DATA, so let's the caller try to fit
271                  * that.
272                  */
273                 mpext = mptcp_get_ext(skb);
274                 data_len = mpext ? mpext->data_len : 0;
275
276                 /* we will check ext_copy.data_len in mptcp_write_options() to
277                  * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and
278                  * TCPOLEN_MPTCP_MPC_ACK
279                  */
280                 opts->ext_copy.data_len = data_len;
281                 opts->suboptions = OPTION_MPTCP_MPC_ACK;
282                 opts->sndr_key = subflow->local_key;
283                 opts->rcvr_key = subflow->remote_key;
284
285                 /* Section 3.1.
286                  * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK
287                  * packets that start the first subflow of an MPTCP connection,
288                  * as well as the first packet that carries data
289                  */
290                 if (data_len > 0)
291                         *size = ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA, 4);
292                 else
293                         *size = TCPOLEN_MPTCP_MPC_ACK;
294
295                 pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d",
296                          subflow, subflow->local_key, subflow->remote_key,
297                          data_len);
298
299                 return true;
300         }
301         return false;
302 }
303
304 static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow,
305                                  struct mptcp_ext *ext)
306 {
307         ext->data_fin = 1;
308
309         if (!ext->use_map) {
310                 /* RFC6824 requires a DSS mapping with specific values
311                  * if DATA_FIN is set but no data payload is mapped
312                  */
313                 ext->use_map = 1;
314                 ext->dsn64 = 1;
315                 ext->data_seq = mptcp_sk(subflow->conn)->write_seq;
316                 ext->subflow_seq = 0;
317                 ext->data_len = 1;
318         } else {
319                 /* If there's an existing DSS mapping, DATA_FIN consumes
320                  * 1 additional byte of mapping space.
321                  */
322                 ext->data_len++;
323         }
324 }
325
326 static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
327                                           unsigned int *size,
328                                           unsigned int remaining,
329                                           struct mptcp_out_options *opts)
330 {
331         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
332         unsigned int dss_size = 0;
333         struct mptcp_ext *mpext;
334         struct mptcp_sock *msk;
335         unsigned int ack_size;
336         bool ret = false;
337         bool can_ack;
338         u64 ack_seq;
339         u8 tcp_fin;
340
341         if (skb) {
342                 mpext = mptcp_get_ext(skb);
343                 tcp_fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
344         } else {
345                 mpext = NULL;
346                 tcp_fin = 0;
347         }
348
349         if (!skb || (mpext && mpext->use_map) || tcp_fin) {
350                 unsigned int map_size;
351
352                 map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64;
353
354                 remaining -= map_size;
355                 dss_size = map_size;
356                 if (mpext)
357                         opts->ext_copy = *mpext;
358
359                 if (skb && tcp_fin &&
360                     subflow->conn->sk_state != TCP_ESTABLISHED)
361                         mptcp_write_data_fin(subflow, &opts->ext_copy);
362                 ret = true;
363         }
364
365         /* passive sockets msk will set the 'can_ack' after accept(), even
366          * if the first subflow may have the already the remote key handy
367          */
368         can_ack = true;
369         opts->ext_copy.use_ack = 0;
370         msk = mptcp_sk(subflow->conn);
371         if (likely(msk && READ_ONCE(msk->can_ack))) {
372                 ack_seq = msk->ack_seq;
373         } else if (subflow->can_ack) {
374                 mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
375                 ack_seq++;
376         } else {
377                 can_ack = false;
378         }
379
380         if (unlikely(!can_ack)) {
381                 *size = ALIGN(dss_size, 4);
382                 return ret;
383         }
384
385         ack_size = TCPOLEN_MPTCP_DSS_ACK64;
386
387         /* Add kind/length/subtype/flag overhead if mapping is not populated */
388         if (dss_size == 0)
389                 ack_size += TCPOLEN_MPTCP_DSS_BASE;
390
391         dss_size += ack_size;
392
393         opts->ext_copy.data_ack = ack_seq;
394         opts->ext_copy.ack64 = 1;
395         opts->ext_copy.use_ack = 1;
396
397         *size = ALIGN(dss_size, 4);
398         return true;
399 }
400
401 bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
402                                unsigned int *size, unsigned int remaining,
403                                struct mptcp_out_options *opts)
404 {
405         unsigned int opt_size = 0;
406         bool ret = false;
407
408         if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts))
409                 ret = true;
410         else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
411                                                opts))
412                 ret = true;
413
414         /* we reserved enough space for the above options, and exceeding the
415          * TCP option space would be fatal
416          */
417         if (WARN_ON_ONCE(opt_size > remaining))
418                 return false;
419
420         *size += opt_size;
421         remaining -= opt_size;
422
423         return ret;
424 }
425
426 bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
427                           struct mptcp_out_options *opts)
428 {
429         struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
430
431         if (subflow_req->mp_capable) {
432                 opts->suboptions = OPTION_MPTCP_MPC_SYNACK;
433                 opts->sndr_key = subflow_req->local_key;
434                 *size = TCPOLEN_MPTCP_MPC_SYNACK;
435                 pr_debug("subflow_req=%p, local_key=%llu",
436                          subflow_req, subflow_req->local_key);
437                 return true;
438         }
439         return false;
440 }
441
442 static bool check_fourth_ack(struct mptcp_subflow_context *subflow,
443                              struct sk_buff *skb,
444                              struct mptcp_options_received *mp_opt)
445 {
446         /* here we can process OoO, in-window pkts, only in-sequence 4th ack
447          * are relevant
448          */
449         if (likely(subflow->fourth_ack ||
450                    TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1))
451                 return true;
452
453         if (mp_opt->use_ack)
454                 subflow->fourth_ack = 1;
455
456         if (subflow->can_ack)
457                 return true;
458
459         /* If the first established packet does not contain MP_CAPABLE + data
460          * then fallback to TCP
461          */
462         if (!mp_opt->mp_capable) {
463                 subflow->mp_capable = 0;
464                 tcp_sk(mptcp_subflow_tcp_sock(subflow))->is_mptcp = 0;
465                 return false;
466         }
467         subflow->remote_key = mp_opt->sndr_key;
468         subflow->can_ack = 1;
469         return true;
470 }
471
472 void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
473                             struct tcp_options_received *opt_rx)
474 {
475         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
476         struct mptcp_options_received *mp_opt;
477         struct mptcp_ext *mpext;
478
479         mp_opt = &opt_rx->mptcp;
480         if (!check_fourth_ack(subflow, skb, mp_opt))
481                 return;
482
483         if (!mp_opt->dss)
484                 return;
485
486         mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
487         if (!mpext)
488                 return;
489
490         memset(mpext, 0, sizeof(*mpext));
491
492         if (mp_opt->use_map) {
493                 if (mp_opt->mpc_map) {
494                         /* this is an MP_CAPABLE carrying MPTCP data
495                          * we know this map the first chunk of data
496                          */
497                         mptcp_crypto_key_sha(subflow->remote_key, NULL,
498                                              &mpext->data_seq);
499                         mpext->data_seq++;
500                         mpext->subflow_seq = 1;
501                         mpext->dsn64 = 1;
502                         mpext->mpc_map = 1;
503                 } else {
504                         mpext->data_seq = mp_opt->data_seq;
505                         mpext->subflow_seq = mp_opt->subflow_seq;
506                         mpext->dsn64 = mp_opt->dsn64;
507                 }
508                 mpext->data_len = mp_opt->data_len;
509                 mpext->use_map = 1;
510         }
511
512         if (mp_opt->use_ack) {
513                 mpext->data_ack = mp_opt->data_ack;
514                 mpext->use_ack = 1;
515                 mpext->ack64 = mp_opt->ack64;
516         }
517
518         mpext->data_fin = mp_opt->data_fin;
519 }
520
521 void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
522 {
523         if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK |
524              OPTION_MPTCP_MPC_ACK) & opts->suboptions) {
525                 u8 len;
526
527                 if (OPTION_MPTCP_MPC_SYN & opts->suboptions)
528                         len = TCPOLEN_MPTCP_MPC_SYN;
529                 else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions)
530                         len = TCPOLEN_MPTCP_MPC_SYNACK;
531                 else if (opts->ext_copy.data_len)
532                         len = TCPOLEN_MPTCP_MPC_ACK_DATA;
533                 else
534                         len = TCPOLEN_MPTCP_MPC_ACK;
535
536                 *ptr++ = htonl((TCPOPT_MPTCP << 24) | (len << 16) |
537                                (MPTCPOPT_MP_CAPABLE << 12) |
538                                (MPTCP_SUPPORTED_VERSION << 8) |
539                                MPTCP_CAP_HMAC_SHA256);
540
541                 if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) &
542                     opts->suboptions))
543                         goto mp_capable_done;
544
545                 put_unaligned_be64(opts->sndr_key, ptr);
546                 ptr += 2;
547                 if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions))
548                         goto mp_capable_done;
549
550                 put_unaligned_be64(opts->rcvr_key, ptr);
551                 ptr += 2;
552                 if (!opts->ext_copy.data_len)
553                         goto mp_capable_done;
554
555                 put_unaligned_be32(opts->ext_copy.data_len << 16 |
556                                    TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
557                 ptr += 1;
558         }
559
560 mp_capable_done:
561         if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
562                 struct mptcp_ext *mpext = &opts->ext_copy;
563                 u8 len = TCPOLEN_MPTCP_DSS_BASE;
564                 u8 flags = 0;
565
566                 if (mpext->use_ack) {
567                         len += TCPOLEN_MPTCP_DSS_ACK64;
568                         flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64;
569                 }
570
571                 if (mpext->use_map) {
572                         len += TCPOLEN_MPTCP_DSS_MAP64;
573
574                         /* Use only 64-bit mapping flags for now, add
575                          * support for optional 32-bit mappings later.
576                          */
577                         flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64;
578                         if (mpext->data_fin)
579                                 flags |= MPTCP_DSS_DATA_FIN;
580                 }
581
582                 *ptr++ = htonl((TCPOPT_MPTCP << 24) |
583                                (len  << 16) |
584                                (MPTCPOPT_DSS << 12) |
585                                (flags));
586
587                 if (mpext->use_ack) {
588                         put_unaligned_be64(mpext->data_ack, ptr);
589                         ptr += 2;
590                 }
591
592                 if (mpext->use_map) {
593                         put_unaligned_be64(mpext->data_seq, ptr);
594                         ptr += 2;
595                         put_unaligned_be32(mpext->subflow_seq, ptr);
596                         ptr += 1;
597                         put_unaligned_be32(mpext->data_len << 16 |
598                                            TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
599                 }
600         }
601 }