OSDN Git Service

Merge tag 'for-linus-5.0-rc6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git...
[uclinux-h8/linux.git] / net / smc / af_smc.c
1 /*
2  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
3  *
4  *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
5  *  applies to SOCK_STREAM sockets only
6  *  offers an alternative communication option for TCP-protocol sockets
7  *  applicable with RoCE-cards only
8  *
9  *  Initial restrictions:
10  *    - support for alternate links postponed
11  *
12  *  Copyright IBM Corp. 2016, 2018
13  *
14  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
15  *              based on prototype from Frank Blaschka
16  */
17
18 #define KMSG_COMPONENT "smc"
19 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
20
21 #include <linux/module.h>
22 #include <linux/socket.h>
23 #include <linux/workqueue.h>
24 #include <linux/in.h>
25 #include <linux/sched/signal.h>
26 #include <linux/if_vlan.h>
27
28 #include <net/sock.h>
29 #include <net/tcp.h>
30 #include <net/smc.h>
31 #include <asm/ioctls.h>
32
33 #include "smc.h"
34 #include "smc_clc.h"
35 #include "smc_llc.h"
36 #include "smc_cdc.h"
37 #include "smc_core.h"
38 #include "smc_ib.h"
39 #include "smc_ism.h"
40 #include "smc_pnet.h"
41 #include "smc_tx.h"
42 #include "smc_rx.h"
43 #include "smc_close.h"
44
/* serialize link group creation */
static DEFINE_MUTEX(smc_create_lgr_pending);

/* forward declarations of the work handlers installed in smc_sock_alloc() */
static void smc_tcp_listen_work(struct work_struct *work);
static void smc_connect_work(struct work_struct *work);
51
52 static void smc_set_keepalive(struct sock *sk, int val)
53 {
54         struct smc_sock *smc = smc_sk(sk);
55
56         smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
57 }
58
59 static struct smc_hashinfo smc_v4_hashinfo = {
60         .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
61 };
62
63 static struct smc_hashinfo smc_v6_hashinfo = {
64         .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
65 };
66
67 int smc_hash_sk(struct sock *sk)
68 {
69         struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
70         struct hlist_head *head;
71
72         head = &h->ht;
73
74         write_lock_bh(&h->lock);
75         sk_add_node(sk, head);
76         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
77         write_unlock_bh(&h->lock);
78
79         return 0;
80 }
81 EXPORT_SYMBOL_GPL(smc_hash_sk);
82
83 void smc_unhash_sk(struct sock *sk)
84 {
85         struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
86
87         write_lock_bh(&h->lock);
88         if (sk_del_node_init(sk))
89                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
90         write_unlock_bh(&h->lock);
91 }
92 EXPORT_SYMBOL_GPL(smc_unhash_sk);
93
94 struct proto smc_proto = {
95         .name           = "SMC",
96         .owner          = THIS_MODULE,
97         .keepalive      = smc_set_keepalive,
98         .hash           = smc_hash_sk,
99         .unhash         = smc_unhash_sk,
100         .obj_size       = sizeof(struct smc_sock),
101         .h.smc_hash     = &smc_v4_hashinfo,
102         .slab_flags     = SLAB_TYPESAFE_BY_RCU,
103 };
104 EXPORT_SYMBOL_GPL(smc_proto);
105
106 struct proto smc_proto6 = {
107         .name           = "SMC6",
108         .owner          = THIS_MODULE,
109         .keepalive      = smc_set_keepalive,
110         .hash           = smc_hash_sk,
111         .unhash         = smc_unhash_sk,
112         .obj_size       = sizeof(struct smc_sock),
113         .h.smc_hash     = &smc_v6_hashinfo,
114         .slab_flags     = SLAB_TYPESAFE_BY_RCU,
115 };
116 EXPORT_SYMBOL_GPL(smc_proto6);
117
118 static int smc_release(struct socket *sock)
119 {
120         struct sock *sk = sock->sk;
121         struct smc_sock *smc;
122         int rc = 0;
123
124         if (!sk)
125                 goto out;
126
127         smc = smc_sk(sk);
128
129         /* cleanup for a dangling non-blocking connect */
130         if (smc->connect_info && sk->sk_state == SMC_INIT)
131                 tcp_abort(smc->clcsock->sk, ECONNABORTED);
132         flush_work(&smc->connect_work);
133         kfree(smc->connect_info);
134         smc->connect_info = NULL;
135
136         if (sk->sk_state == SMC_LISTEN)
137                 /* smc_close_non_accepted() is called and acquires
138                  * sock lock for child sockets again
139                  */
140                 lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
141         else
142                 lock_sock(sk);
143
144         if (!smc->use_fallback) {
145                 rc = smc_close_active(smc);
146                 sock_set_flag(sk, SOCK_DEAD);
147                 sk->sk_shutdown |= SHUTDOWN_MASK;
148         }
149
150         sk->sk_prot->unhash(sk);
151
152         if (smc->clcsock) {
153                 if (smc->use_fallback && sk->sk_state == SMC_LISTEN) {
154                         /* wake up clcsock accept */
155                         rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
156                 }
157                 mutex_lock(&smc->clcsock_release_lock);
158                 sock_release(smc->clcsock);
159                 smc->clcsock = NULL;
160                 mutex_unlock(&smc->clcsock_release_lock);
161         }
162         if (smc->use_fallback) {
163                 if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT)
164                         sock_put(sk); /* passive closing */
165                 sk->sk_state = SMC_CLOSED;
166                 sk->sk_state_change(sk);
167         }
168
169         /* detach socket */
170         sock_orphan(sk);
171         sock->sk = NULL;
172         if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
173                 smc_conn_free(&smc->conn);
174         release_sock(sk);
175
176         sock_put(sk); /* final sock_put */
177 out:
178         return rc;
179 }
180
181 static void smc_destruct(struct sock *sk)
182 {
183         if (sk->sk_state != SMC_CLOSED)
184                 return;
185         if (!sock_flag(sk, SOCK_DEAD))
186                 return;
187
188         sk_refcnt_debug_dec(sk);
189 }
190
191 static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
192                                    int protocol)
193 {
194         struct smc_sock *smc;
195         struct proto *prot;
196         struct sock *sk;
197
198         prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
199         sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
200         if (!sk)
201                 return NULL;
202
203         sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
204         sk->sk_state = SMC_INIT;
205         sk->sk_destruct = smc_destruct;
206         sk->sk_protocol = protocol;
207         smc = smc_sk(sk);
208         INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
209         INIT_WORK(&smc->connect_work, smc_connect_work);
210         INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
211         INIT_LIST_HEAD(&smc->accept_q);
212         spin_lock_init(&smc->accept_q_lock);
213         spin_lock_init(&smc->conn.send_lock);
214         sk->sk_prot->hash(sk);
215         sk_refcnt_debug_inc(sk);
216         mutex_init(&smc->clcsock_release_lock);
217
218         return sk;
219 }
220
221 static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
222                     int addr_len)
223 {
224         struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
225         struct sock *sk = sock->sk;
226         struct smc_sock *smc;
227         int rc;
228
229         smc = smc_sk(sk);
230
231         /* replicate tests from inet_bind(), to be safe wrt. future changes */
232         rc = -EINVAL;
233         if (addr_len < sizeof(struct sockaddr_in))
234                 goto out;
235
236         rc = -EAFNOSUPPORT;
237         if (addr->sin_family != AF_INET &&
238             addr->sin_family != AF_INET6 &&
239             addr->sin_family != AF_UNSPEC)
240                 goto out;
241         /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
242         if (addr->sin_family == AF_UNSPEC &&
243             addr->sin_addr.s_addr != htonl(INADDR_ANY))
244                 goto out;
245
246         lock_sock(sk);
247
248         /* Check if socket is already active */
249         rc = -EINVAL;
250         if (sk->sk_state != SMC_INIT)
251                 goto out_rel;
252
253         smc->clcsock->sk->sk_reuse = sk->sk_reuse;
254         rc = kernel_bind(smc->clcsock, uaddr, addr_len);
255
256 out_rel:
257         release_sock(sk);
258 out:
259         return rc;
260 }
261
262 static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
263                                    unsigned long mask)
264 {
265         /* options we don't get control via setsockopt for */
266         nsk->sk_type = osk->sk_type;
267         nsk->sk_sndbuf = osk->sk_sndbuf;
268         nsk->sk_rcvbuf = osk->sk_rcvbuf;
269         nsk->sk_sndtimeo = osk->sk_sndtimeo;
270         nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
271         nsk->sk_mark = osk->sk_mark;
272         nsk->sk_priority = osk->sk_priority;
273         nsk->sk_rcvlowat = osk->sk_rcvlowat;
274         nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
275         nsk->sk_err = osk->sk_err;
276
277         nsk->sk_flags &= ~mask;
278         nsk->sk_flags |= osk->sk_flags & mask;
279 }
280
281 #define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
282                              (1UL << SOCK_KEEPOPEN) | \
283                              (1UL << SOCK_LINGER) | \
284                              (1UL << SOCK_BROADCAST) | \
285                              (1UL << SOCK_TIMESTAMP) | \
286                              (1UL << SOCK_DBG) | \
287                              (1UL << SOCK_RCVTSTAMP) | \
288                              (1UL << SOCK_RCVTSTAMPNS) | \
289                              (1UL << SOCK_LOCALROUTE) | \
290                              (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
291                              (1UL << SOCK_RXQ_OVFL) | \
292                              (1UL << SOCK_WIFI_STATUS) | \
293                              (1UL << SOCK_NOFCS) | \
294                              (1UL << SOCK_FILTER_LOCKED))
295 /* copy only relevant settings and flags of SOL_SOCKET level from smc to
296  * clc socket (since smc is not called for these options from net/core)
297  */
298 static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
299 {
300         smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
301 }
302
303 #define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
304                              (1UL << SOCK_KEEPOPEN) | \
305                              (1UL << SOCK_LINGER) | \
306                              (1UL << SOCK_DBG))
307 /* copy only settings and flags relevant for smc from clc to smc socket */
308 static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
309 {
310         smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
311 }
312
313 /* register a new rmb, send confirm_rkey msg to register with peer */
314 static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
315                        bool conf_rkey)
316 {
317         if (!rmb_desc->wr_reg) {
318                 /* register memory region for new rmb */
319                 if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
320                         rmb_desc->regerr = 1;
321                         return -EFAULT;
322                 }
323                 rmb_desc->wr_reg = 1;
324         }
325         if (!conf_rkey)
326                 return 0;
327         /* exchange confirm_rkey msg with peer */
328         if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
329                 rmb_desc->regerr = 1;
330                 return -EFAULT;
331         }
332         return 0;
333 }
334
335 static int smc_clnt_conf_first_link(struct smc_sock *smc)
336 {
337         struct net *net = sock_net(smc->clcsock->sk);
338         struct smc_link_group *lgr = smc->conn.lgr;
339         struct smc_link *link;
340         int rest;
341         int rc;
342
343         link = &lgr->lnk[SMC_SINGLE_LINK];
344         /* receive CONFIRM LINK request from server over RoCE fabric */
345         rest = wait_for_completion_interruptible_timeout(
346                 &link->llc_confirm,
347                 SMC_LLC_WAIT_FIRST_TIME);
348         if (rest <= 0) {
349                 struct smc_clc_msg_decline dclc;
350
351                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
352                                       SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
353                 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
354         }
355
356         if (link->llc_confirm_rc)
357                 return SMC_CLC_DECL_RMBE_EC;
358
359         rc = smc_ib_modify_qp_rts(link);
360         if (rc)
361                 return SMC_CLC_DECL_ERR_RDYLNK;
362
363         smc_wr_remember_qp_attr(link);
364
365         if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
366                 return SMC_CLC_DECL_ERR_REGRMB;
367
368         /* send CONFIRM LINK response over RoCE fabric */
369         rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
370         if (rc < 0)
371                 return SMC_CLC_DECL_TIMEOUT_CL;
372
373         /* receive ADD LINK request from server over RoCE fabric */
374         rest = wait_for_completion_interruptible_timeout(&link->llc_add,
375                                                          SMC_LLC_WAIT_TIME);
376         if (rest <= 0) {
377                 struct smc_clc_msg_decline dclc;
378
379                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
380                                       SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
381                 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
382         }
383
384         /* send add link reject message, only one link supported for now */
385         rc = smc_llc_send_add_link(link,
386                                    link->smcibdev->mac[link->ibport - 1],
387                                    link->gid, SMC_LLC_RESP);
388         if (rc < 0)
389                 return SMC_CLC_DECL_TIMEOUT_AL;
390
391         smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
392
393         return 0;
394 }
395
396 static void smcr_conn_save_peer_info(struct smc_sock *smc,
397                                      struct smc_clc_msg_accept_confirm *clc)
398 {
399         int bufsize = smc_uncompress_bufsize(clc->rmbe_size);
400
401         smc->conn.peer_rmbe_idx = clc->rmbe_idx;
402         smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
403         smc->conn.peer_rmbe_size = bufsize;
404         atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
405         smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
406 }
407
408 static void smcd_conn_save_peer_info(struct smc_sock *smc,
409                                      struct smc_clc_msg_accept_confirm *clc)
410 {
411         int bufsize = smc_uncompress_bufsize(clc->dmbe_size);
412
413         smc->conn.peer_rmbe_idx = clc->dmbe_idx;
414         smc->conn.peer_token = clc->token;
415         /* msg header takes up space in the buffer */
416         smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
417         atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
418         smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
419 }
420
421 static void smc_conn_save_peer_info(struct smc_sock *smc,
422                                     struct smc_clc_msg_accept_confirm *clc)
423 {
424         if (smc->conn.lgr->is_smcd)
425                 smcd_conn_save_peer_info(smc, clc);
426         else
427                 smcr_conn_save_peer_info(smc, clc);
428 }
429
430 static void smc_link_save_peer_info(struct smc_link *link,
431                                     struct smc_clc_msg_accept_confirm *clc)
432 {
433         link->peer_qpn = ntoh24(clc->qpn);
434         memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
435         memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
436         link->peer_psn = ntoh24(clc->psn);
437         link->peer_mtu = clc->qp_mtu;
438 }
439
440 /* fall back during connect */
441 static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
442 {
443         smc->use_fallback = true;
444         smc->fallback_rsn = reason_code;
445         smc_copy_sock_settings_to_clc(smc);
446         if (smc->sk.sk_state == SMC_INIT)
447                 smc->sk.sk_state = SMC_ACTIVE;
448         return 0;
449 }
450
451 /* decline and fall back during connect */
452 static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
453 {
454         int rc;
455
456         if (reason_code < 0) { /* error, fallback is not possible */
457                 if (smc->sk.sk_state == SMC_INIT)
458                         sock_put(&smc->sk); /* passive closing */
459                 return reason_code;
460         }
461         if (reason_code != SMC_CLC_DECL_PEERDECL) {
462                 rc = smc_clc_send_decline(smc, reason_code);
463                 if (rc < 0) {
464                         if (smc->sk.sk_state == SMC_INIT)
465                                 sock_put(&smc->sk); /* passive closing */
466                         return rc;
467                 }
468         }
469         return smc_connect_fallback(smc, reason_code);
470 }
471
472 /* abort connecting */
473 static int smc_connect_abort(struct smc_sock *smc, int reason_code,
474                              int local_contact)
475 {
476         if (local_contact == SMC_FIRST_CONTACT)
477                 smc_lgr_forget(smc->conn.lgr);
478         mutex_unlock(&smc_create_lgr_pending);
479         smc_conn_free(&smc->conn);
480         return reason_code;
481 }
482
483 /* check if there is a rdma device available for this connection. */
484 /* called for connect and listen */
485 static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev,
486                           u8 *ibport, unsigned short vlan_id, u8 gid[])
487 {
488         int reason_code = 0;
489
490         /* PNET table look up: search active ib_device and port
491          * within same PNETID that also contains the ethernet device
492          * used for the internal TCP socket
493          */
494         smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport, vlan_id,
495                                     gid);
496         if (!(*ibdev))
497                 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
498
499         return reason_code;
500 }
501
502 /* check if there is an ISM device available for this connection. */
503 /* called for connect and listen */
504 static int smc_check_ism(struct smc_sock *smc, struct smcd_dev **ismdev)
505 {
506         /* Find ISM device with same PNETID as connecting interface  */
507         smc_pnet_find_ism_resource(smc->clcsock->sk, ismdev);
508         if (!(*ismdev))
509                 return SMC_CLC_DECL_CNFERR; /* configuration error */
510         return 0;
511 }
512
513 /* Check for VLAN ID and register it on ISM device just for CLC handshake */
514 static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
515                                       struct smcd_dev *ismdev,
516                                       unsigned short vlan_id)
517 {
518         if (vlan_id && smc_ism_get_vlan(ismdev, vlan_id))
519                 return SMC_CLC_DECL_CNFERR;
520         return 0;
521 }
522
523 /* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
524  * used, the VLAN ID will be registered again during the connection setup.
525  */
526 static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
527                                         struct smcd_dev *ismdev,
528                                         unsigned short vlan_id)
529 {
530         if (!is_smcd)
531                 return 0;
532         if (vlan_id && smc_ism_put_vlan(ismdev, vlan_id))
533                 return SMC_CLC_DECL_CNFERR;
534         return 0;
535 }
536
537 /* CLC handshake during connect */
538 static int smc_connect_clc(struct smc_sock *smc, int smc_type,
539                            struct smc_clc_msg_accept_confirm *aclc,
540                            struct smc_ib_device *ibdev, u8 ibport,
541                            u8 gid[], struct smcd_dev *ismdev)
542 {
543         int rc = 0;
544
545         /* do inband token exchange */
546         rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, gid, ismdev);
547         if (rc)
548                 return rc;
549         /* receive SMC Accept CLC message */
550         return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT,
551                                 CLC_WAIT_TIME);
552 }
553
554 /* setup for RDMA connection of client */
555 static int smc_connect_rdma(struct smc_sock *smc,
556                             struct smc_clc_msg_accept_confirm *aclc,
557                             struct smc_ib_device *ibdev, u8 ibport)
558 {
559         int local_contact = SMC_FIRST_CONTACT;
560         struct smc_link *link;
561         int reason_code = 0;
562
563         mutex_lock(&smc_create_lgr_pending);
564         local_contact = smc_conn_create(smc, false, aclc->hdr.flag, ibdev,
565                                         ibport, ntoh24(aclc->qpn), &aclc->lcl,
566                                         NULL, 0);
567         if (local_contact < 0) {
568                 if (local_contact == -ENOMEM)
569                         reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
570                 else if (local_contact == -ENOLINK)
571                         reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
572                 else
573                         reason_code = SMC_CLC_DECL_INTERR; /* other error */
574                 return smc_connect_abort(smc, reason_code, 0);
575         }
576         link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
577
578         smc_conn_save_peer_info(smc, aclc);
579
580         /* create send buffer and rmb */
581         if (smc_buf_create(smc, false))
582                 return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);
583
584         if (local_contact == SMC_FIRST_CONTACT)
585                 smc_link_save_peer_info(link, aclc);
586
587         if (smc_rmb_rtoken_handling(&smc->conn, aclc))
588                 return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
589                                          local_contact);
590
591         smc_close_init(smc);
592         smc_rx_init(smc);
593
594         if (local_contact == SMC_FIRST_CONTACT) {
595                 if (smc_ib_ready_link(link))
596                         return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
597                                                  local_contact);
598         } else {
599                 if (smc_reg_rmb(link, smc->conn.rmb_desc, true))
600                         return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
601                                                  local_contact);
602         }
603         smc_rmb_sync_sg_for_device(&smc->conn);
604
605         reason_code = smc_clc_send_confirm(smc);
606         if (reason_code)
607                 return smc_connect_abort(smc, reason_code, local_contact);
608
609         smc_tx_init(smc);
610
611         if (local_contact == SMC_FIRST_CONTACT) {
612                 /* QP confirmation over RoCE fabric */
613                 reason_code = smc_clnt_conf_first_link(smc);
614                 if (reason_code)
615                         return smc_connect_abort(smc, reason_code,
616                                                  local_contact);
617         }
618         mutex_unlock(&smc_create_lgr_pending);
619
620         smc_copy_sock_settings_to_clc(smc);
621         if (smc->sk.sk_state == SMC_INIT)
622                 smc->sk.sk_state = SMC_ACTIVE;
623
624         return 0;
625 }
626
627 /* setup for ISM connection of client */
628 static int smc_connect_ism(struct smc_sock *smc,
629                            struct smc_clc_msg_accept_confirm *aclc,
630                            struct smcd_dev *ismdev)
631 {
632         int local_contact = SMC_FIRST_CONTACT;
633         int rc = 0;
634
635         mutex_lock(&smc_create_lgr_pending);
636         local_contact = smc_conn_create(smc, true, aclc->hdr.flag, NULL, 0, 0,
637                                         NULL, ismdev, aclc->gid);
638         if (local_contact < 0)
639                 return smc_connect_abort(smc, SMC_CLC_DECL_MEM, 0);
640
641         /* Create send and receive buffers */
642         if (smc_buf_create(smc, true))
643                 return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);
644
645         smc_conn_save_peer_info(smc, aclc);
646         smc_close_init(smc);
647         smc_rx_init(smc);
648         smc_tx_init(smc);
649
650         rc = smc_clc_send_confirm(smc);
651         if (rc)
652                 return smc_connect_abort(smc, rc, local_contact);
653         mutex_unlock(&smc_create_lgr_pending);
654
655         smc_copy_sock_settings_to_clc(smc);
656         if (smc->sk.sk_state == SMC_INIT)
657                 smc->sk.sk_state = SMC_ACTIVE;
658
659         return 0;
660 }
661
662 /* perform steps before actually connecting */
663 static int __smc_connect(struct smc_sock *smc)
664 {
665         bool ism_supported = false, rdma_supported = false;
666         struct smc_clc_msg_accept_confirm aclc;
667         struct smc_ib_device *ibdev;
668         struct smcd_dev *ismdev;
669         u8 gid[SMC_GID_SIZE];
670         unsigned short vlan;
671         int smc_type;
672         int rc = 0;
673         u8 ibport;
674
675         sock_hold(&smc->sk); /* sock put in passive closing */
676
677         if (smc->use_fallback)
678                 return smc_connect_fallback(smc, smc->fallback_rsn);
679
680         /* if peer has not signalled SMC-capability, fall back */
681         if (!tcp_sk(smc->clcsock->sk)->syn_smc)
682                 return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
683
684         /* IPSec connections opt out of SMC-R optimizations */
685         if (using_ipsec(smc))
686                 return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);
687
688         /* check for VLAN ID */
689         if (smc_vlan_by_tcpsk(smc->clcsock, &vlan))
690                 return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR);
691
692         /* check if there is an ism device available */
693         if (!smc_check_ism(smc, &ismdev) &&
694             !smc_connect_ism_vlan_setup(smc, ismdev, vlan)) {
695                 /* ISM is supported for this connection */
696                 ism_supported = true;
697                 smc_type = SMC_TYPE_D;
698         }
699
700         /* check if there is a rdma device available */
701         if (!smc_check_rdma(smc, &ibdev, &ibport, vlan, gid)) {
702                 /* RDMA is supported for this connection */
703                 rdma_supported = true;
704                 if (ism_supported)
705                         smc_type = SMC_TYPE_B; /* both */
706                 else
707                         smc_type = SMC_TYPE_R; /* only RDMA */
708         }
709
710         /* if neither ISM nor RDMA are supported, fallback */
711         if (!rdma_supported && !ism_supported)
712                 return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV);
713
714         /* perform CLC handshake */
715         rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, gid, ismdev);
716         if (rc) {
717                 smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
718                 return smc_connect_decline_fallback(smc, rc);
719         }
720
721         /* depending on previous steps, connect using rdma or ism */
722         if (rdma_supported && aclc.hdr.path == SMC_TYPE_R)
723                 rc = smc_connect_rdma(smc, &aclc, ibdev, ibport);
724         else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
725                 rc = smc_connect_ism(smc, &aclc, ismdev);
726         else
727                 rc = SMC_CLC_DECL_MODEUNSUPP;
728         if (rc) {
729                 smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
730                 return smc_connect_decline_fallback(smc, rc);
731         }
732
733         smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
734         return 0;
735 }
736
737 static void smc_connect_work(struct work_struct *work)
738 {
739         struct smc_sock *smc = container_of(work, struct smc_sock,
740                                             connect_work);
741         int rc;
742
743         lock_sock(&smc->sk);
744         rc = kernel_connect(smc->clcsock, &smc->connect_info->addr,
745                             smc->connect_info->alen, smc->connect_info->flags);
746         if (smc->clcsock->sk->sk_err) {
747                 smc->sk.sk_err = smc->clcsock->sk->sk_err;
748                 goto out;
749         }
750         if (rc < 0) {
751                 smc->sk.sk_err = -rc;
752                 goto out;
753         }
754
755         rc = __smc_connect(smc);
756         if (rc < 0)
757                 smc->sk.sk_err = -rc;
758
759 out:
760         if (smc->sk.sk_err)
761                 smc->sk.sk_state_change(&smc->sk);
762         else
763                 smc->sk.sk_write_space(&smc->sk);
764         kfree(smc->connect_info);
765         smc->connect_info = NULL;
766         release_sock(&smc->sk);
767 }
768
769 static int smc_connect(struct socket *sock, struct sockaddr *addr,
770                        int alen, int flags)
771 {
772         struct sock *sk = sock->sk;
773         struct smc_sock *smc;
774         int rc = -EINVAL;
775
776         smc = smc_sk(sk);
777
778         /* separate smc parameter checking to be safe */
779         if (alen < sizeof(addr->sa_family))
780                 goto out_err;
781         if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
782                 goto out_err;
783
784         lock_sock(sk);
785         switch (sk->sk_state) {
786         default:
787                 goto out;
788         case SMC_ACTIVE:
789                 rc = -EISCONN;
790                 goto out;
791         case SMC_INIT:
792                 rc = 0;
793                 break;
794         }
795
796         smc_copy_sock_settings_to_clc(smc);
797         tcp_sk(smc->clcsock->sk)->syn_smc = 1;
798         if (flags & O_NONBLOCK) {
799                 if (smc->connect_info) {
800                         rc = -EALREADY;
801                         goto out;
802                 }
803                 smc->connect_info = kzalloc(alen + 2 * sizeof(int), GFP_KERNEL);
804                 if (!smc->connect_info) {
805                         rc = -ENOMEM;
806                         goto out;
807                 }
808                 smc->connect_info->alen = alen;
809                 smc->connect_info->flags = flags ^ O_NONBLOCK;
810                 memcpy(&smc->connect_info->addr, addr, alen);
811                 schedule_work(&smc->connect_work);
812                 rc = -EINPROGRESS;
813         } else {
814                 rc = kernel_connect(smc->clcsock, addr, alen, flags);
815                 if (rc)
816                         goto out;
817
818                 rc = __smc_connect(smc);
819                 if (rc < 0)
820                         goto out;
821                 else
822                         rc = 0; /* success cases including fallback */
823         }
824
825 out:
826         release_sock(sk);
827 out_err:
828         return rc;
829 }
830
831 static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
832 {
833         struct socket *new_clcsock = NULL;
834         struct sock *lsk = &lsmc->sk;
835         struct sock *new_sk;
836         int rc = -EINVAL;
837
838         release_sock(lsk);
839         new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
840         if (!new_sk) {
841                 rc = -ENOMEM;
842                 lsk->sk_err = ENOMEM;
843                 *new_smc = NULL;
844                 lock_sock(lsk);
845                 goto out;
846         }
847         *new_smc = smc_sk(new_sk);
848
849         mutex_lock(&lsmc->clcsock_release_lock);
850         if (lsmc->clcsock)
851                 rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
852         mutex_unlock(&lsmc->clcsock_release_lock);
853         lock_sock(lsk);
854         if  (rc < 0)
855                 lsk->sk_err = -rc;
856         if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
857                 if (new_clcsock)
858                         sock_release(new_clcsock);
859                 new_sk->sk_state = SMC_CLOSED;
860                 sock_set_flag(new_sk, SOCK_DEAD);
861                 new_sk->sk_prot->unhash(new_sk);
862                 sock_put(new_sk); /* final */
863                 *new_smc = NULL;
864                 goto out;
865         }
866
867         (*new_smc)->clcsock = new_clcsock;
868 out:
869         return rc;
870 }
871
872 /* add a just created sock to the accept queue of the listen sock as
873  * candidate for a following socket accept call from user space
874  */
875 static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
876 {
877         struct smc_sock *par = smc_sk(parent);
878
879         sock_hold(sk); /* sock_put in smc_accept_unlink () */
880         spin_lock(&par->accept_q_lock);
881         list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
882         spin_unlock(&par->accept_q_lock);
883         sk_acceptq_added(parent);
884 }
885
886 /* remove a socket from the accept queue of its parental listening socket */
887 static void smc_accept_unlink(struct sock *sk)
888 {
889         struct smc_sock *par = smc_sk(sk)->listen_smc;
890
891         spin_lock(&par->accept_q_lock);
892         list_del_init(&smc_sk(sk)->accept_q);
893         spin_unlock(&par->accept_q_lock);
894         sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
895         sock_put(sk); /* sock_hold in smc_accept_enqueue */
896 }
897
898 /* remove a sock from the accept queue to bind it to a new socket created
899  * for a socket accept call from user space
900  */
901 struct sock *smc_accept_dequeue(struct sock *parent,
902                                 struct socket *new_sock)
903 {
904         struct smc_sock *isk, *n;
905         struct sock *new_sk;
906
907         list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
908                 new_sk = (struct sock *)isk;
909
910                 smc_accept_unlink(new_sk);
911                 if (new_sk->sk_state == SMC_CLOSED) {
912                         if (isk->clcsock) {
913                                 sock_release(isk->clcsock);
914                                 isk->clcsock = NULL;
915                         }
916                         new_sk->sk_prot->unhash(new_sk);
917                         sock_put(new_sk); /* final */
918                         continue;
919                 }
920                 if (new_sock)
921                         sock_graft(new_sk, new_sock);
922                 return new_sk;
923         }
924         return NULL;
925 }
926
927 /* clean up for a created but never accepted sock */
928 void smc_close_non_accepted(struct sock *sk)
929 {
930         struct smc_sock *smc = smc_sk(sk);
931
932         lock_sock(sk);
933         if (!sk->sk_lingertime)
934                 /* wait for peer closing */
935                 sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
936         if (!smc->use_fallback) {
937                 smc_close_active(smc);
938                 sock_set_flag(sk, SOCK_DEAD);
939                 sk->sk_shutdown |= SHUTDOWN_MASK;
940         }
941         if (smc->clcsock) {
942                 struct socket *tcp;
943
944                 tcp = smc->clcsock;
945                 smc->clcsock = NULL;
946                 sock_release(tcp);
947         }
948         if (smc->use_fallback) {
949                 sock_put(sk); /* passive closing */
950                 sk->sk_state = SMC_CLOSED;
951         } else {
952                 if (sk->sk_state == SMC_CLOSED)
953                         smc_conn_free(&smc->conn);
954         }
955         release_sock(sk);
956         sk->sk_prot->unhash(sk);
957         sock_put(sk); /* final sock_put */
958 }
959
960 static int smc_serv_conf_first_link(struct smc_sock *smc)
961 {
962         struct net *net = sock_net(smc->clcsock->sk);
963         struct smc_link_group *lgr = smc->conn.lgr;
964         struct smc_link *link;
965         int rest;
966         int rc;
967
968         link = &lgr->lnk[SMC_SINGLE_LINK];
969
970         if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
971                 return SMC_CLC_DECL_ERR_REGRMB;
972
973         /* send CONFIRM LINK request to client over the RoCE fabric */
974         rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
975         if (rc < 0)
976                 return SMC_CLC_DECL_TIMEOUT_CL;
977
978         /* receive CONFIRM LINK response from client over the RoCE fabric */
979         rest = wait_for_completion_interruptible_timeout(
980                 &link->llc_confirm_resp,
981                 SMC_LLC_WAIT_FIRST_TIME);
982         if (rest <= 0) {
983                 struct smc_clc_msg_decline dclc;
984
985                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
986                                       SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
987                 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
988         }
989
990         if (link->llc_confirm_resp_rc)
991                 return SMC_CLC_DECL_RMBE_EC;
992
993         /* send ADD LINK request to client over the RoCE fabric */
994         rc = smc_llc_send_add_link(link,
995                                    link->smcibdev->mac[link->ibport - 1],
996                                    link->gid, SMC_LLC_REQ);
997         if (rc < 0)
998                 return SMC_CLC_DECL_TIMEOUT_AL;
999
1000         /* receive ADD LINK response from client over the RoCE fabric */
1001         rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
1002                                                          SMC_LLC_WAIT_TIME);
1003         if (rest <= 0) {
1004                 struct smc_clc_msg_decline dclc;
1005
1006                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
1007                                       SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
1008                 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
1009         }
1010
1011         smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
1012
1013         return 0;
1014 }
1015
1016 /* listen worker: finish */
1017 static void smc_listen_out(struct smc_sock *new_smc)
1018 {
1019         struct smc_sock *lsmc = new_smc->listen_smc;
1020         struct sock *newsmcsk = &new_smc->sk;
1021
1022         lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
1023         if (lsmc->sk.sk_state == SMC_LISTEN) {
1024                 smc_accept_enqueue(&lsmc->sk, newsmcsk);
1025         } else { /* no longer listening */
1026                 smc_close_non_accepted(newsmcsk);
1027         }
1028         release_sock(&lsmc->sk);
1029
1030         /* Wake up accept */
1031         lsmc->sk.sk_data_ready(&lsmc->sk);
1032         sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
1033 }
1034
1035 /* listen worker: finish in state connected */
1036 static void smc_listen_out_connected(struct smc_sock *new_smc)
1037 {
1038         struct sock *newsmcsk = &new_smc->sk;
1039
1040         sk_refcnt_debug_inc(newsmcsk);
1041         if (newsmcsk->sk_state == SMC_INIT)
1042                 newsmcsk->sk_state = SMC_ACTIVE;
1043
1044         smc_listen_out(new_smc);
1045 }
1046
1047 /* listen worker: finish in error state */
1048 static void smc_listen_out_err(struct smc_sock *new_smc)
1049 {
1050         struct sock *newsmcsk = &new_smc->sk;
1051
1052         if (newsmcsk->sk_state == SMC_INIT)
1053                 sock_put(&new_smc->sk); /* passive closing */
1054         newsmcsk->sk_state = SMC_CLOSED;
1055         smc_conn_free(&new_smc->conn);
1056
1057         smc_listen_out(new_smc);
1058 }
1059
1060 /* listen worker: decline and fall back if possible */
1061 static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
1062                                int local_contact)
1063 {
1064         /* RDMA setup failed, switch back to TCP */
1065         if (local_contact == SMC_FIRST_CONTACT)
1066                 smc_lgr_forget(new_smc->conn.lgr);
1067         if (reason_code < 0) { /* error, no fallback possible */
1068                 smc_listen_out_err(new_smc);
1069                 return;
1070         }
1071         smc_conn_free(&new_smc->conn);
1072         new_smc->use_fallback = true;
1073         new_smc->fallback_rsn = reason_code;
1074         if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
1075                 if (smc_clc_send_decline(new_smc, reason_code) < 0) {
1076                         smc_listen_out_err(new_smc);
1077                         return;
1078                 }
1079         }
1080         smc_listen_out_connected(new_smc);
1081 }
1082
1083 /* listen worker: check prefixes */
1084 static int smc_listen_rdma_check(struct smc_sock *new_smc,
1085                                  struct smc_clc_msg_proposal *pclc)
1086 {
1087         struct smc_clc_msg_proposal_prefix *pclc_prfx;
1088         struct socket *newclcsock = new_smc->clcsock;
1089
1090         pclc_prfx = smc_clc_proposal_get_prefix(pclc);
1091         if (smc_clc_prfx_match(newclcsock, pclc_prfx))
1092                 return SMC_CLC_DECL_CNFERR;
1093
1094         return 0;
1095 }
1096
1097 /* listen worker: initialize connection and buffers */
1098 static int smc_listen_rdma_init(struct smc_sock *new_smc,
1099                                 struct smc_clc_msg_proposal *pclc,
1100                                 struct smc_ib_device *ibdev, u8 ibport,
1101                                 int *local_contact)
1102 {
1103         /* allocate connection / link group */
1104         *local_contact = smc_conn_create(new_smc, false, 0, ibdev, ibport, 0,
1105                                          &pclc->lcl, NULL, 0);
1106         if (*local_contact < 0) {
1107                 if (*local_contact == -ENOMEM)
1108                         return SMC_CLC_DECL_MEM;/* insufficient memory*/
1109                 return SMC_CLC_DECL_INTERR; /* other error */
1110         }
1111
1112         /* create send buffer and rmb */
1113         if (smc_buf_create(new_smc, false))
1114                 return SMC_CLC_DECL_MEM;
1115
1116         return 0;
1117 }
1118
1119 /* listen worker: initialize connection and buffers for SMC-D */
1120 static int smc_listen_ism_init(struct smc_sock *new_smc,
1121                                struct smc_clc_msg_proposal *pclc,
1122                                struct smcd_dev *ismdev,
1123                                int *local_contact)
1124 {
1125         struct smc_clc_msg_smcd *pclc_smcd;
1126
1127         pclc_smcd = smc_get_clc_msg_smcd(pclc);
1128         *local_contact = smc_conn_create(new_smc, true, 0, NULL, 0, 0, NULL,
1129                                          ismdev, pclc_smcd->gid);
1130         if (*local_contact < 0) {
1131                 if (*local_contact == -ENOMEM)
1132                         return SMC_CLC_DECL_MEM;/* insufficient memory*/
1133                 return SMC_CLC_DECL_INTERR; /* other error */
1134         }
1135
1136         /* Check if peer can be reached via ISM device */
1137         if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid,
1138                             new_smc->conn.lgr->vlan_id,
1139                             new_smc->conn.lgr->smcd)) {
1140                 if (*local_contact == SMC_FIRST_CONTACT)
1141                         smc_lgr_forget(new_smc->conn.lgr);
1142                 smc_conn_free(&new_smc->conn);
1143                 return SMC_CLC_DECL_CNFERR;
1144         }
1145
1146         /* Create send and receive buffers */
1147         if (smc_buf_create(new_smc, true)) {
1148                 if (*local_contact == SMC_FIRST_CONTACT)
1149                         smc_lgr_forget(new_smc->conn.lgr);
1150                 smc_conn_free(&new_smc->conn);
1151                 return SMC_CLC_DECL_MEM;
1152         }
1153
1154         return 0;
1155 }
1156
1157 /* listen worker: register buffers */
1158 static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
1159 {
1160         struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
1161
1162         if (local_contact != SMC_FIRST_CONTACT) {
1163                 if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
1164                         return SMC_CLC_DECL_ERR_REGRMB;
1165         }
1166         smc_rmb_sync_sg_for_device(&new_smc->conn);
1167
1168         return 0;
1169 }
1170
1171 /* listen worker: finish RDMA setup */
1172 static int smc_listen_rdma_finish(struct smc_sock *new_smc,
1173                                   struct smc_clc_msg_accept_confirm *cclc,
1174                                   int local_contact)
1175 {
1176         struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
1177         int reason_code = 0;
1178
1179         if (local_contact == SMC_FIRST_CONTACT)
1180                 smc_link_save_peer_info(link, cclc);
1181
1182         if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
1183                 reason_code = SMC_CLC_DECL_ERR_RTOK;
1184                 goto decline;
1185         }
1186
1187         if (local_contact == SMC_FIRST_CONTACT) {
1188                 if (smc_ib_ready_link(link)) {
1189                         reason_code = SMC_CLC_DECL_ERR_RDYLNK;
1190                         goto decline;
1191                 }
1192                 /* QP confirmation over RoCE fabric */
1193                 reason_code = smc_serv_conf_first_link(new_smc);
1194                 if (reason_code)
1195                         goto decline;
1196         }
1197         return 0;
1198
1199 decline:
1200         smc_listen_decline(new_smc, reason_code, local_contact);
1201         return reason_code;
1202 }
1203
1204 /* setup for RDMA connection of server */
1205 static void smc_listen_work(struct work_struct *work)
1206 {
1207         struct smc_sock *new_smc = container_of(work, struct smc_sock,
1208                                                 smc_listen_work);
1209         struct socket *newclcsock = new_smc->clcsock;
1210         struct smc_clc_msg_accept_confirm cclc;
1211         struct smc_clc_msg_proposal *pclc;
1212         struct smc_ib_device *ibdev;
1213         bool ism_supported = false;
1214         struct smcd_dev *ismdev;
1215         u8 buf[SMC_CLC_MAX_LEN];
1216         int local_contact = 0;
1217         unsigned short vlan;
1218         int reason_code = 0;
1219         int rc = 0;
1220         u8 ibport;
1221
1222         if (new_smc->use_fallback) {
1223                 smc_listen_out_connected(new_smc);
1224                 return;
1225         }
1226
1227         /* check if peer is smc capable */
1228         if (!tcp_sk(newclcsock->sk)->syn_smc) {
1229                 new_smc->use_fallback = true;
1230                 new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC;
1231                 smc_listen_out_connected(new_smc);
1232                 return;
1233         }
1234
1235         /* do inband token exchange -
1236          * wait for and receive SMC Proposal CLC message
1237          */
1238         pclc = (struct smc_clc_msg_proposal *)&buf;
1239         reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
1240                                        SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
1241         if (reason_code) {
1242                 smc_listen_decline(new_smc, reason_code, 0);
1243                 return;
1244         }
1245
1246         /* IPSec connections opt out of SMC-R optimizations */
1247         if (using_ipsec(new_smc)) {
1248                 smc_listen_decline(new_smc, SMC_CLC_DECL_IPSEC, 0);
1249                 return;
1250         }
1251
1252         mutex_lock(&smc_create_lgr_pending);
1253         smc_close_init(new_smc);
1254         smc_rx_init(new_smc);
1255         smc_tx_init(new_smc);
1256
1257         /* check if ISM is available */
1258         if ((pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) &&
1259             !smc_check_ism(new_smc, &ismdev) &&
1260             !smc_listen_ism_init(new_smc, pclc, ismdev, &local_contact)) {
1261                 ism_supported = true;
1262         }
1263
1264         /* check if RDMA is available */
1265         if (!ism_supported &&
1266             ((pclc->hdr.path != SMC_TYPE_R && pclc->hdr.path != SMC_TYPE_B) ||
1267              smc_vlan_by_tcpsk(new_smc->clcsock, &vlan) ||
1268              smc_check_rdma(new_smc, &ibdev, &ibport, vlan, NULL) ||
1269              smc_listen_rdma_check(new_smc, pclc) ||
1270              smc_listen_rdma_init(new_smc, pclc, ibdev, ibport,
1271                                   &local_contact) ||
1272              smc_listen_rdma_reg(new_smc, local_contact))) {
1273                 /* SMC not supported, decline */
1274                 mutex_unlock(&smc_create_lgr_pending);
1275                 smc_listen_decline(new_smc, SMC_CLC_DECL_MODEUNSUPP,
1276                                    local_contact);
1277                 return;
1278         }
1279
1280         /* send SMC Accept CLC message */
1281         rc = smc_clc_send_accept(new_smc, local_contact);
1282         if (rc) {
1283                 mutex_unlock(&smc_create_lgr_pending);
1284                 smc_listen_decline(new_smc, rc, local_contact);
1285                 return;
1286         }
1287
1288         /* receive SMC Confirm CLC message */
1289         reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
1290                                        SMC_CLC_CONFIRM, CLC_WAIT_TIME);
1291         if (reason_code) {
1292                 mutex_unlock(&smc_create_lgr_pending);
1293                 smc_listen_decline(new_smc, reason_code, local_contact);
1294                 return;
1295         }
1296
1297         /* finish worker */
1298         if (!ism_supported) {
1299                 if (smc_listen_rdma_finish(new_smc, &cclc, local_contact)) {
1300                         mutex_unlock(&smc_create_lgr_pending);
1301                         return;
1302                 }
1303         }
1304         smc_conn_save_peer_info(new_smc, &cclc);
1305         mutex_unlock(&smc_create_lgr_pending);
1306         smc_listen_out_connected(new_smc);
1307 }
1308
1309 static void smc_tcp_listen_work(struct work_struct *work)
1310 {
1311         struct smc_sock *lsmc = container_of(work, struct smc_sock,
1312                                              tcp_listen_work);
1313         struct sock *lsk = &lsmc->sk;
1314         struct smc_sock *new_smc;
1315         int rc = 0;
1316
1317         lock_sock(lsk);
1318         while (lsk->sk_state == SMC_LISTEN) {
1319                 rc = smc_clcsock_accept(lsmc, &new_smc);
1320                 if (rc)
1321                         goto out;
1322                 if (!new_smc)
1323                         continue;
1324
1325                 new_smc->listen_smc = lsmc;
1326                 new_smc->use_fallback = lsmc->use_fallback;
1327                 new_smc->fallback_rsn = lsmc->fallback_rsn;
1328                 sock_hold(lsk); /* sock_put in smc_listen_work */
1329                 INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
1330                 smc_copy_sock_settings_to_smc(new_smc);
1331                 new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
1332                 new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
1333                 sock_hold(&new_smc->sk); /* sock_put in passive closing */
1334                 if (!schedule_work(&new_smc->smc_listen_work))
1335                         sock_put(&new_smc->sk);
1336         }
1337
1338 out:
1339         release_sock(lsk);
1340         sock_put(&lsmc->sk); /* sock_hold in smc_listen */
1341 }
1342
1343 static int smc_listen(struct socket *sock, int backlog)
1344 {
1345         struct sock *sk = sock->sk;
1346         struct smc_sock *smc;
1347         int rc;
1348
1349         smc = smc_sk(sk);
1350         lock_sock(sk);
1351
1352         rc = -EINVAL;
1353         if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
1354                 goto out;
1355
1356         rc = 0;
1357         if (sk->sk_state == SMC_LISTEN) {
1358                 sk->sk_max_ack_backlog = backlog;
1359                 goto out;
1360         }
1361         /* some socket options are handled in core, so we could not apply
1362          * them to the clc socket -- copy smc socket options to clc socket
1363          */
1364         smc_copy_sock_settings_to_clc(smc);
1365         if (!smc->use_fallback)
1366                 tcp_sk(smc->clcsock->sk)->syn_smc = 1;
1367
1368         rc = kernel_listen(smc->clcsock, backlog);
1369         if (rc)
1370                 goto out;
1371         sk->sk_max_ack_backlog = backlog;
1372         sk->sk_ack_backlog = 0;
1373         sk->sk_state = SMC_LISTEN;
1374         sock_hold(sk); /* sock_hold in tcp_listen_worker */
1375         if (!schedule_work(&smc->tcp_listen_work))
1376                 sock_put(sk);
1377
1378 out:
1379         release_sock(sk);
1380         return rc;
1381 }
1382
1383 static int smc_accept(struct socket *sock, struct socket *new_sock,
1384                       int flags, bool kern)
1385 {
1386         struct sock *sk = sock->sk, *nsk;
1387         DECLARE_WAITQUEUE(wait, current);
1388         struct smc_sock *lsmc;
1389         long timeo;
1390         int rc = 0;
1391
1392         lsmc = smc_sk(sk);
1393         sock_hold(sk); /* sock_put below */
1394         lock_sock(sk);
1395
1396         if (lsmc->sk.sk_state != SMC_LISTEN) {
1397                 rc = -EINVAL;
1398                 release_sock(sk);
1399                 goto out;
1400         }
1401
1402         /* Wait for an incoming connection */
1403         timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1404         add_wait_queue_exclusive(sk_sleep(sk), &wait);
1405         while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
1406                 set_current_state(TASK_INTERRUPTIBLE);
1407                 if (!timeo) {
1408                         rc = -EAGAIN;
1409                         break;
1410                 }
1411                 release_sock(sk);
1412                 timeo = schedule_timeout(timeo);
1413                 /* wakeup by sk_data_ready in smc_listen_work() */
1414                 sched_annotate_sleep();
1415                 lock_sock(sk);
1416                 if (signal_pending(current)) {
1417                         rc = sock_intr_errno(timeo);
1418                         break;
1419                 }
1420         }
1421         set_current_state(TASK_RUNNING);
1422         remove_wait_queue(sk_sleep(sk), &wait);
1423
1424         if (!rc)
1425                 rc = sock_error(nsk);
1426         release_sock(sk);
1427         if (rc)
1428                 goto out;
1429
1430         if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
1431                 /* wait till data arrives on the socket */
1432                 timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
1433                                                                 MSEC_PER_SEC);
1434                 if (smc_sk(nsk)->use_fallback) {
1435                         struct sock *clcsk = smc_sk(nsk)->clcsock->sk;
1436
1437                         lock_sock(clcsk);
1438                         if (skb_queue_empty(&clcsk->sk_receive_queue))
1439                                 sk_wait_data(clcsk, &timeo, NULL);
1440                         release_sock(clcsk);
1441                 } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
1442                         lock_sock(nsk);
1443                         smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
1444                         release_sock(nsk);
1445                 }
1446         }
1447
1448 out:
1449         sock_put(sk); /* sock_hold above */
1450         return rc;
1451 }
1452
1453 static int smc_getname(struct socket *sock, struct sockaddr *addr,
1454                        int peer)
1455 {
1456         struct smc_sock *smc;
1457
1458         if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
1459             (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1460                 return -ENOTCONN;
1461
1462         smc = smc_sk(sock->sk);
1463
1464         return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
1465 }
1466
1467 static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1468 {
1469         struct sock *sk = sock->sk;
1470         struct smc_sock *smc;
1471         int rc = -EPIPE;
1472
1473         smc = smc_sk(sk);
1474         lock_sock(sk);
1475         if ((sk->sk_state != SMC_ACTIVE) &&
1476             (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1477             (sk->sk_state != SMC_INIT))
1478                 goto out;
1479
1480         if (msg->msg_flags & MSG_FASTOPEN) {
1481                 if (sk->sk_state == SMC_INIT) {
1482                         smc->use_fallback = true;
1483                         smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
1484                 } else {
1485                         rc = -EINVAL;
1486                         goto out;
1487                 }
1488         }
1489
1490         if (smc->use_fallback)
1491                 rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
1492         else
1493                 rc = smc_tx_sendmsg(smc, msg, len);
1494 out:
1495         release_sock(sk);
1496         return rc;
1497 }
1498
1499 static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1500                        int flags)
1501 {
1502         struct sock *sk = sock->sk;
1503         struct smc_sock *smc;
1504         int rc = -ENOTCONN;
1505
1506         smc = smc_sk(sk);
1507         lock_sock(sk);
1508         if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
1509                 /* socket was connected before, no more data to read */
1510                 rc = 0;
1511                 goto out;
1512         }
1513         if ((sk->sk_state == SMC_INIT) ||
1514             (sk->sk_state == SMC_LISTEN) ||
1515             (sk->sk_state == SMC_CLOSED))
1516                 goto out;
1517
1518         if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1519                 rc = 0;
1520                 goto out;
1521         }
1522
1523         if (smc->use_fallback) {
1524                 rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
1525         } else {
1526                 msg->msg_namelen = 0;
1527                 rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
1528         }
1529
1530 out:
1531         release_sock(sk);
1532         return rc;
1533 }
1534
1535 static __poll_t smc_accept_poll(struct sock *parent)
1536 {
1537         struct smc_sock *isk = smc_sk(parent);
1538         __poll_t mask = 0;
1539
1540         spin_lock(&isk->accept_q_lock);
1541         if (!list_empty(&isk->accept_q))
1542                 mask = EPOLLIN | EPOLLRDNORM;
1543         spin_unlock(&isk->accept_q_lock);
1544
1545         return mask;
1546 }
1547
1548 static __poll_t smc_poll(struct file *file, struct socket *sock,
1549                              poll_table *wait)
1550 {
1551         struct sock *sk = sock->sk;
1552         __poll_t mask = 0;
1553         struct smc_sock *smc;
1554
1555         if (!sk)
1556                 return EPOLLNVAL;
1557
1558         smc = smc_sk(sock->sk);
1559         if (smc->use_fallback) {
1560                 /* delegate to CLC child sock */
1561                 mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
1562                 sk->sk_err = smc->clcsock->sk->sk_err;
1563                 if (sk->sk_err)
1564                         mask |= EPOLLERR;
1565         } else {
1566                 if (sk->sk_state != SMC_CLOSED)
1567                         sock_poll_wait(file, sock, wait);
1568                 if (sk->sk_err)
1569                         mask |= EPOLLERR;
1570                 if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
1571                     (sk->sk_state == SMC_CLOSED))
1572                         mask |= EPOLLHUP;
1573                 if (sk->sk_state == SMC_LISTEN) {
1574                         /* woken up by sk_data_ready in smc_listen_work() */
1575                         mask = smc_accept_poll(sk);
1576                 } else {
1577                         if (atomic_read(&smc->conn.sndbuf_space) ||
1578                             sk->sk_shutdown & SEND_SHUTDOWN) {
1579                                 mask |= EPOLLOUT | EPOLLWRNORM;
1580                         } else {
1581                                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1582                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1583                         }
1584                         if (atomic_read(&smc->conn.bytes_to_rcv))
1585                                 mask |= EPOLLIN | EPOLLRDNORM;
1586                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1587                                 mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
1588                         if (sk->sk_state == SMC_APPCLOSEWAIT1)
1589                                 mask |= EPOLLIN;
1590                         if (smc->conn.urg_state == SMC_URG_VALID)
1591                                 mask |= EPOLLPRI;
1592                 }
1593         }
1594
1595         return mask;
1596 }
1597
1598 static int smc_shutdown(struct socket *sock, int how)
1599 {
1600         struct sock *sk = sock->sk;
1601         struct smc_sock *smc;
1602         int rc = -EINVAL;
1603         int rc1 = 0;
1604
1605         smc = smc_sk(sk);
1606
1607         if ((how < SHUT_RD) || (how > SHUT_RDWR))
1608                 return rc;
1609
1610         lock_sock(sk);
1611
1612         rc = -ENOTCONN;
1613         if ((sk->sk_state != SMC_ACTIVE) &&
1614             (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
1615             (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
1616             (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1617             (sk->sk_state != SMC_APPCLOSEWAIT2) &&
1618             (sk->sk_state != SMC_APPFINCLOSEWAIT))
1619                 goto out;
1620         if (smc->use_fallback) {
1621                 rc = kernel_sock_shutdown(smc->clcsock, how);
1622                 sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
1623                 if (sk->sk_shutdown == SHUTDOWN_MASK)
1624                         sk->sk_state = SMC_CLOSED;
1625                 goto out;
1626         }
1627         switch (how) {
1628         case SHUT_RDWR:         /* shutdown in both directions */
1629                 rc = smc_close_active(smc);
1630                 break;
1631         case SHUT_WR:
1632                 rc = smc_close_shutdown_write(smc);
1633                 break;
1634         case SHUT_RD:
1635                 rc = 0;
1636                 /* nothing more to do because peer is not involved */
1637                 break;
1638         }
1639         if (smc->clcsock)
1640                 rc1 = kernel_sock_shutdown(smc->clcsock, how);
1641         /* map sock_shutdown_cmd constants to sk_shutdown value range */
1642         sk->sk_shutdown |= how + 1;
1643
1644 out:
1645         release_sock(sk);
1646         return rc ? rc : rc1;
1647 }
1648
1649 static int smc_setsockopt(struct socket *sock, int level, int optname,
1650                           char __user *optval, unsigned int optlen)
1651 {
1652         struct sock *sk = sock->sk;
1653         struct smc_sock *smc;
1654         int val, rc;
1655
1656         smc = smc_sk(sk);
1657
1658         /* generic setsockopts reaching us here always apply to the
1659          * CLC socket
1660          */
1661         rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
1662                                            optval, optlen);
1663         if (smc->clcsock->sk->sk_err) {
1664                 sk->sk_err = smc->clcsock->sk->sk_err;
1665                 sk->sk_error_report(sk);
1666         }
1667         if (rc)
1668                 return rc;
1669
1670         if (optlen < sizeof(int))
1671                 return -EINVAL;
1672         if (get_user(val, (int __user *)optval))
1673                 return -EFAULT;
1674
1675         lock_sock(sk);
1676         switch (optname) {
1677         case TCP_ULP:
1678         case TCP_FASTOPEN:
1679         case TCP_FASTOPEN_CONNECT:
1680         case TCP_FASTOPEN_KEY:
1681         case TCP_FASTOPEN_NO_COOKIE:
1682                 /* option not supported by SMC */
1683                 if (sk->sk_state == SMC_INIT) {
1684                         smc->use_fallback = true;
1685                         smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
1686                 } else {
1687                         if (!smc->use_fallback)
1688                                 rc = -EINVAL;
1689                 }
1690                 break;
1691         case TCP_NODELAY:
1692                 if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
1693                         if (val && !smc->use_fallback)
1694                                 mod_delayed_work(system_wq, &smc->conn.tx_work,
1695                                                  0);
1696                 }
1697                 break;
1698         case TCP_CORK:
1699                 if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
1700                         if (!val && !smc->use_fallback)
1701                                 mod_delayed_work(system_wq, &smc->conn.tx_work,
1702                                                  0);
1703                 }
1704                 break;
1705         case TCP_DEFER_ACCEPT:
1706                 smc->sockopt_defer_accept = val;
1707                 break;
1708         default:
1709                 break;
1710         }
1711         release_sock(sk);
1712
1713         return rc;
1714 }
1715
1716 static int smc_getsockopt(struct socket *sock, int level, int optname,
1717                           char __user *optval, int __user *optlen)
1718 {
1719         struct smc_sock *smc;
1720
1721         smc = smc_sk(sock->sk);
1722         /* socket options apply to the CLC socket */
1723         return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1724                                              optval, optlen);
1725 }
1726
1727 static int smc_ioctl(struct socket *sock, unsigned int cmd,
1728                      unsigned long arg)
1729 {
1730         union smc_host_cursor cons, urg;
1731         struct smc_connection *conn;
1732         struct smc_sock *smc;
1733         int answ;
1734
1735         smc = smc_sk(sock->sk);
1736         conn = &smc->conn;
1737         lock_sock(&smc->sk);
1738         if (smc->use_fallback) {
1739                 if (!smc->clcsock) {
1740                         release_sock(&smc->sk);
1741                         return -EBADF;
1742                 }
1743                 answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1744                 release_sock(&smc->sk);
1745                 return answ;
1746         }
1747         switch (cmd) {
1748         case SIOCINQ: /* same as FIONREAD */
1749                 if (smc->sk.sk_state == SMC_LISTEN) {
1750                         release_sock(&smc->sk);
1751                         return -EINVAL;
1752                 }
1753                 if (smc->sk.sk_state == SMC_INIT ||
1754                     smc->sk.sk_state == SMC_CLOSED)
1755                         answ = 0;
1756                 else
1757                         answ = atomic_read(&smc->conn.bytes_to_rcv);
1758                 break;
1759         case SIOCOUTQ:
1760                 /* output queue size (not send + not acked) */
1761                 if (smc->sk.sk_state == SMC_LISTEN) {
1762                         release_sock(&smc->sk);
1763                         return -EINVAL;
1764                 }
1765                 if (smc->sk.sk_state == SMC_INIT ||
1766                     smc->sk.sk_state == SMC_CLOSED)
1767                         answ = 0;
1768                 else
1769                         answ = smc->conn.sndbuf_desc->len -
1770                                         atomic_read(&smc->conn.sndbuf_space);
1771                 break;
1772         case SIOCOUTQNSD:
1773                 /* output queue size (not send only) */
1774                 if (smc->sk.sk_state == SMC_LISTEN) {
1775                         release_sock(&smc->sk);
1776                         return -EINVAL;
1777                 }
1778                 if (smc->sk.sk_state == SMC_INIT ||
1779                     smc->sk.sk_state == SMC_CLOSED)
1780                         answ = 0;
1781                 else
1782                         answ = smc_tx_prepared_sends(&smc->conn);
1783                 break;
1784         case SIOCATMARK:
1785                 if (smc->sk.sk_state == SMC_LISTEN) {
1786                         release_sock(&smc->sk);
1787                         return -EINVAL;
1788                 }
1789                 if (smc->sk.sk_state == SMC_INIT ||
1790                     smc->sk.sk_state == SMC_CLOSED) {
1791                         answ = 0;
1792                 } else {
1793                         smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
1794                         smc_curs_copy(&urg, &conn->urg_curs, conn);
1795                         answ = smc_curs_diff(conn->rmb_desc->len,
1796                                              &cons, &urg) == 1;
1797                 }
1798                 break;
1799         default:
1800                 release_sock(&smc->sk);
1801                 return -ENOIOCTLCMD;
1802         }
1803         release_sock(&smc->sk);
1804
1805         return put_user(answ, (int __user *)arg);
1806 }
1807
1808 static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1809                             int offset, size_t size, int flags)
1810 {
1811         struct sock *sk = sock->sk;
1812         struct smc_sock *smc;
1813         int rc = -EPIPE;
1814
1815         smc = smc_sk(sk);
1816         lock_sock(sk);
1817         if (sk->sk_state != SMC_ACTIVE) {
1818                 release_sock(sk);
1819                 goto out;
1820         }
1821         release_sock(sk);
1822         if (smc->use_fallback)
1823                 rc = kernel_sendpage(smc->clcsock, page, offset,
1824                                      size, flags);
1825         else
1826                 rc = sock_no_sendpage(sock, page, offset, size, flags);
1827
1828 out:
1829         return rc;
1830 }
1831
1832 /* Map the affected portions of the rmbe into an spd, note the number of bytes
1833  * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
1834  * updates till whenever a respective page has been fully processed.
1835  * Note that subsequent recv() calls have to wait till all splice() processing
1836  * completed.
1837  */
1838 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1839                                struct pipe_inode_info *pipe, size_t len,
1840                                unsigned int flags)
1841 {
1842         struct sock *sk = sock->sk;
1843         struct smc_sock *smc;
1844         int rc = -ENOTCONN;
1845
1846         smc = smc_sk(sk);
1847         lock_sock(sk);
1848         if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
1849                 /* socket was connected before, no more data to read */
1850                 rc = 0;
1851                 goto out;
1852         }
1853         if (sk->sk_state == SMC_INIT ||
1854             sk->sk_state == SMC_LISTEN ||
1855             sk->sk_state == SMC_CLOSED)
1856                 goto out;
1857
1858         if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1859                 rc = 0;
1860                 goto out;
1861         }
1862
1863         if (smc->use_fallback) {
1864                 rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1865                                                     pipe, len, flags);
1866         } else {
1867                 if (*ppos) {
1868                         rc = -ESPIPE;
1869                         goto out;
1870                 }
1871                 if (flags & SPLICE_F_NONBLOCK)
1872                         flags = MSG_DONTWAIT;
1873                 else
1874                         flags = 0;
1875                 rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
1876         }
1877 out:
1878         release_sock(sk);
1879
1880         return rc;
1881 }
1882
1883 /* must look like tcp */
1884 static const struct proto_ops smc_sock_ops = {
1885         .family         = PF_SMC,
1886         .owner          = THIS_MODULE,
1887         .release        = smc_release,
1888         .bind           = smc_bind,
1889         .connect        = smc_connect,
1890         .socketpair     = sock_no_socketpair,
1891         .accept         = smc_accept,
1892         .getname        = smc_getname,
1893         .poll           = smc_poll,
1894         .ioctl          = smc_ioctl,
1895         .listen         = smc_listen,
1896         .shutdown       = smc_shutdown,
1897         .setsockopt     = smc_setsockopt,
1898         .getsockopt     = smc_getsockopt,
1899         .sendmsg        = smc_sendmsg,
1900         .recvmsg        = smc_recvmsg,
1901         .mmap           = sock_no_mmap,
1902         .sendpage       = smc_sendpage,
1903         .splice_read    = smc_splice_read,
1904 };
1905
1906 static int smc_create(struct net *net, struct socket *sock, int protocol,
1907                       int kern)
1908 {
1909         int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
1910         struct smc_sock *smc;
1911         struct sock *sk;
1912         int rc;
1913
1914         rc = -ESOCKTNOSUPPORT;
1915         if (sock->type != SOCK_STREAM)
1916                 goto out;
1917
1918         rc = -EPROTONOSUPPORT;
1919         if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
1920                 goto out;
1921
1922         rc = -ENOBUFS;
1923         sock->ops = &smc_sock_ops;
1924         sk = smc_sock_alloc(net, sock, protocol);
1925         if (!sk)
1926                 goto out;
1927
1928         /* create internal TCP socket for CLC handshake and fallback */
1929         smc = smc_sk(sk);
1930         smc->use_fallback = false; /* assume rdma capability first */
1931         smc->fallback_rsn = 0;
1932         rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
1933                               &smc->clcsock);
1934         if (rc) {
1935                 sk_common_release(sk);
1936                 goto out;
1937         }
1938         smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
1939         smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
1940
1941 out:
1942         return rc;
1943 }
1944
1945 static const struct net_proto_family smc_sock_family_ops = {
1946         .family = PF_SMC,
1947         .owner  = THIS_MODULE,
1948         .create = smc_create,
1949 };
1950
1951 static int __init smc_init(void)
1952 {
1953         int rc;
1954
1955         rc = smc_pnet_init();
1956         if (rc)
1957                 return rc;
1958
1959         rc = smc_llc_init();
1960         if (rc) {
1961                 pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
1962                 goto out_pnet;
1963         }
1964
1965         rc = smc_cdc_init();
1966         if (rc) {
1967                 pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
1968                 goto out_pnet;
1969         }
1970
1971         rc = proto_register(&smc_proto, 1);
1972         if (rc) {
1973                 pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
1974                 goto out_pnet;
1975         }
1976
1977         rc = proto_register(&smc_proto6, 1);
1978         if (rc) {
1979                 pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
1980                 goto out_proto;
1981         }
1982
1983         rc = sock_register(&smc_sock_family_ops);
1984         if (rc) {
1985                 pr_err("%s: sock_register fails with %d\n", __func__, rc);
1986                 goto out_proto6;
1987         }
1988         INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
1989         INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);
1990
1991         rc = smc_ib_register_client();
1992         if (rc) {
1993                 pr_err("%s: ib_register fails with %d\n", __func__, rc);
1994                 goto out_sock;
1995         }
1996
1997         static_branch_enable(&tcp_have_smc);
1998         return 0;
1999
2000 out_sock:
2001         sock_unregister(PF_SMC);
2002 out_proto6:
2003         proto_unregister(&smc_proto6);
2004 out_proto:
2005         proto_unregister(&smc_proto);
2006 out_pnet:
2007         smc_pnet_exit();
2008         return rc;
2009 }
2010
2011 static void __exit smc_exit(void)
2012 {
2013         smc_core_exit();
2014         static_branch_disable(&tcp_have_smc);
2015         smc_ib_unregister_client();
2016         sock_unregister(PF_SMC);
2017         proto_unregister(&smc_proto6);
2018         proto_unregister(&smc_proto);
2019         smc_pnet_exit();
2020 }
2021
2022 module_init(smc_init);
2023 module_exit(smc_exit);
2024
2025 MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
2026 MODULE_DESCRIPTION("smc socket address family");
2027 MODULE_LICENSE("GPL");
2028 MODULE_ALIAS_NETPROTO(PF_SMC);