netlink_domain.c 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015
  1. /*-
  2. * SPDX-License-Identifier: BSD-2-Clause
  3. *
  4. * Copyright (c) 2021 Ng Peng Nam Sean
  5. * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
  6. * Copyright (c) 2023 Gleb Smirnoff <glebius@FreeBSD.org>
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. * 1. Redistributions of source code must retain the above copyright
  12. * notice, this list of conditions and the following disclaimer.
  13. * 2. Redistributions in binary form must reproduce the above copyright
  14. * notice, this list of conditions and the following disclaimer in the
  15. * documentation and/or other materials provided with the distribution.
  16. *
  17. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  18. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  21. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27. * SUCH DAMAGE.
  28. */
  29. /*
  30. * This file contains socket and protocol bindings for netlink.
  31. */
  32. #include <sys/param.h>
  33. #include <sys/kernel.h>
  34. #include <sys/malloc.h>
  35. #include <sys/lock.h>
  36. #include <sys/rmlock.h>
  37. #include <sys/domain.h>
  38. #include <sys/jail.h>
  39. #include <sys/mbuf.h>
  40. #include <sys/osd.h>
  41. #include <sys/protosw.h>
  42. #include <sys/proc.h>
  43. #include <sys/ck.h>
  44. #include <sys/socket.h>
  45. #include <sys/socketvar.h>
  46. #include <sys/sysent.h>
  47. #include <sys/syslog.h>
  48. #include <sys/priv.h> /* priv_check */
  49. #include <sys/uio.h>
  50. #include <netlink/netlink.h>
  51. #include <netlink/netlink_ctl.h>
  52. #include <netlink/netlink_var.h>
  53. #define DEBUG_MOD_NAME nl_domain
  54. #define DEBUG_MAX_LEVEL LOG_DEBUG3
  55. #include <netlink/netlink_debug.h>
  56. _DECLARE_DEBUG(LOG_INFO);
  57. _Static_assert((NLP_MAX_GROUPS % 64) == 0,
  58. "NLP_MAX_GROUPS has to be multiple of 64");
  59. _Static_assert(NLP_MAX_GROUPS >= 64,
  60. "NLP_MAX_GROUPS has to be at least 64");
  61. #define NLCTL_TRACKER struct rm_priotracker nl_tracker
  62. #define NLCTL_RLOCK(_ctl) rm_rlock(&((_ctl)->ctl_lock), &nl_tracker)
  63. #define NLCTL_RUNLOCK(_ctl) rm_runlock(&((_ctl)->ctl_lock), &nl_tracker)
  64. #define NLCTL_WLOCK(_ctl) rm_wlock(&((_ctl)->ctl_lock))
  65. #define NLCTL_WUNLOCK(_ctl) rm_wunlock(&((_ctl)->ctl_lock))
  66. static u_long nl_sendspace = NLSNDQ;
  67. SYSCTL_ULONG(_net_netlink, OID_AUTO, sendspace, CTLFLAG_RW, &nl_sendspace, 0,
  68. "Default netlink socket send space");
  69. static u_long nl_recvspace = NLSNDQ;
  70. SYSCTL_ULONG(_net_netlink, OID_AUTO, recvspace, CTLFLAG_RW, &nl_recvspace, 0,
  71. "Default netlink socket receive space");
  72. extern u_long sb_max_adj;
  73. static u_long nl_maxsockbuf = 512 * 1024 * 1024; /* 512M, XXX: init based on physmem */
  74. static int sysctl_handle_nl_maxsockbuf(SYSCTL_HANDLER_ARGS);
  75. SYSCTL_OID(_net_netlink, OID_AUTO, nl_maxsockbuf,
  76. CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, &nl_maxsockbuf, 0,
  77. sysctl_handle_nl_maxsockbuf, "LU",
  78. "Maximum Netlink socket buffer size");
  79. static unsigned int osd_slot_id = 0;
/*
 * Reserve a per-thread OSD (Object Specific Data) slot.  The slot is used
 * to associate a netlink pcb with the kernel thread that is currently
 * processing its messages.  Called once at module initialization.
 */
void
nl_osd_register(void)
{
	osd_slot_id = osd_register(OSD_THREAD, NULL, NULL);
}
/*
 * Release the per-thread OSD slot taken in nl_osd_register().
 * Called at module teardown.
 */
void
nl_osd_unregister(void)
{
	osd_deregister(OSD_THREAD, osd_slot_id);
}
/*
 * Returns the netlink pcb currently attached to @td via the OSD slot,
 * or NULL if none was set with nl_set_thread_nlp().
 */
struct nlpcb *
_nl_get_thread_nlp(struct thread *td)
{
	return (osd_get(OSD_THREAD, &td->td_osd, osd_slot_id));
}
/*
 * Attaches @nlp to thread @td through the reserved OSD slot.
 * osd_set() returns non-zero when the thread's OSD array is too small;
 * in that case reserve new space and retry with the reserved variant.
 */
void
nl_set_thread_nlp(struct thread *td, struct nlpcb *nlp)
{
	NLP_LOG(LOG_DEBUG2, nlp, "Set thread %p nlp to %p (slot %u)", td, nlp, osd_slot_id);
	if (osd_set(OSD_THREAD, &td->td_osd, osd_slot_id, nlp) == 0)
		return;
	/* Failed, need to realloc */
	void **rsv = osd_reserve(osd_slot_id);
	osd_set_reserved(OSD_THREAD, &td->td_osd, osd_slot_id, rsv, nlp);
}
  105. /*
  106. * Looks up a nlpcb struct based on the @portid. Need to claim nlsock_mtx.
  107. * Returns nlpcb pointer if present else NULL
  108. */
  109. static struct nlpcb *
  110. nl_port_lookup(uint32_t port_id)
  111. {
  112. struct nlpcb *nlp;
  113. CK_LIST_FOREACH(nlp, &V_nl_ctl->ctl_port_head, nl_port_next) {
  114. if (nlp->nl_port == port_id)
  115. return (nlp);
  116. }
  117. return (NULL);
  118. }
/*
 * Subscribes @nlp to multicast group @group_id (1-based; stored 0-based
 * in the nl_groups bitmask).  Caller holds the ctl write lock.
 * Sockets in vnet-constrained jails are silently not subscribed.
 */
static void
nl_add_group_locked(struct nlpcb *nlp, unsigned int group_id)
{
	MPASS(group_id <= NLP_MAX_GROUPS);
	--group_id;

	/* TODO: add family handler callback */
	if (!nlp_unconstrained_vnet(nlp))
		return;

	nlp->nl_groups[group_id / 64] |= (uint64_t)1 << (group_id % 64);
}
/*
 * Unsubscribes @nlp from multicast group @group_id (1-based).
 * Caller holds the ctl write lock.
 */
static void
nl_del_group_locked(struct nlpcb *nlp, unsigned int group_id)
{
	MPASS(group_id <= NLP_MAX_GROUPS);
	--group_id;

	nlp->nl_groups[group_id / 64] &= ~((uint64_t)1 << (group_id % 64));
}
  136. static bool
  137. nl_isset_group_locked(struct nlpcb *nlp, unsigned int group_id)
  138. {
  139. MPASS(group_id <= NLP_MAX_GROUPS);
  140. --group_id;
  141. return (nlp->nl_groups[group_id / 64] & ((uint64_t)1 << (group_id % 64)));
  142. }
  143. static uint32_t
  144. nl_get_groups_compat(struct nlpcb *nlp)
  145. {
  146. uint32_t groups_mask = 0;
  147. for (int i = 0; i < 32; i++) {
  148. if (nl_isset_group_locked(nlp, i + 1))
  149. groups_mask |= (1 << i);
  150. }
  151. return (groups_mask);
  152. }
/*
 * Returns a duplicate of @nb, or NULL if allocation failed.  The memcpy
 * intentionally covers the nl_buf header as well as the data, so all
 * bookkeeping fields (datalen, offset, buflen) are carried over; the copy
 * was allocated with the same buflen, so overwriting its header is safe.
 */
static struct nl_buf *
nl_buf_copy(struct nl_buf *nb)
{
	struct nl_buf *copy;

	copy = nl_buf_alloc(nb->buflen, M_NOWAIT);
	if (__predict_false(copy == NULL))
		return (NULL);
	memcpy(copy, nb, sizeof(*nb) + nb->buflen);

	return (copy);
}
/*
 * Broadcasts the writer's buffer to all sockets subscribed to the writer's
 * multicast group/protocol pair.  Consumes nw->buf in all cases.  Returns
 * false only when no netlink state exists in the current VNET.
 */
bool
nl_send_group(struct nl_writer *nw)
{
	struct nl_buf *nb = nw->buf;
	struct nlpcb *nlp_last = NULL;
	struct nlpcb *nlp;
	NLCTL_TRACKER;

	IF_DEBUG_LEVEL(LOG_DEBUG2) {
		struct nlmsghdr *hdr = (struct nlmsghdr *)nb->data;
		NL_LOG(LOG_DEBUG2, "MCAST len %u msg type %d len %u to group %d/%d",
		    nb->datalen, hdr->nlmsg_type, hdr->nlmsg_len,
		    nw->group.proto, nw->group.id);
	}

	nw->buf = NULL;

	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
	if (__predict_false(ctl == NULL)) {
		/*
		 * Can be the case when notification is sent within VNET
		 * which doesn't have any netlink sockets.
		 */
		nl_buf_free(nb);
		return (false);
	}

	NLCTL_RLOCK(ctl);

	/*
	 * Deliver a fresh copy to every subscriber found so far, keeping
	 * the original buffer for the last subscriber.  This avoids any
	 * allocation in the common single-listener case.
	 */
	CK_LIST_FOREACH(nlp, &ctl->ctl_pcb_head, nl_next) {
		if (nl_isset_group_locked(nlp, nw->group.id) &&
		    nlp->nl_proto == nw->group.proto) {
			if (nlp_last != NULL) {
				struct nl_buf *copy;

				copy = nl_buf_copy(nb);
				if (copy != NULL) {
					nw->buf = copy;
					(void)nl_send(nw, nlp_last);
				} else {
					/*
					 * Copy failed: this listener loses the
					 * message, but still gets a wakeup.
					 */
					NLP_LOCK(nlp_last);
					if (nlp_last->nl_socket != NULL)
						sorwakeup(nlp_last->nl_socket);
					NLP_UNLOCK(nlp_last);
				}
			}
			nlp_last = nlp;
		}
	}
	if (nlp_last != NULL) {
		/* Hand the original buffer to the final subscriber. */
		nw->buf = nb;
		(void)nl_send(nw, nlp_last);
	} else
		nl_buf_free(nb);

	NLCTL_RUNLOCK(ctl);

	return (true);
}
/*
 * Conservatively reports whether there might be listeners for the given
 * family/groups.  NOTE(review): @netlink_family and @groups_mask are
 * ignored — any VNET with netlink state answers "yes", so callers may
 * build notifications nobody receives.  Presumably a placeholder for a
 * real per-group check; confirm before relying on it for filtering.
 */
bool
nl_has_listeners(int netlink_family, uint32_t groups_mask)
{
	return (V_nl_ctl != NULL);
}
  222. static uint32_t
  223. nl_find_port(void)
  224. {
  225. /*
  226. * app can open multiple netlink sockets.
  227. * Start with current pid, if already taken,
  228. * try random numbers in 65k..256k+65k space,
  229. * avoiding clash with pids.
  230. */
  231. if (nl_port_lookup(curproc->p_pid) == NULL)
  232. return (curproc->p_pid);
  233. for (int i = 0; i < 16; i++) {
  234. uint32_t nl_port = (arc4random() % 65536) + 65536 * 4;
  235. if (nl_port_lookup(nl_port) == 0)
  236. return (nl_port);
  237. NL_LOG(LOG_DEBUG3, "tried %u\n", nl_port);
  238. }
  239. return (curproc->p_pid);
  240. }
/*
 * Binds @nlp to the port id and legacy group mask in @snl.  Caller holds
 * the ctl write lock and the NLP lock.  Re-binding an already bound
 * socket is allowed only with the same port id; the group membership of
 * the first 32 groups is (re)set from snl->nl_groups either way.
 */
static int
nl_bind_locked(struct nlpcb *nlp, struct sockaddr_nl *snl)
{
	if (nlp->nl_bound) {
		/* Port id cannot change on re-bind. */
		if (nlp->nl_port != snl->nl_pid) {
			NL_LOG(LOG_DEBUG,
			    "bind() failed: program pid %d "
			    "is different from provided pid %d",
			    nlp->nl_port, snl->nl_pid);
			return (EINVAL); // XXX: better error
		}
	} else {
		/* pid 0 asks the kernel to choose a free port. */
		if (snl->nl_pid == 0)
			snl->nl_pid = nl_find_port();

		if (nl_port_lookup(snl->nl_pid) != NULL)
			return (EADDRINUSE);
		nlp->nl_port = snl->nl_pid;
		nlp->nl_bound = true;
		CK_LIST_INSERT_HEAD(&V_nl_ctl->ctl_port_head, nlp, nl_port_next);
	}
	/* Sync groups 1..32 with the legacy 32-bit mask. */
	for (int i = 0; i < 32; i++) {
		if (snl->nl_groups & ((uint32_t)1 << i))
			nl_add_group_locked(nlp, i + 1);
		else
			nl_del_group_locked(nlp, i + 1);
	}

	return (0);
}
/*
 * pr_attach for netlink sockets: allocates and initializes the nlpcb,
 * its per-socket taskqueue, and links it into the per-VNET pcb list.
 * Marks the socket connected, since netlink sockets are connectionless
 * to the kernel.
 */
static int
nl_pru_attach(struct socket *so, int proto, struct thread *td)
{
	struct nlpcb *nlp;
	int error;

	/* Refuse new sockets while the module is being unloaded. */
	if (__predict_false(netlink_unloading != 0))
		return (EAFNOSUPPORT);

	error = nl_verify_proto(proto);
	if (error != 0)
		return (error);

	bool is_linux = SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX;
	NL_LOG(LOG_DEBUG2, "socket %p, %sPID %d: attaching socket to %s",
	    so, is_linux ? "(linux) " : "", curproc->p_pid,
	    nl_get_proto_name(proto));

	/* Create per-VNET state on first socket init */
	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
	if (ctl == NULL)
		ctl = vnet_nl_ctl_init();
	KASSERT(V_nl_ctl != NULL, ("nl_attach: vnet_sock_init() failed"));

	MPASS(sotonlpcb(so) == NULL);

	nlp = malloc(sizeof(struct nlpcb), M_PCB, M_WAITOK | M_ZERO);
	error = soreserve(so, nl_sendspace, nl_recvspace);
	if (error != 0) {
		free(nlp, M_PCB);
		return (error);
	}
	/* Netlink sockets keep data in nl_buf queues, not mbuf chains. */
	TAILQ_INIT(&so->so_rcv.nl_queue);
	TAILQ_INIT(&so->so_snd.nl_queue);
	so->so_pcb = nlp;
	nlp->nl_socket = so;
	/* Copy so_cred to avoid having socket_var.h in every header */
	nlp->nl_cred = so->so_cred;
	nlp->nl_proto = proto;
	nlp->nl_process_id = curproc->p_pid;
	nlp->nl_linux = is_linux;
	nlp->nl_unconstrained_vnet = !jailed_without_vnet(so->so_cred);
	nlp->nl_need_thread_setup = true;
	NLP_LOCK_INIT(nlp);
	refcount_init(&nlp->nl_refcount, 1);

	/* Dedicated taskqueue thread processes this socket's requests. */
	nlp->nl_taskqueue = taskqueue_create("netlink_socket", M_WAITOK,
	    taskqueue_thread_enqueue, &nlp->nl_taskqueue);
	TASK_INIT(&nlp->nl_task, 0, nl_taskqueue_handler, nlp);
	taskqueue_start_threads(&nlp->nl_taskqueue, 1, PWAIT,
	    "netlink_socket (PID %u)", nlp->nl_process_id);

	NLCTL_WLOCK(ctl);
	/* XXX: check ctl is still alive */
	CK_LIST_INSERT_HEAD(&ctl->ctl_pcb_head, nlp, nl_next);
	NLCTL_WUNLOCK(ctl);

	soisconnected(so);

	return (0);
}
/*
 * pr_bind for netlink sockets: validates the sockaddr length and
 * delegates to nl_bind_locked() under the ctl write lock and NLP lock.
 */
static int
nl_pru_bind(struct socket *so, struct sockaddr *sa, struct thread *td)
{
	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
	struct nlpcb *nlp = sotonlpcb(so);
	struct sockaddr_nl *snl = (struct sockaddr_nl *)sa;
	int error;

	NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
	if (snl->nl_len != sizeof(*snl)) {
		NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring bind()", so);
		return (EINVAL);
	}

	NLCTL_WLOCK(ctl);
	NLP_LOCK(nlp);
	error = nl_bind_locked(nlp, snl);
	NLP_UNLOCK(nlp);
	NLCTL_WUNLOCK(ctl);
	NL_LOG(LOG_DEBUG2, "socket %p, bind() to %u, groups %u, error %d", so,
	    snl->nl_pid, snl->nl_groups, error);

	return (error);
}
/*
 * Attempts to bind @nlp to @port_id, preserving its current group
 * subscriptions (re-expressed through the legacy 32-bit mask).
 * Returns 0 on success or EADDRINUSE/EINVAL from nl_bind_locked().
 */
static int
nl_assign_port(struct nlpcb *nlp, uint32_t port_id)
{
	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
	struct sockaddr_nl snl = {
		.nl_pid = port_id,
	};
	int error;

	NLCTL_WLOCK(ctl);
	NLP_LOCK(nlp);
	snl.nl_groups = nl_get_groups_compat(nlp);
	error = nl_bind_locked(nlp, &snl);
	NLP_UNLOCK(nlp);
	NLCTL_WUNLOCK(ctl);

	NL_LOG(LOG_DEBUG3, "socket %p, port assign: %d, error: %d", nlp->nl_socket, port_id, error);
	return (error);
}
  358. /*
  359. * nl_autobind_port binds a unused portid to @nlp
  360. * @nlp: pcb data for the netlink socket
  361. * @candidate_id: first id to consider
  362. */
  363. static int
  364. nl_autobind_port(struct nlpcb *nlp, uint32_t candidate_id)
  365. {
  366. struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
  367. uint32_t port_id = candidate_id;
  368. NLCTL_TRACKER;
  369. bool exist;
  370. int error = EADDRINUSE;
  371. for (int i = 0; i < 10; i++) {
  372. NL_LOG(LOG_DEBUG3, "socket %p, trying to assign port %d", nlp->nl_socket, port_id);
  373. NLCTL_RLOCK(ctl);
  374. exist = nl_port_lookup(port_id) != 0;
  375. NLCTL_RUNLOCK(ctl);
  376. if (!exist) {
  377. error = nl_assign_port(nlp, port_id);
  378. if (error != EADDRINUSE)
  379. break;
  380. }
  381. port_id++;
  382. }
  383. NL_LOG(LOG_DEBUG3, "socket %p, autobind to %d, error: %d", nlp->nl_socket, port_id, error);
  384. return (error);
  385. }
  386. static int
  387. nl_pru_connect(struct socket *so, struct sockaddr *sa, struct thread *td)
  388. {
  389. struct sockaddr_nl *snl = (struct sockaddr_nl *)sa;
  390. struct nlpcb *nlp;
  391. NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
  392. if (snl->nl_len != sizeof(*snl)) {
  393. NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring bind()", so);
  394. return (EINVAL);
  395. }
  396. nlp = sotonlpcb(so);
  397. if (!nlp->nl_bound) {
  398. int error = nl_autobind_port(nlp, td->td_proc->p_pid);
  399. if (error != 0) {
  400. NL_LOG(LOG_DEBUG, "socket %p, nl_autobind() failed: %d", so, error);
  401. return (error);
  402. }
  403. }
  404. /* XXX: Handle socket flags & multicast */
  405. soisconnected(so);
  406. NL_LOG(LOG_DEBUG2, "socket %p, connect to %u", so, snl->nl_pid);
  407. return (0);
  408. }
/*
 * Deferred destructor for a nlpcb, run after a network epoch has passed
 * so that lockless CK_LIST readers can no longer reference it.
 */
static void
destroy_nlpcb_epoch(epoch_context_t ctx)
{
	struct nlpcb *nlp;

	nlp = __containerof(ctx, struct nlpcb, nl_epoch_ctx);

	NLP_LOCK_DESTROY(nlp);
	free(nlp, M_PCB);
}
/*
 * pr_close for netlink sockets: drains and frees the per-socket
 * taskqueue, unlinks the pcb from the port and pcb lists, frees all
 * queued nl_bufs on both socket buffers, and schedules the pcb itself
 * for epoch-deferred destruction.
 */
static void
nl_close(struct socket *so)
{
	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
	MPASS(sotonlpcb(so) != NULL);
	struct nlpcb *nlp;
	struct nl_buf *nb;

	NL_LOG(LOG_DEBUG2, "detaching socket %p, PID %d", so, curproc->p_pid);
	nlp = sotonlpcb(so);

	/* Mark as inactive so no new work can be enqueued */
	NLP_LOCK(nlp);
	bool was_bound = nlp->nl_bound;
	NLP_UNLOCK(nlp);

	/* Wait till all scheduled work has been completed  */
	taskqueue_drain_all(nlp->nl_taskqueue);
	taskqueue_free(nlp->nl_taskqueue);

	NLCTL_WLOCK(ctl);
	NLP_LOCK(nlp);
	if (was_bound) {
		/* Only bound sockets are on the port list. */
		CK_LIST_REMOVE(nlp, nl_port_next);
		NL_LOG(LOG_DEBUG3, "socket %p, unlinking bound pid %u", so, nlp->nl_port);
	}
	CK_LIST_REMOVE(nlp, nl_next);
	nlp->nl_socket = NULL;
	NLP_UNLOCK(nlp);
	NLCTL_WUNLOCK(ctl);

	so->so_pcb = NULL;

	/* Release any buffers still queued in either direction. */
	while ((nb = TAILQ_FIRST(&so->so_snd.nl_queue)) != NULL) {
		TAILQ_REMOVE(&so->so_snd.nl_queue, nb, tailq);
		nl_buf_free(nb);
	}
	while ((nb = TAILQ_FIRST(&so->so_rcv.nl_queue)) != NULL) {
		TAILQ_REMOVE(&so->so_rcv.nl_queue, nb, tailq);
		nl_buf_free(nb);
	}

	NL_LOG(LOG_DEBUG3, "socket %p, detached", so);

	/* XXX: is delayed free needed? */
	NET_EPOCH_CALL(destroy_nlpcb_epoch, &nlp->nl_epoch_ctx);
}
/*
 * pr_disconnect for netlink sockets.  Netlink sockets have no peer to
 * disconnect from, so always fail with ENOTCONN.
 */
static int
nl_pru_disconnect(struct socket *so)
{
	NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
	MPASS(sotonlpcb(so) != NULL);
	return (ENOTCONN);
}
  463. static int
  464. nl_sockaddr(struct socket *so, struct sockaddr *sa)
  465. {
  466. *(struct sockaddr_nl *)sa = (struct sockaddr_nl ){
  467. /* TODO: set other fields */
  468. .nl_len = sizeof(struct sockaddr_nl),
  469. .nl_family = AF_NETLINK,
  470. .nl_pid = sotonlpcb(so)->nl_port,
  471. };
  472. return (0);
  473. }
  474. static int
  475. nl_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
  476. struct mbuf *m, struct mbuf *control, int flags, struct thread *td)
  477. {
  478. struct nlpcb *nlp = sotonlpcb(so);
  479. struct sockbuf *sb = &so->so_snd;
  480. struct nl_buf *nb;
  481. u_int len;
  482. int error;
  483. MPASS(m == NULL && uio != NULL);
  484. NL_LOG(LOG_DEBUG2, "sending message to kernel");
  485. if (__predict_false(control != NULL)) {
  486. m_freem(control);
  487. return (EINVAL);
  488. }
  489. if (__predict_false(flags & MSG_OOB)) /* XXXGL: or just ignore? */
  490. return (EOPNOTSUPP);
  491. if (__predict_false(uio->uio_resid < sizeof(struct nlmsghdr)))
  492. return (ENOBUFS); /* XXXGL: any better error? */
  493. NL_LOG(LOG_DEBUG3, "sending message to kernel async processing");
  494. error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
  495. if (error)
  496. return (error);
  497. len = roundup2(uio->uio_resid, 8) + SCRATCH_BUFFER_SIZE;
  498. if (nlp->nl_linux)
  499. len += roundup2(uio->uio_resid, 8);
  500. nb = nl_buf_alloc(len, M_WAITOK);
  501. nb->datalen = uio->uio_resid;
  502. error = uiomove(&nb->data[0], uio->uio_resid, uio);
  503. if (__predict_false(error))
  504. goto out;
  505. SOCK_SENDBUF_LOCK(so);
  506. restart:
  507. if (sb->sb_hiwat - sb->sb_ccc >= nb->datalen) {
  508. TAILQ_INSERT_TAIL(&sb->nl_queue, nb, tailq);
  509. sb->sb_acc += nb->datalen;
  510. sb->sb_ccc += nb->datalen;
  511. nb = NULL;
  512. } else if ((so->so_state & SS_NBIO) ||
  513. (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
  514. SOCK_SENDBUF_UNLOCK(so);
  515. error = EWOULDBLOCK;
  516. goto out;
  517. } else {
  518. if ((error = sbwait(so, SO_SND)) != 0) {
  519. SOCK_SENDBUF_UNLOCK(so);
  520. goto out;
  521. } else
  522. goto restart;
  523. }
  524. SOCK_SENDBUF_UNLOCK(so);
  525. if (nb == NULL) {
  526. NL_LOG(LOG_DEBUG3, "enqueue %u bytes", nb->datalen);
  527. NLP_LOCK(nlp);
  528. nl_schedule_taskqueue(nlp);
  529. NLP_UNLOCK(nlp);
  530. }
  531. out:
  532. SOCK_IO_SEND_UNLOCK(so);
  533. if (nb != NULL)
  534. nl_buf_free(nb);
  535. return (error);
  536. }
/*
 * Create control data for recvmsg(2) on Netlink socket: a NETLINK_MSG_INFO
 * cmsg carrying two nlattrs — the creating process id and the bound port
 * id of the socket.  May sleep (M_WAITOK).
 */
static struct mbuf *
nl_createcontrol(struct nlpcb *nlp)
{
	struct {
		struct nlattr nla;
		uint32_t val;
	} data[] = {
		{
			.nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t),
			.nla.nla_type = NLMSGINFO_ATTR_PROCESS_ID,
			.val = nlp->nl_process_id,
		},
		{
			.nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t),
			.nla.nla_type = NLMSGINFO_ATTR_PORT_ID,
			.val = nlp->nl_port,
		},
	};

	return (sbcreatecontrol(data, sizeof(data), NETLINK_MSG_INFO,
	    SOL_NETLINK, M_WAITOK));
}
/*
 * pr_soreceive for netlink sockets.  Copies whole netlink messages from
 * the receive queue of nl_bufs into the user buffer, never splitting a
 * message across two reads except by truncation (see the big comment
 * below).  Supports MSG_PEEK, MSG_TRUNC and non-blocking operation.
 */
static int
nl_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp, struct mbuf **controlp, int *flagsp)
{
	static const struct sockaddr_nl nl_empty_src = {
		.nl_len = sizeof(struct sockaddr_nl),
		.nl_family = PF_NETLINK,
		.nl_pid = 0 /* comes from the kernel */
	};
	struct sockbuf *sb = &so->so_rcv;
	struct nlpcb *nlp = sotonlpcb(so);
	struct nl_buf *first, *last, *nb, *next;
	struct nlmsghdr *hdr;
	int flags, error;
	u_int len, overflow, partoff, partlen, msgrcv, datalen;
	bool nonblock, trunc, peek;

	MPASS(mp == NULL && uio != NULL);

	NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);

	/* Source address of kernel-originated messages is always pid 0. */
	if (psa != NULL)
		*psa = sodupsockaddr((const struct sockaddr *)&nl_empty_src,
		    M_WAITOK);

	if (controlp != NULL && (nlp->nl_flags & NLF_MSG_INFO))
		*controlp = nl_createcontrol(nlp);

	flags = flagsp != NULL ? *flagsp & ~MSG_TRUNC : 0;
	trunc = flagsp != NULL ? *flagsp & MSG_TRUNC : false;
	nonblock = (so->so_state & SS_NBIO) ||
	    (flags & (MSG_DONTWAIT | MSG_NBIO));
	peek = flags & MSG_PEEK;

	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
	if (__predict_false(error))
		return (error);

	len = 0;
	overflow = 0;
	msgrcv = 0;
	datalen = 0;

	SOCK_RECVBUF_LOCK(so);
	/* Block (unless nonblock) until at least one nl_buf is queued. */
	while ((first = TAILQ_FIRST(&sb->nl_queue)) == NULL) {
		if (nonblock) {
			SOCK_RECVBUF_UNLOCK(so);
			SOCK_IO_RECV_UNLOCK(so);
			return (EWOULDBLOCK);
		}
		error = sbwait(so, SO_RCV);
		if (error) {
			SOCK_RECVBUF_UNLOCK(so);
			SOCK_IO_RECV_UNLOCK(so);
			return (error);
		}
	}

	/*
	 * Netlink socket buffer consists of a queue of nl_bufs, but for the
	 * userland there should be no boundaries.  However, there are Netlink
	 * messages, that shouldn't be split.  Internal invariant is that a
	 * message never spans two nl_bufs.
	 * If a large userland buffer is provided, we would traverse the queue
	 * until either queue end is reached or the buffer is fulfilled.  If
	 * an application provides a buffer that isn't able to fit a single
	 * message, we would truncate it and lose its tail.  This is the only
	 * condition where we would lose data.  If buffer is able to fit at
	 * least one message, we would return it and won't truncate the next.
	 *
	 * We use same code for normal and MSG_PEEK case.  At first queue pass
	 * we scan nl_bufs and count length.  In case we can read entire buffer
	 * at one write everything is trivial.  In case we can not, we save
	 * pointer to the last (or partial) nl_buf and in the !peek case we
	 * split the queue into two pieces.  We can safely drop the queue lock,
	 * as kernel would only append nl_bufs to the end of the queue, and
	 * we are the exclusive owner of queue beginning due to sleepable lock.
	 * At the second pass we copy data out and in !peek case free nl_bufs.
	 */
	TAILQ_FOREACH(nb, &sb->nl_queue, tailq) {
		u_int offset;

		MPASS(nb->offset < nb->datalen);
		offset = nb->offset;
		while (offset < nb->datalen) {
			hdr = (struct nlmsghdr *)&nb->data[offset];
			MPASS(nb->offset + hdr->nlmsg_len <= nb->datalen);
			if (uio->uio_resid < len + hdr->nlmsg_len) {
				/* Next message doesn't fit the user buffer. */
				overflow = len + hdr->nlmsg_len -
				    uio->uio_resid;
				partoff = nb->offset;
				if (offset > partoff) {
					partlen = offset - partoff;
					if (!peek) {
						nb->offset = offset;
						datalen += partlen;
					}
				} else if (len == 0 && uio->uio_resid > 0) {
					/* First message alone doesn't fit:
					 * truncate it rather than return 0. */
					flags |= MSG_TRUNC;
					partlen = uio->uio_resid;
					if (peek)
						goto nospace;
					datalen += hdr->nlmsg_len;
					if (nb->offset + hdr->nlmsg_len ==
					    nb->datalen) {
						/*
						 * Avoid leaving empty nb.
						 * Process last nb normally.
						 * Trust uiomove() to care
						 * about negative uio_resid.
						 */
						nb = TAILQ_NEXT(nb, tailq);
						overflow = 0;
						partlen = 0;
					} else
						nb->offset += hdr->nlmsg_len;
					msgrcv++;
				} else
					partlen = 0;
				goto nospace;
			}
			len += hdr->nlmsg_len;
			offset += hdr->nlmsg_len;
			MPASS(offset <= nb->buflen);
			msgrcv++;
		}
		MPASS(offset == nb->datalen);
		datalen += nb->datalen - nb->offset;
	}
nospace:
	last = nb;
	if (!peek) {
		if (last == NULL)
			TAILQ_INIT(&sb->nl_queue);
		else {
			/* XXXGL: create TAILQ_SPLIT */
			TAILQ_FIRST(&sb->nl_queue) = last;
			last->tailq.tqe_prev = &TAILQ_FIRST(&sb->nl_queue);
		}
		MPASS(sb->sb_acc >= datalen);
		sb->sb_acc -= datalen;
		sb->sb_ccc -= datalen;
	}
	SOCK_RECVBUF_UNLOCK(so);

	/* Second pass: copy out [first, last) and free in the !peek case. */
	for (nb = first; nb != last; nb = next) {
		next = TAILQ_NEXT(nb, tailq);
		if (__predict_true(error == 0))
			error = uiomove(&nb->data[nb->offset],
			    (int)(nb->datalen - nb->offset), uio);
		if (!peek)
			nl_buf_free(nb);
	}
	/* Copy the partial (or truncated) tail, if any. */
	if (last != NULL && partlen > 0 && __predict_true(error == 0))
		error = uiomove(&nb->data[partoff], (int)partlen, uio);

	if (trunc && overflow > 0) {
		/* MSG_TRUNC: report full message length, not copied length. */
		uio->uio_resid -= overflow;
		MPASS(uio->uio_resid < 0);
	} else
		MPASS(uio->uio_resid >= 0);

	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv += msgrcv;

	if (flagsp != NULL)
		*flagsp |= flags;

	SOCK_IO_RECV_UNLOCK(so);

	nl_on_transmit(sotonlpcb(so));

	return (error);
}
  716. static int
  717. nl_getoptflag(int sopt_name)
  718. {
  719. switch (sopt_name) {
  720. case NETLINK_CAP_ACK:
  721. return (NLF_CAP_ACK);
  722. case NETLINK_EXT_ACK:
  723. return (NLF_EXT_ACK);
  724. case NETLINK_GET_STRICT_CHK:
  725. return (NLF_STRICT);
  726. case NETLINK_MSG_INFO:
  727. return (NLF_MSG_INFO);
  728. }
  729. return (0);
  730. }
/*
 * pr_ctloutput for netlink sockets: handles SOL_NETLINK options.
 * SET: group membership add/drop and the boolean NLF_* flags.
 * GET: membership list (legacy 32-bit mask) and the same flags.
 */
static int
nl_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
	struct nlpcb *nlp = sotonlpcb(so);
	uint32_t flag;
	int optval, error = 0;
	NLCTL_TRACKER;

	NL_LOG(LOG_DEBUG2, "%ssockopt(%p, %d)", (sopt->sopt_dir) ? "set" : "get",
	    so, sopt->sopt_name);

	switch (sopt->sopt_dir) {
	case SOPT_SET:
		switch (sopt->sopt_name) {
		case NETLINK_ADD_MEMBERSHIP:
		case NETLINK_DROP_MEMBERSHIP:
			error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
			if (error != 0)
				break;
			/* Group ids are 1-based and bounded by NLP_MAX_GROUPS. */
			if (optval <= 0 || optval >= NLP_MAX_GROUPS) {
				error = ERANGE;
				break;
			}
			NL_LOG(LOG_DEBUG2, "ADD/DEL group %d", (uint32_t)optval);

			NLCTL_WLOCK(ctl);
			if (sopt->sopt_name == NETLINK_ADD_MEMBERSHIP)
				nl_add_group_locked(nlp, optval);
			else
				nl_del_group_locked(nlp, optval);
			NLCTL_WUNLOCK(ctl);
			break;
		case NETLINK_CAP_ACK:
		case NETLINK_EXT_ACK:
		case NETLINK_GET_STRICT_CHK:
		case NETLINK_MSG_INFO:
			error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
			if (error != 0)
				break;

			flag = nl_getoptflag(sopt->sopt_name);

			/* MSG_INFO cmsg format is FreeBSD-specific. */
			if ((flag == NLF_MSG_INFO) && nlp->nl_linux) {
				error = EINVAL;
				break;
			}

			NLCTL_WLOCK(ctl);
			if (optval != 0)
				nlp->nl_flags |= flag;
			else
				nlp->nl_flags &= ~flag;
			NLCTL_WUNLOCK(ctl);
			break;
		default:
			error = ENOPROTOOPT;
		}
		break;
	case SOPT_GET:
		switch (sopt->sopt_name) {
		case NETLINK_LIST_MEMBERSHIPS:
			NLCTL_RLOCK(ctl);
			optval = nl_get_groups_compat(nlp);
			NLCTL_RUNLOCK(ctl);
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;
		case NETLINK_CAP_ACK:
		case NETLINK_EXT_ACK:
		case NETLINK_GET_STRICT_CHK:
		case NETLINK_MSG_INFO:
			NLCTL_RLOCK(ctl);
			optval = (nlp->nl_flags & nl_getoptflag(sopt->sopt_name)) != 0;
			NLCTL_RUNLOCK(ctl);
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;
		default:
			error = ENOPROTOOPT;
		}
		break;
	default:
		error = ENOPROTOOPT;
	}

	return (error);
}
  810. static int
  811. sysctl_handle_nl_maxsockbuf(SYSCTL_HANDLER_ARGS)
  812. {
  813. int error = 0;
  814. u_long tmp_maxsockbuf = nl_maxsockbuf;
  815. error = sysctl_handle_long(oidp, &tmp_maxsockbuf, arg2, req);
  816. if (error || !req->newptr)
  817. return (error);
  818. if (tmp_maxsockbuf < MSIZE + MCLBYTES)
  819. return (EINVAL);
  820. nl_maxsockbuf = tmp_maxsockbuf;
  821. return (0);
  822. }
/*
 * pr_setsbopt for netlink sockets: intercepts SO_RCVBUF to allow
 * privileged processes to exceed the global sb_max_adj limit, capped at
 * nl_maxsockbuf.  All other buffer options go to the generic sbsetopt().
 */
static int
nl_setsbopt(struct socket *so, struct sockopt *sopt)
{
	int error, optval;
	bool result;

	if (sopt->sopt_name != SO_RCVBUF)
		return (sbsetopt(so, sopt));

	/* Allow to override max buffer size in certain conditions */

	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
	if (error != 0)
		return (error);
	NL_LOG(LOG_DEBUG2, "socket %p, PID %d, SO_RCVBUF=%d", so, curproc->p_pid, optval);
	/*
	 * NOTE(review): optval is signed while sb_max_adj is u_long, so a
	 * negative optval promotes to a huge unsigned value and triggers
	 * the priv check — confirm sbreserve_locked_limit() rejects it.
	 */
	if (optval > sb_max_adj) {
		if (priv_check(curthread, PRIV_NET_ROUTE) != 0)
			return (EPERM);
	}

	SOCK_RECVBUF_LOCK(so);
	result = sbreserve_locked_limit(so, SO_RCV, optval, nl_maxsockbuf, curthread);
	SOCK_RECVBUF_UNLOCK(so);

	return (result ? 0 : ENOBUFS);
}
/*
 * Protocol methods shared by SOCK_RAW and SOCK_DGRAM netlink sockets;
 * the two socket types behave identically for netlink.
 */
#define NETLINK_PROTOSW			\
	.pr_flags = PR_ATOMIC | PR_ADDR | PR_SOCKBUF, \
	.pr_ctloutput = nl_ctloutput,	\
	.pr_setsbopt = nl_setsbopt,	\
	.pr_attach = nl_pru_attach,	\
	.pr_bind = nl_pru_bind,		\
	.pr_connect = nl_pru_connect,	\
	.pr_disconnect = nl_pru_disconnect, \
	.pr_sosend = nl_sosend,		\
	.pr_soreceive = nl_soreceive,	\
	.pr_sockaddr = nl_sockaddr,	\
	.pr_close = nl_close

static struct protosw netlink_raw_sw = {
	.pr_type = SOCK_RAW,
	NETLINK_PROTOSW
};

static struct protosw netlink_dgram_sw = {
	.pr_type = SOCK_DGRAM,
	NETLINK_PROTOSW
};

/* PF_NETLINK domain; DOMF_UNLOADABLE supports the netlink kernel module. */
static struct domain netlinkdomain = {
	.dom_family = PF_NETLINK,
	.dom_name = "netlink",
	.dom_flags = DOMF_UNLOADABLE,
	.dom_nprotosw = 2,
	.dom_protosw = { &netlink_raw_sw, &netlink_dgram_sw },
};

DOMAIN_SET(netlink);