netlink_io.c 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365
  1. /*-
  2. * SPDX-License-Identifier: BSD-2-Clause
  3. *
  4. * Copyright (c) 2021 Ng Peng Nam Sean
  5. * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
  6. *
  7. * Redistribution and use in source and binary forms, with or without
  8. * modification, are permitted provided that the following conditions
  9. * are met:
  10. * 1. Redistributions of source code must retain the above copyright
  11. * notice, this list of conditions and the following disclaimer.
  12. * 2. Redistributions in binary form must reproduce the above copyright
  13. * notice, this list of conditions and the following disclaimer in the
  14. * documentation and/or other materials provided with the distribution.
  15. *
  16. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  17. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  20. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  22. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  23. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  24. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  25. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  26. * SUCH DAMAGE.
  27. */
  28. #include <sys/param.h>
  29. #include <sys/ck.h>
  30. #include <sys/lock.h>
  31. #include <sys/malloc.h>
  32. #include <sys/mbuf.h>
  33. #include <sys/mutex.h>
  34. #include <sys/socket.h>
  35. #include <sys/socketvar.h>
  36. #include <sys/syslog.h>
  37. #include <netlink/netlink.h>
  38. #include <netlink/netlink_ctl.h>
  39. #include <netlink/netlink_linux.h>
  40. #include <netlink/netlink_var.h>
  41. #define DEBUG_MOD_NAME nl_io
  42. #define DEBUG_MAX_LEVEL LOG_DEBUG3
  43. #include <netlink/netlink_debug.h>
  44. _DECLARE_DEBUG(LOG_INFO);
/*
 * The logic below provides a point-to-point interface for receiving and
 * sending netlink data between the kernel and userland.
 */
  49. static bool nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp);
  50. struct nl_buf *
  51. nl_buf_alloc(size_t len, int mflag)
  52. {
  53. struct nl_buf *nb;
  54. nb = malloc(sizeof(struct nl_buf) + len, M_NETLINK, mflag);
  55. if (__predict_true(nb != NULL)) {
  56. nb->buflen = len;
  57. nb->datalen = nb->offset = 0;
  58. }
  59. return (nb);
  60. }
/*
 * Release a buffer previously obtained from nl_buf_alloc().
 */
void
nl_buf_free(struct nl_buf *nb)
{
	free(nb, M_NETLINK);
}
  66. void
  67. nl_schedule_taskqueue(struct nlpcb *nlp)
  68. {
  69. if (!nlp->nl_task_pending) {
  70. nlp->nl_task_pending = true;
  71. taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task);
  72. NL_LOG(LOG_DEBUG3, "taskqueue scheduled");
  73. } else {
  74. NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped");
  75. }
  76. }
/*
 * Process one batch of queued requests for 'nlp'.
 *
 * Requests sit in the socket send buffer's nl_queue; replies go to the
 * receive buffer.  Returns true when the caller should run another
 * round (more work may be queued), false when processing must pause:
 * either the receive buffer has no room for replies, or a buffer was
 * only partially processed and was put back at the head of the queue.
 */
static bool
nl_process_received_one(struct nlpcb *nlp)
{
	struct socket *so = nlp->nl_socket;
	struct sockbuf *sb;
	struct nl_buf *nb;
	bool reschedule = false;

	/* Consume the pending flag before doing any work. */
	NLP_LOCK(nlp);
	nlp->nl_task_pending = false;
	NLP_UNLOCK(nlp);

	/*
	 * Do not process queued up requests if there is no space to queue
	 * replies.
	 */
	sb = &so->so_rcv;
	SOCK_RECVBUF_LOCK(so);
	if (sb->sb_hiwat <= sb->sb_ccc) {
		SOCK_RECVBUF_UNLOCK(so);
		return (false);
	}
	SOCK_RECVBUF_UNLOCK(so);

	sb = &so->so_snd;
	SOCK_SENDBUF_LOCK(so);
	while ((nb = TAILQ_FIRST(&sb->nl_queue)) != NULL) {
		/* Drop the lock while parsing; the buf is off the queue. */
		TAILQ_REMOVE(&sb->nl_queue, nb, tailq);
		SOCK_SENDBUF_UNLOCK(so);
		reschedule = nl_process_nbuf(nb, nlp);
		SOCK_SENDBUF_LOCK(so);
		if (reschedule) {
			/* Fully consumed: release the accounted space. */
			sb->sb_acc -= nb->datalen;
			sb->sb_ccc -= nb->datalen;
			/* XXXGL: potentially can reduce lock&unlock count. */
			/* NB: sowwakeup_locked() drops the sendbuf lock. */
			sowwakeup_locked(so);
			nl_buf_free(nb);
			SOCK_SENDBUF_LOCK(so);
		} else {
			/*
			 * Partially processed (replies blocked): requeue at
			 * the head so parsing resumes at nb->offset later.
			 */
			TAILQ_INSERT_HEAD(&sb->nl_queue, nb, tailq);
			break;
		}
	}
	SOCK_SENDBUF_UNLOCK(so);

	return (reschedule);
}
  120. static void
  121. nl_process_received(struct nlpcb *nlp)
  122. {
  123. NL_LOG(LOG_DEBUG3, "taskqueue called");
  124. if (__predict_false(nlp->nl_need_thread_setup)) {
  125. nl_set_thread_nlp(curthread, nlp);
  126. NLP_LOCK(nlp);
  127. nlp->nl_need_thread_setup = false;
  128. NLP_UNLOCK(nlp);
  129. }
  130. while (nl_process_received_one(nlp))
  131. ;
  132. }
  133. /*
  134. * Called after some data have been read from the socket.
  135. */
  136. void
  137. nl_on_transmit(struct nlpcb *nlp)
  138. {
  139. NLP_LOCK(nlp);
  140. struct socket *so = nlp->nl_socket;
  141. if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) {
  142. unsigned long dropped_bytes = nlp->nl_dropped_bytes;
  143. unsigned long dropped_messages = nlp->nl_dropped_messages;
  144. nlp->nl_dropped_bytes = 0;
  145. nlp->nl_dropped_messages = 0;
  146. struct sockbuf *sb = &so->so_rcv;
  147. NLP_LOG(LOG_DEBUG, nlp,
  148. "socket RX overflowed, %lu messages (%lu bytes) dropped. "
  149. "bytes: [%u/%u]", dropped_messages, dropped_bytes,
  150. sb->sb_ccc, sb->sb_hiwat);
  151. /* TODO: send netlink message */
  152. }
  153. nl_schedule_taskqueue(nlp);
  154. NLP_UNLOCK(nlp);
  155. }
  156. void
  157. nl_taskqueue_handler(void *_arg, int pending)
  158. {
  159. struct nlpcb *nlp = (struct nlpcb *)_arg;
  160. CURVNET_SET(nlp->nl_socket->so_vnet);
  161. nl_process_received(nlp);
  162. CURVNET_RESTORE();
  163. }
  164. /*
  165. * Tries to send current data buffer from writer.
  166. *
  167. * Returns true on success.
  168. * If no queue overrunes happened, wakes up socket owner.
  169. */
  170. bool
  171. nl_send(struct nl_writer *nw, struct nlpcb *nlp)
  172. {
  173. struct socket *so = nlp->nl_socket;
  174. struct sockbuf *sb = &so->so_rcv;
  175. struct nl_buf *nb;
  176. MPASS(nw->hdr == NULL);
  177. MPASS(nw->buf != NULL);
  178. MPASS(nw->buf->datalen > 0);
  179. IF_DEBUG_LEVEL(LOG_DEBUG2) {
  180. struct nlmsghdr *hdr = (struct nlmsghdr *)nw->buf->data;
  181. NLP_LOG(LOG_DEBUG2, nlp,
  182. "TX len %u msgs %u msg type %d first hdrlen %u",
  183. nw->buf->datalen, nw->num_messages, hdr->nlmsg_type,
  184. hdr->nlmsg_len);
  185. }
  186. if (nlp->nl_linux && linux_netlink_p != NULL &&
  187. __predict_false(!linux_netlink_p->msgs_to_linux(nw, nlp))) {
  188. nl_buf_free(nw->buf);
  189. nw->buf = NULL;
  190. return (false);
  191. }
  192. nb = nw->buf;
  193. nw->buf = NULL;
  194. SOCK_RECVBUF_LOCK(so);
  195. if (!nw->ignore_limit && __predict_false(sb->sb_hiwat <= sb->sb_ccc)) {
  196. SOCK_RECVBUF_UNLOCK(so);
  197. NLP_LOCK(nlp);
  198. nlp->nl_dropped_bytes += nb->datalen;
  199. nlp->nl_dropped_messages += nw->num_messages;
  200. NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)",
  201. (unsigned long)nlp->nl_dropped_messages, nw->num_messages,
  202. (unsigned long)nlp->nl_dropped_bytes, nb->datalen);
  203. NLP_UNLOCK(nlp);
  204. nl_buf_free(nb);
  205. return (false);
  206. } else {
  207. bool full;
  208. TAILQ_INSERT_TAIL(&sb->nl_queue, nb, tailq);
  209. sb->sb_acc += nb->datalen;
  210. sb->sb_ccc += nb->datalen;
  211. full = sb->sb_hiwat <= sb->sb_ccc;
  212. sorwakeup_locked(so);
  213. if (full) {
  214. NLP_LOCK(nlp);
  215. nlp->nl_tx_blocked = true;
  216. NLP_UNLOCK(nlp);
  217. }
  218. return (true);
  219. }
  220. }
/*
 * Validates a single netlink message located at 'hdr' and dispatches it
 * to the handler registered for nlp->nl_proto.  'remaining_length' is
 * the number of bytes left in the incoming buffer starting at 'hdr'.
 *
 * Returns EINVAL when the header itself is malformed (truncated or
 * shorter than sizeof(struct nlmsghdr)), which stops parsing of the
 * whole buffer; otherwise returns 0, with handler failures reported to
 * the peer via nlmsg_ack() rather than to the caller.
 */
static int
nl_receive_message(struct nlmsghdr *hdr, int remaining_length,
    struct nlpcb *nlp, struct nl_pstate *npt)
{
	nl_handler_f handler = nl_handlers[nlp->nl_proto].cb;
	int error = 0;

	NLP_LOG(LOG_DEBUG2, nlp, "msg len: %u type: %d: flags: 0x%X seq: %u pid: %u",
	    hdr->nlmsg_len, hdr->nlmsg_type, hdr->nlmsg_flags, hdr->nlmsg_seq,
	    hdr->nlmsg_pid);

	/* Reject messages that are truncated or impossibly short. */
	if (__predict_false(hdr->nlmsg_len > remaining_length)) {
		NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d",
		    hdr->nlmsg_len, remaining_length);
		return (EINVAL);
	} else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) {
		NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len);
		return (EINVAL);
	}

	/* Stamp each message with sender pid */
	hdr->nlmsg_pid = nlp->nl_port;
	npt->hdr = hdr;

	/*
	 * Only requests whose type is outside the reserved control range
	 * go to the protocol handler; other messages fall through to the
	 * ack logic below.
	 */
	if (hdr->nlmsg_flags & NLM_F_REQUEST &&
	    hdr->nlmsg_type >= NLMSG_MIN_TYPE) {
		NL_LOG(LOG_DEBUG2, "handling message with msg type: %d",
		    hdr->nlmsg_type);
		if (nlp->nl_linux) {
			MPASS(linux_netlink_p != NULL);
			/* Translation may replace 'hdr' (passed by address). */
			error = linux_netlink_p->msg_from_linux(nlp->nl_proto,
			    &hdr, npt);
			if (error)
				goto ack;
		}
		error = handler(hdr, npt);
		NL_LOG(LOG_DEBUG2, "retcode: %d", error);
	}
ack:
	/*
	 * Ack when explicitly requested or on error.  EINTR is exempted —
	 * presumably the handler replies on its own in that case (TODO:
	 * confirm against the protocol handlers).  Handlers may also set
	 * suppress_ack to take over ack generation themselves.
	 */
	if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) {
		if (!npt->nw->suppress_ack) {
			NL_LOG(LOG_DEBUG3, "ack");
			nlmsg_ack(nlp, error, hdr, npt);
		}
	}

	return (0);
}
  264. static void
  265. npt_clear(struct nl_pstate *npt)
  266. {
  267. lb_clear(&npt->lb);
  268. npt->error = 0;
  269. npt->err_msg = NULL;
  270. npt->err_off = 0;
  271. npt->hdr = NULL;
  272. npt->nw->suppress_ack = false;
  273. }
/*
 * Processes an incoming packet, which can contain multiple netlink messages.
 *
 * Returns true when the buffer may be released by the caller (fully
 * processed, or parsing failed), false when processing stopped because
 * replies could not be queued (tx blocked); in that case nb->offset
 * records where parsing should resume after the buffer is requeued.
 */
static bool
nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp)
{
	struct nlmsghdr *hdr;
	int error;

	NL_LOG(LOG_DEBUG3, "RX netlink buf %p on %p", nb, nlp->nl_socket);

	struct nl_writer nw = {};
	if (!nlmsg_get_unicast_writer(&nw, NLMSG_SMALL, nlp)) {
		NL_LOG(LOG_DEBUG, "error allocating socket writer");
		return (true);
	}

	/* Replies/acks for this batch bypass the rcvbuf space limit. */
	nlmsg_ignore_limit(&nw);

	/*
	 * The parse-state linear buffer is carved out of the unused tail
	 * of 'nb' itself, starting at the 8-byte-aligned end of the
	 * received data.
	 */
	struct nl_pstate npt = {
		.nlp = nlp,
		.lb.base = &nb->data[roundup2(nb->datalen, 8)],
		.lb.size = nb->buflen - roundup2(nb->datalen, 8),
		.nw = &nw,
		.strict = nlp->nl_flags & NLF_STRICT,
	};

	/* Walk whole headers one at a time, resuming at nb->offset. */
	for (; nb->offset + sizeof(struct nlmsghdr) <= nb->datalen;) {
		hdr = (struct nlmsghdr *)&nb->data[nb->offset];
		/* Save length prior to calling handler */
		int msglen = NLMSG_ALIGN(hdr->nlmsg_len);

		NL_LOG(LOG_DEBUG3, "parsing offset %d/%d",
		    nb->offset, nb->datalen);
		npt_clear(&npt);
		error = nl_receive_message(hdr, nb->datalen - nb->offset, nlp,
		    &npt);
		nb->offset += msglen;
		if (__predict_false(error != 0 || nlp->nl_tx_blocked))
			break;
	}
	NL_LOG(LOG_DEBUG3, "packet parsing done");
	nlmsg_flush(&nw);

	if (nlp->nl_tx_blocked) {
		/* Clear the flag; caller will requeue 'nb' and retry. */
		NLP_LOCK(nlp);
		nlp->nl_tx_blocked = false;
		NLP_UNLOCK(nlp);
		return (false);
	} else
		return (true);
}