veth.c 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265
  1. /*
  2. * drivers/net/veth.c
  3. *
  4. * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc
  5. *
  6. * Author: Pavel Emelianov <xemul@openvz.org>
  7. * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com>
  8. *
  9. */
  10. #include <linux/netdevice.h>
  11. #include <linux/slab.h>
  12. #include <linux/ethtool.h>
  13. #include <linux/etherdevice.h>
  14. #include <linux/u64_stats_sync.h>
  15. #include <net/rtnetlink.h>
  16. #include <net/dst.h>
  17. #include <net/xfrm.h>
  18. #include <net/xdp.h>
  19. #include <linux/veth.h>
  20. #include <linux/module.h>
  21. #include <linux/bpf.h>
  22. #include <linux/filter.h>
  23. #include <linux/ptr_ring.h>
  24. #include <linux/bpf_trace.h>
  25. #define DRV_NAME "veth"
  26. #define DRV_VERSION "1.0"
  27. #define VETH_XDP_FLAG BIT(0)
  28. #define VETH_RING_SIZE 256
  29. #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN)
  30. /* Separating two types of XDP xmit */
  31. #define VETH_XDP_TX BIT(0)
  32. #define VETH_XDP_REDIR BIT(1)
  33. struct pcpu_vstats {
  34. u64 packets;
  35. u64 bytes;
  36. struct u64_stats_sync syncp;
  37. };
  38. struct veth_rq {
  39. struct napi_struct xdp_napi;
  40. struct net_device *dev;
  41. struct bpf_prog __rcu *xdp_prog;
  42. struct xdp_mem_info xdp_mem;
  43. bool rx_notify_masked;
  44. struct ptr_ring xdp_ring;
  45. struct xdp_rxq_info xdp_rxq;
  46. };
  47. struct veth_priv {
  48. struct net_device __rcu *peer;
  49. atomic64_t dropped;
  50. struct bpf_prog *_xdp_prog;
  51. struct veth_rq *rq;
  52. unsigned int requested_headroom;
  53. };
  54. /*
  55. * ethtool interface
  56. */
  57. static struct {
  58. const char string[ETH_GSTRING_LEN];
  59. } ethtool_stats_keys[] = {
  60. { "peer_ifindex" },
  61. };
  62. static int veth_get_link_ksettings(struct net_device *dev,
  63. struct ethtool_link_ksettings *cmd)
  64. {
  65. cmd->base.speed = SPEED_10000;
  66. cmd->base.duplex = DUPLEX_FULL;
  67. cmd->base.port = PORT_TP;
  68. cmd->base.autoneg = AUTONEG_DISABLE;
  69. return 0;
  70. }
  71. static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
  72. {
  73. strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
  74. strlcpy(info->version, DRV_VERSION, sizeof(info->version));
  75. }
  76. static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
  77. {
  78. switch(stringset) {
  79. case ETH_SS_STATS:
  80. memcpy(buf, &ethtool_stats_keys, sizeof(ethtool_stats_keys));
  81. break;
  82. }
  83. }
  84. static int veth_get_sset_count(struct net_device *dev, int sset)
  85. {
  86. switch (sset) {
  87. case ETH_SS_STATS:
  88. return ARRAY_SIZE(ethtool_stats_keys);
  89. default:
  90. return -EOPNOTSUPP;
  91. }
  92. }
  93. static void veth_get_ethtool_stats(struct net_device *dev,
  94. struct ethtool_stats *stats, u64 *data)
  95. {
  96. struct veth_priv *priv = netdev_priv(dev);
  97. struct net_device *peer = rtnl_dereference(priv->peer);
  98. data[0] = peer ? peer->ifindex : 0;
  99. }
  100. static const struct ethtool_ops veth_ethtool_ops = {
  101. .get_drvinfo = veth_get_drvinfo,
  102. .get_link = ethtool_op_get_link,
  103. .get_strings = veth_get_strings,
  104. .get_sset_count = veth_get_sset_count,
  105. .get_ethtool_stats = veth_get_ethtool_stats,
  106. .get_link_ksettings = veth_get_link_ksettings,
  107. };
  108. /* general routines */
  109. static bool veth_is_xdp_frame(void *ptr)
  110. {
  111. return (unsigned long)ptr & VETH_XDP_FLAG;
  112. }
  113. static void *veth_ptr_to_xdp(void *ptr)
  114. {
  115. return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG);
  116. }
  117. static void *veth_xdp_to_ptr(void *ptr)
  118. {
  119. return (void *)((unsigned long)ptr | VETH_XDP_FLAG);
  120. }
  121. static void veth_ptr_free(void *ptr)
  122. {
  123. if (veth_is_xdp_frame(ptr))
  124. xdp_return_frame(veth_ptr_to_xdp(ptr));
  125. else
  126. kfree_skb(ptr);
  127. }
  128. static void __veth_xdp_flush(struct veth_rq *rq)
  129. {
  130. /* Write ptr_ring before reading rx_notify_masked */
  131. smp_mb();
  132. if (!rq->rx_notify_masked) {
  133. rq->rx_notify_masked = true;
  134. napi_schedule(&rq->xdp_napi);
  135. }
  136. }
  137. static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb)
  138. {
  139. if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) {
  140. dev_kfree_skb_any(skb);
  141. return NET_RX_DROP;
  142. }
  143. return NET_RX_SUCCESS;
  144. }
  145. static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
  146. struct veth_rq *rq, bool xdp)
  147. {
  148. return __dev_forward_skb(dev, skb) ?: xdp ?
  149. veth_xdp_rx(rq, skb) :
  150. netif_rx(skb);
  151. }
  152. static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
  153. {
  154. struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
  155. struct veth_rq *rq = NULL;
  156. struct net_device *rcv;
  157. int length = skb->len;
  158. bool rcv_xdp = false;
  159. int rxq;
  160. rcu_read_lock();
  161. rcv = rcu_dereference(priv->peer);
  162. if (unlikely(!rcv)) {
  163. kfree_skb(skb);
  164. goto drop;
  165. }
  166. rcv_priv = netdev_priv(rcv);
  167. rxq = skb_get_queue_mapping(skb);
  168. if (rxq < rcv->real_num_rx_queues) {
  169. rq = &rcv_priv->rq[rxq];
  170. rcv_xdp = rcu_access_pointer(rq->xdp_prog);
  171. if (rcv_xdp)
  172. skb_record_rx_queue(skb, rxq);
  173. }
  174. if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) {
  175. struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
  176. u64_stats_update_begin(&stats->syncp);
  177. stats->bytes += length;
  178. stats->packets++;
  179. u64_stats_update_end(&stats->syncp);
  180. } else {
  181. drop:
  182. atomic64_inc(&priv->dropped);
  183. }
  184. if (rcv_xdp)
  185. __veth_xdp_flush(rq);
  186. rcu_read_unlock();
  187. return NETDEV_TX_OK;
  188. }
  189. static u64 veth_stats_one(struct pcpu_vstats *result, struct net_device *dev)
  190. {
  191. struct veth_priv *priv = netdev_priv(dev);
  192. int cpu;
  193. result->packets = 0;
  194. result->bytes = 0;
  195. for_each_possible_cpu(cpu) {
  196. struct pcpu_vstats *stats = per_cpu_ptr(dev->vstats, cpu);
  197. u64 packets, bytes;
  198. unsigned int start;
  199. do {
  200. start = u64_stats_fetch_begin_irq(&stats->syncp);
  201. packets = stats->packets;
  202. bytes = stats->bytes;
  203. } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
  204. result->packets += packets;
  205. result->bytes += bytes;
  206. }
  207. return atomic64_read(&priv->dropped);
  208. }
  209. static void veth_get_stats64(struct net_device *dev,
  210. struct rtnl_link_stats64 *tot)
  211. {
  212. struct veth_priv *priv = netdev_priv(dev);
  213. struct net_device *peer;
  214. struct pcpu_vstats one;
  215. tot->tx_dropped = veth_stats_one(&one, dev);
  216. tot->tx_bytes = one.bytes;
  217. tot->tx_packets = one.packets;
  218. rcu_read_lock();
  219. peer = rcu_dereference(priv->peer);
  220. if (peer) {
  221. tot->rx_dropped = veth_stats_one(&one, peer);
  222. tot->rx_bytes = one.bytes;
  223. tot->rx_packets = one.packets;
  224. }
  225. rcu_read_unlock();
  226. }
  227. /* fake multicast ability */
  228. static void veth_set_multicast_list(struct net_device *dev)
  229. {
  230. }
  231. static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
  232. int buflen)
  233. {
  234. struct sk_buff *skb;
  235. if (!buflen) {
  236. buflen = SKB_DATA_ALIGN(headroom + len) +
  237. SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
  238. }
  239. skb = build_skb(head, buflen);
  240. if (!skb)
  241. return NULL;
  242. skb_reserve(skb, headroom);
  243. skb_put(skb, len);
  244. return skb;
  245. }
  246. static int veth_select_rxq(struct net_device *dev)
  247. {
  248. return smp_processor_id() % dev->real_num_rx_queues;
  249. }
  250. static int veth_xdp_xmit(struct net_device *dev, int n,
  251. struct xdp_frame **frames, u32 flags)
  252. {
  253. struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
  254. struct net_device *rcv;
  255. unsigned int max_len;
  256. struct veth_rq *rq;
  257. int i, drops = 0;
  258. if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
  259. return -EINVAL;
  260. rcv = rcu_dereference(priv->peer);
  261. if (unlikely(!rcv))
  262. return -ENXIO;
  263. rcv_priv = netdev_priv(rcv);
  264. rq = &rcv_priv->rq[veth_select_rxq(rcv)];
  265. /* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive
  266. * side. This means an XDP program is loaded on the peer and the peer
  267. * device is up.
  268. */
  269. if (!rcu_access_pointer(rq->xdp_prog))
  270. return -ENXIO;
  271. max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN;
  272. spin_lock(&rq->xdp_ring.producer_lock);
  273. for (i = 0; i < n; i++) {
  274. struct xdp_frame *frame = frames[i];
  275. void *ptr = veth_xdp_to_ptr(frame);
  276. if (unlikely(frame->len > max_len ||
  277. __ptr_ring_produce(&rq->xdp_ring, ptr))) {
  278. xdp_return_frame_rx_napi(frame);
  279. drops++;
  280. }
  281. }
  282. spin_unlock(&rq->xdp_ring.producer_lock);
  283. if (flags & XDP_XMIT_FLUSH)
  284. __veth_xdp_flush(rq);
  285. return n - drops;
  286. }
  287. static void veth_xdp_flush(struct net_device *dev)
  288. {
  289. struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
  290. struct net_device *rcv;
  291. struct veth_rq *rq;
  292. rcu_read_lock();
  293. rcv = rcu_dereference(priv->peer);
  294. if (unlikely(!rcv))
  295. goto out;
  296. rcv_priv = netdev_priv(rcv);
  297. rq = &rcv_priv->rq[veth_select_rxq(rcv)];
  298. /* xdp_ring is initialized on receive side? */
  299. if (unlikely(!rcu_access_pointer(rq->xdp_prog)))
  300. goto out;
  301. __veth_xdp_flush(rq);
  302. out:
  303. rcu_read_unlock();
  304. }
  305. static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
  306. {
  307. struct xdp_frame *frame = convert_to_xdp_frame(xdp);
  308. if (unlikely(!frame))
  309. return -EOVERFLOW;
  310. return veth_xdp_xmit(dev, 1, &frame, 0);
  311. }
  312. static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq,
  313. struct xdp_frame *frame,
  314. unsigned int *xdp_xmit)
  315. {
  316. void *hard_start = frame->data - frame->headroom;
  317. void *head = hard_start - sizeof(struct xdp_frame);
  318. int len = frame->len, delta = 0;
  319. struct xdp_frame orig_frame;
  320. struct bpf_prog *xdp_prog;
  321. unsigned int headroom;
  322. struct sk_buff *skb;
  323. rcu_read_lock();
  324. xdp_prog = rcu_dereference(rq->xdp_prog);
  325. if (likely(xdp_prog)) {
  326. struct xdp_buff xdp;
  327. u32 act;
  328. xdp.data_hard_start = hard_start;
  329. xdp.data = frame->data;
  330. xdp.data_end = frame->data + frame->len;
  331. xdp.data_meta = frame->data - frame->metasize;
  332. xdp.rxq = &rq->xdp_rxq;
  333. act = bpf_prog_run_xdp(xdp_prog, &xdp);
  334. switch (act) {
  335. case XDP_PASS:
  336. delta = frame->data - xdp.data;
  337. len = xdp.data_end - xdp.data;
  338. break;
  339. case XDP_TX:
  340. orig_frame = *frame;
  341. xdp.data_hard_start = head;
  342. xdp.rxq->mem = frame->mem;
  343. if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) {
  344. trace_xdp_exception(rq->dev, xdp_prog, act);
  345. frame = &orig_frame;
  346. goto err_xdp;
  347. }
  348. *xdp_xmit |= VETH_XDP_TX;
  349. rcu_read_unlock();
  350. goto xdp_xmit;
  351. case XDP_REDIRECT:
  352. orig_frame = *frame;
  353. xdp.data_hard_start = head;
  354. xdp.rxq->mem = frame->mem;
  355. if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
  356. frame = &orig_frame;
  357. goto err_xdp;
  358. }
  359. *xdp_xmit |= VETH_XDP_REDIR;
  360. rcu_read_unlock();
  361. goto xdp_xmit;
  362. default:
  363. bpf_warn_invalid_xdp_action(act);
  364. case XDP_ABORTED:
  365. trace_xdp_exception(rq->dev, xdp_prog, act);
  366. case XDP_DROP:
  367. goto err_xdp;
  368. }
  369. }
  370. rcu_read_unlock();
  371. headroom = sizeof(struct xdp_frame) + frame->headroom - delta;
  372. skb = veth_build_skb(head, headroom, len, 0);
  373. if (!skb) {
  374. xdp_return_frame(frame);
  375. goto err;
  376. }
  377. xdp_scrub_frame(frame);
  378. skb->protocol = eth_type_trans(skb, rq->dev);
  379. err:
  380. return skb;
  381. err_xdp:
  382. rcu_read_unlock();
  383. xdp_return_frame(frame);
  384. xdp_xmit:
  385. return NULL;
  386. }
  387. static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb,
  388. unsigned int *xdp_xmit)
  389. {
  390. u32 pktlen, headroom, act, metalen;
  391. void *orig_data, *orig_data_end;
  392. struct bpf_prog *xdp_prog;
  393. int mac_len, delta, off;
  394. struct xdp_buff xdp;
  395. skb_orphan(skb);
  396. rcu_read_lock();
  397. xdp_prog = rcu_dereference(rq->xdp_prog);
  398. if (unlikely(!xdp_prog)) {
  399. rcu_read_unlock();
  400. goto out;
  401. }
  402. mac_len = skb->data - skb_mac_header(skb);
  403. pktlen = skb->len + mac_len;
  404. headroom = skb_headroom(skb) - mac_len;
  405. if (skb_shared(skb) || skb_head_is_locked(skb) ||
  406. skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
  407. struct sk_buff *nskb;
  408. int size, head_off;
  409. void *head, *start;
  410. struct page *page;
  411. size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) +
  412. SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
  413. if (size > PAGE_SIZE)
  414. goto drop;
  415. page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
  416. if (!page)
  417. goto drop;
  418. head = page_address(page);
  419. start = head + VETH_XDP_HEADROOM;
  420. if (skb_copy_bits(skb, -mac_len, start, pktlen)) {
  421. page_frag_free(head);
  422. goto drop;
  423. }
  424. nskb = veth_build_skb(head,
  425. VETH_XDP_HEADROOM + mac_len, skb->len,
  426. PAGE_SIZE);
  427. if (!nskb) {
  428. page_frag_free(head);
  429. goto drop;
  430. }
  431. skb_copy_header(nskb, skb);
  432. head_off = skb_headroom(nskb) - skb_headroom(skb);
  433. skb_headers_offset_update(nskb, head_off);
  434. consume_skb(skb);
  435. skb = nskb;
  436. }
  437. xdp.data_hard_start = skb->head;
  438. xdp.data = skb_mac_header(skb);
  439. xdp.data_end = xdp.data + pktlen;
  440. xdp.data_meta = xdp.data;
  441. xdp.rxq = &rq->xdp_rxq;
  442. orig_data = xdp.data;
  443. orig_data_end = xdp.data_end;
  444. act = bpf_prog_run_xdp(xdp_prog, &xdp);
  445. switch (act) {
  446. case XDP_PASS:
  447. break;
  448. case XDP_TX:
  449. get_page(virt_to_page(xdp.data));
  450. consume_skb(skb);
  451. xdp.rxq->mem = rq->xdp_mem;
  452. if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) {
  453. trace_xdp_exception(rq->dev, xdp_prog, act);
  454. goto err_xdp;
  455. }
  456. *xdp_xmit |= VETH_XDP_TX;
  457. rcu_read_unlock();
  458. goto xdp_xmit;
  459. case XDP_REDIRECT:
  460. get_page(virt_to_page(xdp.data));
  461. consume_skb(skb);
  462. xdp.rxq->mem = rq->xdp_mem;
  463. if (xdp_do_redirect(rq->dev, &xdp, xdp_prog))
  464. goto err_xdp;
  465. *xdp_xmit |= VETH_XDP_REDIR;
  466. rcu_read_unlock();
  467. goto xdp_xmit;
  468. default:
  469. bpf_warn_invalid_xdp_action(act);
  470. case XDP_ABORTED:
  471. trace_xdp_exception(rq->dev, xdp_prog, act);
  472. case XDP_DROP:
  473. goto drop;
  474. }
  475. rcu_read_unlock();
  476. delta = orig_data - xdp.data;
  477. off = mac_len + delta;
  478. if (off > 0)
  479. __skb_push(skb, off);
  480. else if (off < 0)
  481. __skb_pull(skb, -off);
  482. skb->mac_header -= delta;
  483. off = xdp.data_end - orig_data_end;
  484. if (off != 0)
  485. __skb_put(skb, off);
  486. skb->protocol = eth_type_trans(skb, rq->dev);
  487. metalen = xdp.data - xdp.data_meta;
  488. if (metalen)
  489. skb_metadata_set(skb, metalen);
  490. out:
  491. return skb;
  492. drop:
  493. rcu_read_unlock();
  494. kfree_skb(skb);
  495. return NULL;
  496. err_xdp:
  497. rcu_read_unlock();
  498. page_frag_free(xdp.data);
  499. xdp_xmit:
  500. return NULL;
  501. }
  502. static int veth_xdp_rcv(struct veth_rq *rq, int budget, unsigned int *xdp_xmit)
  503. {
  504. int i, done = 0;
  505. for (i = 0; i < budget; i++) {
  506. void *ptr = __ptr_ring_consume(&rq->xdp_ring);
  507. struct sk_buff *skb;
  508. if (!ptr)
  509. break;
  510. if (veth_is_xdp_frame(ptr)) {
  511. skb = veth_xdp_rcv_one(rq, veth_ptr_to_xdp(ptr),
  512. xdp_xmit);
  513. } else {
  514. skb = veth_xdp_rcv_skb(rq, ptr, xdp_xmit);
  515. }
  516. if (skb)
  517. napi_gro_receive(&rq->xdp_napi, skb);
  518. done++;
  519. }
  520. return done;
  521. }
  522. static int veth_poll(struct napi_struct *napi, int budget)
  523. {
  524. struct veth_rq *rq =
  525. container_of(napi, struct veth_rq, xdp_napi);
  526. unsigned int xdp_xmit = 0;
  527. int done;
  528. xdp_set_return_frame_no_direct();
  529. done = veth_xdp_rcv(rq, budget, &xdp_xmit);
  530. if (done < budget && napi_complete_done(napi, done)) {
  531. /* Write rx_notify_masked before reading ptr_ring */
  532. smp_store_mb(rq->rx_notify_masked, false);
  533. if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) {
  534. rq->rx_notify_masked = true;
  535. napi_schedule(&rq->xdp_napi);
  536. }
  537. }
  538. if (xdp_xmit & VETH_XDP_TX)
  539. veth_xdp_flush(rq->dev);
  540. if (xdp_xmit & VETH_XDP_REDIR)
  541. xdp_do_flush_map();
  542. xdp_clear_return_frame_no_direct();
  543. return done;
  544. }
  545. static int veth_napi_add(struct net_device *dev)
  546. {
  547. struct veth_priv *priv = netdev_priv(dev);
  548. int err, i;
  549. for (i = 0; i < dev->real_num_rx_queues; i++) {
  550. struct veth_rq *rq = &priv->rq[i];
  551. err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
  552. if (err)
  553. goto err_xdp_ring;
  554. }
  555. for (i = 0; i < dev->real_num_rx_queues; i++) {
  556. struct veth_rq *rq = &priv->rq[i];
  557. netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
  558. napi_enable(&rq->xdp_napi);
  559. }
  560. return 0;
  561. err_xdp_ring:
  562. for (i--; i >= 0; i--)
  563. ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free);
  564. return err;
  565. }
  566. static void veth_napi_del(struct net_device *dev)
  567. {
  568. struct veth_priv *priv = netdev_priv(dev);
  569. int i;
  570. for (i = 0; i < dev->real_num_rx_queues; i++) {
  571. struct veth_rq *rq = &priv->rq[i];
  572. napi_disable(&rq->xdp_napi);
  573. napi_hash_del(&rq->xdp_napi);
  574. }
  575. synchronize_net();
  576. for (i = 0; i < dev->real_num_rx_queues; i++) {
  577. struct veth_rq *rq = &priv->rq[i];
  578. netif_napi_del(&rq->xdp_napi);
  579. rq->rx_notify_masked = false;
  580. ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
  581. }
  582. }
  583. static int veth_enable_xdp(struct net_device *dev)
  584. {
  585. struct veth_priv *priv = netdev_priv(dev);
  586. int err, i;
  587. if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) {
  588. for (i = 0; i < dev->real_num_rx_queues; i++) {
  589. struct veth_rq *rq = &priv->rq[i];
  590. err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i);
  591. if (err < 0)
  592. goto err_rxq_reg;
  593. err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
  594. MEM_TYPE_PAGE_SHARED,
  595. NULL);
  596. if (err < 0)
  597. goto err_reg_mem;
  598. /* Save original mem info as it can be overwritten */
  599. rq->xdp_mem = rq->xdp_rxq.mem;
  600. }
  601. err = veth_napi_add(dev);
  602. if (err)
  603. goto err_rxq_reg;
  604. }
  605. for (i = 0; i < dev->real_num_rx_queues; i++)
  606. rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog);
  607. return 0;
  608. err_reg_mem:
  609. xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
  610. err_rxq_reg:
  611. for (i--; i >= 0; i--)
  612. xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
  613. return err;
  614. }
  615. static void veth_disable_xdp(struct net_device *dev)
  616. {
  617. struct veth_priv *priv = netdev_priv(dev);
  618. int i;
  619. for (i = 0; i < dev->real_num_rx_queues; i++)
  620. rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);
  621. veth_napi_del(dev);
  622. for (i = 0; i < dev->real_num_rx_queues; i++) {
  623. struct veth_rq *rq = &priv->rq[i];
  624. rq->xdp_rxq.mem = rq->xdp_mem;
  625. xdp_rxq_info_unreg(&rq->xdp_rxq);
  626. }
  627. }
  628. static int veth_open(struct net_device *dev)
  629. {
  630. struct veth_priv *priv = netdev_priv(dev);
  631. struct net_device *peer = rtnl_dereference(priv->peer);
  632. int err;
  633. if (!peer)
  634. return -ENOTCONN;
  635. if (priv->_xdp_prog) {
  636. err = veth_enable_xdp(dev);
  637. if (err)
  638. return err;
  639. }
  640. if (peer->flags & IFF_UP) {
  641. netif_carrier_on(dev);
  642. netif_carrier_on(peer);
  643. }
  644. return 0;
  645. }
  646. static int veth_close(struct net_device *dev)
  647. {
  648. struct veth_priv *priv = netdev_priv(dev);
  649. struct net_device *peer = rtnl_dereference(priv->peer);
  650. netif_carrier_off(dev);
  651. if (peer)
  652. netif_carrier_off(peer);
  653. if (priv->_xdp_prog)
  654. veth_disable_xdp(dev);
  655. return 0;
  656. }
  657. static int is_valid_veth_mtu(int mtu)
  658. {
  659. return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU;
  660. }
  661. static int veth_alloc_queues(struct net_device *dev)
  662. {
  663. struct veth_priv *priv = netdev_priv(dev);
  664. int i;
  665. priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL);
  666. if (!priv->rq)
  667. return -ENOMEM;
  668. for (i = 0; i < dev->num_rx_queues; i++)
  669. priv->rq[i].dev = dev;
  670. return 0;
  671. }
  672. static void veth_free_queues(struct net_device *dev)
  673. {
  674. struct veth_priv *priv = netdev_priv(dev);
  675. kfree(priv->rq);
  676. }
  677. static int veth_dev_init(struct net_device *dev)
  678. {
  679. int err;
  680. dev->vstats = netdev_alloc_pcpu_stats(struct pcpu_vstats);
  681. if (!dev->vstats)
  682. return -ENOMEM;
  683. err = veth_alloc_queues(dev);
  684. if (err) {
  685. free_percpu(dev->vstats);
  686. return err;
  687. }
  688. return 0;
  689. }
  690. static void veth_dev_free(struct net_device *dev)
  691. {
  692. veth_free_queues(dev);
  693. free_percpu(dev->vstats);
  694. }
  695. #ifdef CONFIG_NET_POLL_CONTROLLER
  696. static void veth_poll_controller(struct net_device *dev)
  697. {
  698. /* veth only receives frames when its peer sends one
  699. * Since it has nothing to do with disabling irqs, we are guaranteed
  700. * never to have pending data when we poll for it so
  701. * there is nothing to do here.
  702. *
  703. * We need this though so netpoll recognizes us as an interface that
  704. * supports polling, which enables bridge devices in virt setups to
  705. * still use netconsole
  706. */
  707. }
  708. #endif /* CONFIG_NET_POLL_CONTROLLER */
  709. static int veth_get_iflink(const struct net_device *dev)
  710. {
  711. struct veth_priv *priv = netdev_priv(dev);
  712. struct net_device *peer;
  713. int iflink;
  714. rcu_read_lock();
  715. peer = rcu_dereference(priv->peer);
  716. iflink = peer ? peer->ifindex : 0;
  717. rcu_read_unlock();
  718. return iflink;
  719. }
  720. static netdev_features_t veth_fix_features(struct net_device *dev,
  721. netdev_features_t features)
  722. {
  723. struct veth_priv *priv = netdev_priv(dev);
  724. struct net_device *peer;
  725. peer = rtnl_dereference(priv->peer);
  726. if (peer) {
  727. struct veth_priv *peer_priv = netdev_priv(peer);
  728. if (peer_priv->_xdp_prog)
  729. features &= ~NETIF_F_GSO_SOFTWARE;
  730. }
  731. return features;
  732. }
  733. static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
  734. {
  735. struct veth_priv *peer_priv, *priv = netdev_priv(dev);
  736. struct net_device *peer;
  737. if (new_hr < 0)
  738. new_hr = 0;
  739. rcu_read_lock();
  740. peer = rcu_dereference(priv->peer);
  741. if (unlikely(!peer))
  742. goto out;
  743. peer_priv = netdev_priv(peer);
  744. priv->requested_headroom = new_hr;
  745. new_hr = max(priv->requested_headroom, peer_priv->requested_headroom);
  746. dev->needed_headroom = new_hr;
  747. peer->needed_headroom = new_hr;
  748. out:
  749. rcu_read_unlock();
  750. }
  751. static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
  752. struct netlink_ext_ack *extack)
  753. {
  754. struct veth_priv *priv = netdev_priv(dev);
  755. struct bpf_prog *old_prog;
  756. struct net_device *peer;
  757. unsigned int max_mtu;
  758. int err;
  759. old_prog = priv->_xdp_prog;
  760. priv->_xdp_prog = prog;
  761. peer = rtnl_dereference(priv->peer);
  762. if (prog) {
  763. if (!peer) {
  764. NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached");
  765. err = -ENOTCONN;
  766. goto err;
  767. }
  768. max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM -
  769. peer->hard_header_len -
  770. SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
  771. if (peer->mtu > max_mtu) {
  772. NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP");
  773. err = -ERANGE;
  774. goto err;
  775. }
  776. if (dev->real_num_rx_queues < peer->real_num_tx_queues) {
  777. NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues");
  778. err = -ENOSPC;
  779. goto err;
  780. }
  781. if (dev->flags & IFF_UP) {
  782. err = veth_enable_xdp(dev);
  783. if (err) {
  784. NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed");
  785. goto err;
  786. }
  787. }
  788. if (!old_prog) {
  789. peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
  790. peer->max_mtu = max_mtu;
  791. }
  792. }
  793. if (old_prog) {
  794. if (!prog) {
  795. if (dev->flags & IFF_UP)
  796. veth_disable_xdp(dev);
  797. if (peer) {
  798. peer->hw_features |= NETIF_F_GSO_SOFTWARE;
  799. peer->max_mtu = ETH_MAX_MTU;
  800. }
  801. }
  802. bpf_prog_put(old_prog);
  803. }
  804. if ((!!old_prog ^ !!prog) && peer)
  805. netdev_update_features(peer);
  806. return 0;
  807. err:
  808. priv->_xdp_prog = old_prog;
  809. return err;
  810. }
  811. static u32 veth_xdp_query(struct net_device *dev)
  812. {
  813. struct veth_priv *priv = netdev_priv(dev);
  814. const struct bpf_prog *xdp_prog;
  815. xdp_prog = priv->_xdp_prog;
  816. if (xdp_prog)
  817. return xdp_prog->aux->id;
  818. return 0;
  819. }
  820. static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
  821. {
  822. switch (xdp->command) {
  823. case XDP_SETUP_PROG:
  824. return veth_xdp_set(dev, xdp->prog, xdp->extack);
  825. case XDP_QUERY_PROG:
  826. xdp->prog_id = veth_xdp_query(dev);
  827. return 0;
  828. default:
  829. return -EINVAL;
  830. }
  831. }
  832. static const struct net_device_ops veth_netdev_ops = {
  833. .ndo_init = veth_dev_init,
  834. .ndo_open = veth_open,
  835. .ndo_stop = veth_close,
  836. .ndo_start_xmit = veth_xmit,
  837. .ndo_get_stats64 = veth_get_stats64,
  838. .ndo_set_rx_mode = veth_set_multicast_list,
  839. .ndo_set_mac_address = eth_mac_addr,
  840. #ifdef CONFIG_NET_POLL_CONTROLLER
  841. .ndo_poll_controller = veth_poll_controller,
  842. #endif
  843. .ndo_get_iflink = veth_get_iflink,
  844. .ndo_fix_features = veth_fix_features,
  845. .ndo_features_check = passthru_features_check,
  846. .ndo_set_rx_headroom = veth_set_rx_headroom,
  847. .ndo_bpf = veth_xdp,
  848. .ndo_xdp_xmit = veth_xdp_xmit,
  849. };
  850. #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
  851. NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \
  852. NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \
  853. NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \
  854. NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX )
  855. static void veth_setup(struct net_device *dev)
  856. {
  857. ether_setup(dev);
  858. dev->priv_flags &= ~IFF_TX_SKB_SHARING;
  859. dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
  860. dev->priv_flags |= IFF_NO_QUEUE;
  861. dev->priv_flags |= IFF_PHONY_HEADROOM;
  862. dev->netdev_ops = &veth_netdev_ops;
  863. dev->ethtool_ops = &veth_ethtool_ops;
  864. dev->features |= NETIF_F_LLTX;
  865. dev->features |= VETH_FEATURES;
  866. dev->vlan_features = dev->features &
  867. ~(NETIF_F_HW_VLAN_CTAG_TX |
  868. NETIF_F_HW_VLAN_STAG_TX |
  869. NETIF_F_HW_VLAN_CTAG_RX |
  870. NETIF_F_HW_VLAN_STAG_RX);
  871. dev->needs_free_netdev = true;
  872. dev->priv_destructor = veth_dev_free;
  873. dev->max_mtu = ETH_MAX_MTU;
  874. dev->hw_features = VETH_FEATURES;
  875. dev->hw_enc_features = VETH_FEATURES;
  876. dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
  877. }
  878. /*
  879. * netlink interface
  880. */
  881. static int veth_validate(struct nlattr *tb[], struct nlattr *data[],
  882. struct netlink_ext_ack *extack)
  883. {
  884. if (tb[IFLA_ADDRESS]) {
  885. if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
  886. return -EINVAL;
  887. if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
  888. return -EADDRNOTAVAIL;
  889. }
  890. if (tb[IFLA_MTU]) {
  891. if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU])))
  892. return -EINVAL;
  893. }
  894. return 0;
  895. }
/*
 * Forward declaration: veth_newlink() passes &veth_link_ops to
 * rtnl_create_link() before the table itself is defined further down.
 */
static struct rtnl_link_ops veth_link_ops;
  897. static int veth_newlink(struct net *src_net, struct net_device *dev,
  898. struct nlattr *tb[], struct nlattr *data[],
  899. struct netlink_ext_ack *extack)
  900. {
  901. int err;
  902. struct net_device *peer;
  903. struct veth_priv *priv;
  904. char ifname[IFNAMSIZ];
  905. struct nlattr *peer_tb[IFLA_MAX + 1], **tbp;
  906. unsigned char name_assign_type;
  907. struct ifinfomsg *ifmp;
  908. struct net *net;
  909. /*
  910. * create and register peer first
  911. */
  912. if (data != NULL && data[VETH_INFO_PEER] != NULL) {
  913. struct nlattr *nla_peer;
  914. nla_peer = data[VETH_INFO_PEER];
  915. ifmp = nla_data(nla_peer);
  916. err = rtnl_nla_parse_ifla(peer_tb,
  917. nla_data(nla_peer) + sizeof(struct ifinfomsg),
  918. nla_len(nla_peer) - sizeof(struct ifinfomsg),
  919. NULL);
  920. if (err < 0)
  921. return err;
  922. err = veth_validate(peer_tb, NULL, extack);
  923. if (err < 0)
  924. return err;
  925. tbp = peer_tb;
  926. } else {
  927. ifmp = NULL;
  928. tbp = tb;
  929. }
  930. if (ifmp && tbp[IFLA_IFNAME]) {
  931. nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
  932. name_assign_type = NET_NAME_USER;
  933. } else {
  934. snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d");
  935. name_assign_type = NET_NAME_ENUM;
  936. }
  937. net = rtnl_link_get_net(src_net, tbp);
  938. if (IS_ERR(net))
  939. return PTR_ERR(net);
  940. peer = rtnl_create_link(net, ifname, name_assign_type,
  941. &veth_link_ops, tbp);
  942. if (IS_ERR(peer)) {
  943. put_net(net);
  944. return PTR_ERR(peer);
  945. }
  946. if (!ifmp || !tbp[IFLA_ADDRESS])
  947. eth_hw_addr_random(peer);
  948. if (ifmp && (dev->ifindex != 0))
  949. peer->ifindex = ifmp->ifi_index;
  950. peer->gso_max_size = dev->gso_max_size;
  951. peer->gso_max_segs = dev->gso_max_segs;
  952. err = register_netdevice(peer);
  953. put_net(net);
  954. net = NULL;
  955. if (err < 0)
  956. goto err_register_peer;
  957. netif_carrier_off(peer);
  958. err = rtnl_configure_link(peer, ifmp);
  959. if (err < 0)
  960. goto err_configure_peer;
  961. /*
  962. * register dev last
  963. *
  964. * note, that since we've registered new device the dev's name
  965. * should be re-allocated
  966. */
  967. if (tb[IFLA_ADDRESS] == NULL)
  968. eth_hw_addr_random(dev);
  969. if (tb[IFLA_IFNAME])
  970. nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
  971. else
  972. snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d");
  973. err = register_netdevice(dev);
  974. if (err < 0)
  975. goto err_register_dev;
  976. netif_carrier_off(dev);
  977. /*
  978. * tie the deviced together
  979. */
  980. priv = netdev_priv(dev);
  981. rcu_assign_pointer(priv->peer, peer);
  982. priv = netdev_priv(peer);
  983. rcu_assign_pointer(priv->peer, dev);
  984. return 0;
  985. err_register_dev:
  986. /* nothing to do */
  987. err_configure_peer:
  988. unregister_netdevice(peer);
  989. return err;
  990. err_register_peer:
  991. free_netdev(peer);
  992. return err;
  993. }
  994. static void veth_dellink(struct net_device *dev, struct list_head *head)
  995. {
  996. struct veth_priv *priv;
  997. struct net_device *peer;
  998. priv = netdev_priv(dev);
  999. peer = rtnl_dereference(priv->peer);
  1000. /* Note : dellink() is called from default_device_exit_batch(),
  1001. * before a rcu_synchronize() point. The devices are guaranteed
  1002. * not being freed before one RCU grace period.
  1003. */
  1004. RCU_INIT_POINTER(priv->peer, NULL);
  1005. unregister_netdevice_queue(dev, head);
  1006. if (peer) {
  1007. priv = netdev_priv(peer);
  1008. RCU_INIT_POINTER(priv->peer, NULL);
  1009. unregister_netdevice_queue(peer, head);
  1010. }
  1011. }
  1012. static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = {
  1013. [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) },
  1014. };
  1015. static struct net *veth_get_link_net(const struct net_device *dev)
  1016. {
  1017. struct veth_priv *priv = netdev_priv(dev);
  1018. struct net_device *peer = rtnl_dereference(priv->peer);
  1019. return peer ? dev_net(peer) : dev_net(dev);
  1020. }
  1021. static struct rtnl_link_ops veth_link_ops = {
  1022. .kind = DRV_NAME,
  1023. .priv_size = sizeof(struct veth_priv),
  1024. .setup = veth_setup,
  1025. .validate = veth_validate,
  1026. .newlink = veth_newlink,
  1027. .dellink = veth_dellink,
  1028. .policy = veth_policy,
  1029. .maxtype = VETH_INFO_MAX,
  1030. .get_link_net = veth_get_link_net,
  1031. };
  1032. /*
  1033. * init/fini
  1034. */
/*
 * Module init: register veth_link_ops through rtnl_link_register() and
 * return that call's result unchanged.
 */
static __init int veth_init(void)
{
	return rtnl_link_register(&veth_link_ops);
}
/*
 * Module exit: undo veth_init() by unregistering veth_link_ops through
 * rtnl_link_unregister().
 */
static __exit void veth_exit(void)
{
	rtnl_link_unregister(&veth_link_ops);
}
/* Hook veth_init()/veth_exit() up as the module's init and exit routines. */
module_init(veth_init);
module_exit(veth_exit);

/* Module metadata. */
MODULE_DESCRIPTION("Virtual Ethernet Tunnel");
MODULE_LICENSE("GPL v2");
/*
 * NOTE(review): presumably lets the module be loaded on demand when a
 * link of kind DRV_NAME is requested -- confirm against rtnetlink.h.
 */
MODULE_ALIAS_RTNL_LINK(DRV_NAME);