// SPDX-License-Identifier: GPL-2.0-only
/*
 * drivers/net/veth.c
 *
 * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc
 *
 * Author: Pavel Emelianov <xemul@openvz.org>
 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com>
 *
 */

#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/ethtool.h>
#include <linux/etherdevice.h>
#include <linux/u64_stats_sync.h>

#include <net/rtnetlink.h>
#include <net/dst.h>
#include <net/xfrm.h>
#include <net/xdp.h>
#include <linux/veth.h>
#include <linux/module.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ptr_ring.h>
#include <linux/bpf_trace.h>
#include <linux/net_tstamp.h>

#define DRV_NAME "veth"
#define DRV_VERSION "1.0"

#define VETH_XDP_FLAG BIT(0)
#define VETH_RING_SIZE 256
#define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN)

/* Separating two types of XDP xmit */
#define VETH_XDP_TX BIT(0)
#define VETH_XDP_REDIR BIT(1)

#define VETH_XDP_TX_BULK_SIZE 16

struct veth_rq_stats {
	u64 xdp_packets;
	u64 xdp_bytes;
	u64 xdp_drops;
	struct u64_stats_sync syncp;
};

struct veth_rq {
	struct napi_struct xdp_napi;
	struct net_device *dev;
	struct bpf_prog __rcu *xdp_prog;
	struct xdp_mem_info xdp_mem;
	struct veth_rq_stats stats;
	bool rx_notify_masked;
	struct ptr_ring xdp_ring;
	struct xdp_rxq_info xdp_rxq;
};

struct veth_priv {
	struct net_device __rcu *peer;
	atomic64_t dropped;
	struct bpf_prog *_xdp_prog;
	struct veth_rq *rq;
	unsigned int requested_headroom;
};

struct veth_xdp_tx_bq {
	struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE];
	unsigned int count;
};

/*
 * ethtool interface
 */

struct veth_q_stat_desc {
	char desc[ETH_GSTRING_LEN];
	size_t offset;
};

#define VETH_RQ_STAT(m) offsetof(struct veth_rq_stats, m)

static const struct veth_q_stat_desc veth_rq_stats_desc[] = {
	{ "xdp_packets", VETH_RQ_STAT(xdp_packets) },
	{ "xdp_bytes",   VETH_RQ_STAT(xdp_bytes) },
	{ "xdp_drops",   VETH_RQ_STAT(xdp_drops) },
};

#define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc)

static struct {
	const char string[ETH_GSTRING_LEN];
} ethtool_stats_keys[] = {
	{ "peer_ifindex" },
};

static int veth_get_link_ksettings(struct net_device *dev,
				   struct ethtool_link_ksettings *cmd)
{
	cmd->base.speed = SPEED_10000;
	cmd->base.duplex = DUPLEX_FULL;
	cmd->base.port = PORT_TP;
	cmd->base.autoneg = AUTONEG_DISABLE;
	return 0;
}

static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
{
	strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
}

static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
{
	char *p = (char *)buf;
	int i, j;

	switch (stringset) {
	case ETH_SS_STATS:
		memcpy(p, &ethtool_stats_keys, sizeof(ethtool_stats_keys));
		p += sizeof(ethtool_stats_keys);
		for (i = 0; i < dev->real_num_rx_queues; i++) {
			for (j = 0; j < VETH_RQ_STATS_LEN; j++) {
				snprintf(p, ETH_GSTRING_LEN,
					 "rx_queue_%u_%.11s",
					 i, veth_rq_stats_desc[j].desc);
				p += ETH_GSTRING_LEN;
			}
		}
		break;
	}
}

static int veth_get_sset_count(struct net_device *dev, int sset)
{
	switch (sset) {
	case ETH_SS_STATS:
		return ARRAY_SIZE(ethtool_stats_keys) +
		       VETH_RQ_STATS_LEN * dev->real_num_rx_queues;
	default:
		return -EOPNOTSUPP;
	}
}

static void veth_get_ethtool_stats(struct net_device *dev,
				   struct ethtool_stats *stats, u64 *data)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);
	int i, j, idx;

	data[0] = peer ? peer->ifindex : 0;
	idx = 1;
	for (i = 0; i < dev->real_num_rx_queues; i++) {
		const struct veth_rq_stats *rq_stats = &priv->rq[i].stats;
		const void *stats_base = (void *)rq_stats;
		unsigned int start;
		size_t offset;

		do {
			start = u64_stats_fetch_begin_irq(&rq_stats->syncp);
			for (j = 0; j < VETH_RQ_STATS_LEN; j++) {
				offset = veth_rq_stats_desc[j].offset;
				data[idx + j] = *(u64 *)(stats_base + offset);
			}
		} while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start));
		idx += VETH_RQ_STATS_LEN;
	}
}

static const struct ethtool_ops veth_ethtool_ops = {
	.get_drvinfo = veth_get_drvinfo,
	.get_link = ethtool_op_get_link,
	.get_strings = veth_get_strings,
	.get_sset_count = veth_get_sset_count,
	.get_ethtool_stats = veth_get_ethtool_stats,
	.get_link_ksettings = veth_get_link_ksettings,
	.get_ts_info = ethtool_op_get_ts_info,
};

/* general routines */

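/* Pointers enqueued on a veth_rq's xdp_ring are either sk_buffs (queued by
 * veth_xmit()) or xdp_frames (queued by veth_xdp_xmit()). xdp_frame pointers
 * are tagged with VETH_XDP_FLAG in their low bit so the consumer side can
 * tell the two apart; the helpers below set, test and strip that tag.
 */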
static bool veth_is_xdp_frame(void *ptr)
{
	return (unsigned long)ptr & VETH_XDP_FLAG;
}

static void *veth_ptr_to_xdp(void *ptr)
{
	return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG);
}

static void *veth_xdp_to_ptr(void *ptr)
{
	return (void *)((unsigned long)ptr | VETH_XDP_FLAG);
}

static void veth_ptr_free(void *ptr)
{
	if (veth_is_xdp_frame(ptr))
		xdp_return_frame(veth_ptr_to_xdp(ptr));
	else
		kfree_skb(ptr);
}

static void __veth_xdp_flush(struct veth_rq *rq)
{
	/* Write ptr_ring before reading rx_notify_masked */
	smp_mb();
	if (!rq->rx_notify_masked) {
		rq->rx_notify_masked = true;
		napi_schedule(&rq->xdp_napi);
	}
}

static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb)
{
	if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) {
		dev_kfree_skb_any(skb);
		return NET_RX_DROP;
	}

	return NET_RX_SUCCESS;
}

static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
			    struct veth_rq *rq, bool xdp)
{
	return __dev_forward_skb(dev, skb) ?: xdp ?
		veth_xdp_rx(rq, skb) :
		netif_rx(skb);
}

static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	struct veth_rq *rq = NULL;
	struct net_device *rcv;
	int length = skb->len;
	bool rcv_xdp = false;
	int rxq;

	rcu_read_lock();
	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv)) {
		kfree_skb(skb);
		goto drop;
	}

	rcv_priv = netdev_priv(rcv);
	rxq = skb_get_queue_mapping(skb);
	if (rxq < rcv->real_num_rx_queues) {
		rq = &rcv_priv->rq[rxq];
		rcv_xdp = rcu_access_pointer(rq->xdp_prog);
		skb_record_rx_queue(skb, rxq);
	}

	skb_tx_timestamp(skb);
	if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) {
		if (!rcv_xdp) {
			struct pcpu_lstats *stats = this_cpu_ptr(dev->lstats);

			u64_stats_update_begin(&stats->syncp);
			stats->bytes += length;
			stats->packets++;
			u64_stats_update_end(&stats->syncp);
		}
	} else {
drop:
		atomic64_inc(&priv->dropped);
	}

	if (rcv_xdp)
		__veth_xdp_flush(rq);

	rcu_read_unlock();

	return NETDEV_TX_OK;
}

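/* Statistics helpers: veth_stats_tx() sums the per-cpu lstats updated by
 * veth_xmit() and returns the atomic drop counter, veth_stats_rx() sums the
 * per-rq XDP counters. veth_get_stats64() combines the local counters with
 * the peer's, since frames transmitted by the peer are received here and
 * vice versa.
 */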
static u64 veth_stats_tx(struct pcpu_lstats *result, struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int cpu;

	result->packets = 0;
	result->bytes = 0;
	for_each_possible_cpu(cpu) {
		struct pcpu_lstats *stats = per_cpu_ptr(dev->lstats, cpu);
		u64 packets, bytes;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			packets = stats->packets;
			bytes = stats->bytes;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
		result->packets += packets;
		result->bytes += bytes;
	}
	return atomic64_read(&priv->dropped);
}

static void veth_stats_rx(struct veth_rq_stats *result, struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	result->xdp_packets = 0;
	result->xdp_bytes = 0;
	result->xdp_drops = 0;
	for (i = 0; i < dev->num_rx_queues; i++) {
		struct veth_rq_stats *stats = &priv->rq[i].stats;
		u64 packets, bytes, drops;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			packets = stats->xdp_packets;
			bytes = stats->xdp_bytes;
			drops = stats->xdp_drops;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
		result->xdp_packets += packets;
		result->xdp_bytes += bytes;
		result->xdp_drops += drops;
	}
}

static void veth_get_stats64(struct net_device *dev,
			     struct rtnl_link_stats64 *tot)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;
	struct veth_rq_stats rx;
	struct pcpu_lstats tx;

	tot->tx_dropped = veth_stats_tx(&tx, dev);
	tot->tx_bytes = tx.bytes;
	tot->tx_packets = tx.packets;

	veth_stats_rx(&rx, dev);
	tot->rx_dropped = rx.xdp_drops;
	tot->rx_bytes = rx.xdp_bytes;
	tot->rx_packets = rx.xdp_packets;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	if (peer) {
		tot->rx_dropped += veth_stats_tx(&tx, peer);
		tot->rx_bytes += tx.bytes;
		tot->rx_packets += tx.packets;

		veth_stats_rx(&rx, peer);
		tot->tx_bytes += rx.xdp_bytes;
		tot->tx_packets += rx.xdp_packets;
	}
	rcu_read_unlock();
}

/* fake multicast ability */
static void veth_set_multicast_list(struct net_device *dev)
{
}

static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
				      int buflen)
{
	struct sk_buff *skb;

	if (!buflen) {
		buflen = SKB_DATA_ALIGN(headroom + len) +
			 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	}
	skb = build_skb(head, buflen);
	if (!skb)
		return NULL;

	skb_reserve(skb, headroom);
	skb_put(skb, len);

	return skb;
}

static int veth_select_rxq(struct net_device *dev)
{
	return smp_processor_id() % dev->real_num_rx_queues;
}

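/* ndo_xdp_xmit implementation: frames redirected to this device are enqueued
 * directly onto the peer's xdp_ring, with the queue selected from the current
 * CPU. This only works while the peer has an XDP program attached, because
 * attaching the program is what initializes the ring and enables NAPI on the
 * receive side.
 */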
static int veth_xdp_xmit(struct net_device *dev, int n,
			 struct xdp_frame **frames, u32 flags)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	struct net_device *rcv;
	int i, ret, drops = n;
	unsigned int max_len;
	struct veth_rq *rq;

	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) {
		ret = -EINVAL;
		goto drop;
	}

	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv)) {
		ret = -ENXIO;
		goto drop;
	}

	rcv_priv = netdev_priv(rcv);
	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
	/* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive
	 * side. This means an XDP program is loaded on the peer and the peer
	 * device is up.
	 */
	if (!rcu_access_pointer(rq->xdp_prog)) {
		ret = -ENXIO;
		goto drop;
	}

	drops = 0;
	max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN;

	spin_lock(&rq->xdp_ring.producer_lock);
	for (i = 0; i < n; i++) {
		struct xdp_frame *frame = frames[i];
		void *ptr = veth_xdp_to_ptr(frame);

		if (unlikely(frame->len > max_len ||
			     __ptr_ring_produce(&rq->xdp_ring, ptr))) {
			xdp_return_frame_rx_napi(frame);
			drops++;
		}
	}
	spin_unlock(&rq->xdp_ring.producer_lock);

	if (flags & XDP_XMIT_FLUSH)
		__veth_xdp_flush(rq);

	if (likely(!drops))
		return n;

	ret = n - drops;
drop:
	atomic64_add(drops, &priv->dropped);

	return ret;
}

static void veth_xdp_flush_bq(struct net_device *dev, struct veth_xdp_tx_bq *bq)
{
	int sent, i, err = 0;

	sent = veth_xdp_xmit(dev, bq->count, bq->q, 0);
	if (sent < 0) {
		err = sent;
		sent = 0;
		for (i = 0; i < bq->count; i++)
			xdp_return_frame(bq->q[i]);
	}
	trace_xdp_bulk_tx(dev, sent, bq->count - sent, err);

	bq->count = 0;
}

static void veth_xdp_flush(struct net_device *dev, struct veth_xdp_tx_bq *bq)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	struct net_device *rcv;
	struct veth_rq *rq;

	rcu_read_lock();
	veth_xdp_flush_bq(dev, bq);
	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv))
		goto out;

	rcv_priv = netdev_priv(rcv);
	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
	/* xdp_ring is initialized on receive side? */
	if (unlikely(!rcu_access_pointer(rq->xdp_prog)))
		goto out;

	__veth_xdp_flush(rq);
out:
	rcu_read_unlock();
}

static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp,
		       struct veth_xdp_tx_bq *bq)
{
	struct xdp_frame *frame = convert_to_xdp_frame(xdp);

	if (unlikely(!frame))
		return -EOVERFLOW;

	if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE))
		veth_xdp_flush_bq(dev, bq);

	bq->q[bq->count++] = frame;

	return 0;
}

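/* Receive one xdp_frame that the peer queued via ndo_xdp_xmit. The frame is
 * run through this rq's XDP program (if any): XDP_PASS converts it into an
 * skb, XDP_TX puts it on the bulk queue for transmission back out of this
 * device (i.e. to the peer), and XDP_REDIRECT hands it to xdp_do_redirect().
 * Returns the skb to pass up the stack, or NULL when the frame was consumed
 * or dropped.
 */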
static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq,
					struct xdp_frame *frame,
					unsigned int *xdp_xmit,
					struct veth_xdp_tx_bq *bq)
{
	void *hard_start = frame->data - frame->headroom;
	int len = frame->len, delta = 0;
	struct xdp_frame orig_frame;
	struct bpf_prog *xdp_prog;
	unsigned int headroom;
	struct sk_buff *skb;

	/* bpf_xdp_adjust_head() assures BPF cannot access xdp_frame area */
	hard_start -= sizeof(struct xdp_frame);

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (likely(xdp_prog)) {
		struct xdp_buff xdp;
		u32 act;

		xdp.data_hard_start = hard_start;
		xdp.data = frame->data;
		xdp.data_end = frame->data + frame->len;
		xdp.data_meta = frame->data - frame->metasize;
		xdp.rxq = &rq->xdp_rxq;

		act = bpf_prog_run_xdp(xdp_prog, &xdp);

		switch (act) {
		case XDP_PASS:
			delta = frame->data - xdp.data;
			len = xdp.data_end - xdp.data;
			break;
		case XDP_TX:
			orig_frame = *frame;
			xdp.rxq->mem = frame->mem;
			if (unlikely(veth_xdp_tx(rq->dev, &xdp, bq) < 0)) {
				trace_xdp_exception(rq->dev, xdp_prog, act);
				frame = &orig_frame;
				goto err_xdp;
			}
			*xdp_xmit |= VETH_XDP_TX;
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_REDIRECT:
			orig_frame = *frame;
			xdp.rxq->mem = frame->mem;
			if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
				frame = &orig_frame;
				goto err_xdp;
			}
			*xdp_xmit |= VETH_XDP_REDIR;
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(act);
			/* fall through */
		case XDP_ABORTED:
			trace_xdp_exception(rq->dev, xdp_prog, act);
			/* fall through */
		case XDP_DROP:
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	headroom = sizeof(struct xdp_frame) + frame->headroom - delta;
	skb = veth_build_skb(hard_start, headroom, len, 0);
	if (!skb) {
		xdp_return_frame(frame);
		goto err;
	}

	xdp_release_frame(frame);
	xdp_scrub_frame(frame);
	skb->protocol = eth_type_trans(skb, rq->dev);
err:
	return skb;
err_xdp:
	rcu_read_unlock();
	xdp_return_frame(frame);
xdp_xmit:
	return NULL;
}

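/* Receive one sk_buff queued by the peer's veth_xmit(). If the skb is shared,
 * non-linear, has a locked head or lacks XDP_PACKET_HEADROOM of headroom, its
 * data is first copied into a freshly allocated page so the XDP program can
 * safely adjust head and tail. The verdict handling mirrors
 * veth_xdp_rcv_one().
 */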
static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb,
					unsigned int *xdp_xmit,
					struct veth_xdp_tx_bq *bq)
{
	u32 pktlen, headroom, act, metalen;
	void *orig_data, *orig_data_end;
	struct bpf_prog *xdp_prog;
	int mac_len, delta, off;
	struct xdp_buff xdp;

	skb_orphan(skb);

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (unlikely(!xdp_prog)) {
		rcu_read_unlock();
		goto out;
	}

	mac_len = skb->data - skb_mac_header(skb);
	pktlen = skb->len + mac_len;
	headroom = skb_headroom(skb) - mac_len;

	if (skb_shared(skb) || skb_head_is_locked(skb) ||
	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
		struct sk_buff *nskb;
		int size, head_off;
		void *head, *start;
		struct page *page;

		size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) +
		       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
		if (size > PAGE_SIZE)
			goto drop;

		page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
		if (!page)
			goto drop;

		head = page_address(page);
		start = head + VETH_XDP_HEADROOM;
		if (skb_copy_bits(skb, -mac_len, start, pktlen)) {
			page_frag_free(head);
			goto drop;
		}

		nskb = veth_build_skb(head,
				      VETH_XDP_HEADROOM + mac_len, skb->len,
				      PAGE_SIZE);
		if (!nskb) {
			page_frag_free(head);
			goto drop;
		}

		skb_copy_header(nskb, skb);
		head_off = skb_headroom(nskb) - skb_headroom(skb);
		skb_headers_offset_update(nskb, head_off);
		consume_skb(skb);
		skb = nskb;
	}

	xdp.data_hard_start = skb->head;
	xdp.data = skb_mac_header(skb);
	xdp.data_end = xdp.data + pktlen;
	xdp.data_meta = xdp.data;
	xdp.rxq = &rq->xdp_rxq;
	orig_data = xdp.data;
	orig_data_end = xdp.data_end;

	act = bpf_prog_run_xdp(xdp_prog, &xdp);

	switch (act) {
	case XDP_PASS:
		break;
	case XDP_TX:
		get_page(virt_to_page(xdp.data));
		consume_skb(skb);
		xdp.rxq->mem = rq->xdp_mem;
		if (unlikely(veth_xdp_tx(rq->dev, &xdp, bq) < 0)) {
			trace_xdp_exception(rq->dev, xdp_prog, act);
			goto err_xdp;
		}
		*xdp_xmit |= VETH_XDP_TX;
		rcu_read_unlock();
		goto xdp_xmit;
	case XDP_REDIRECT:
		get_page(virt_to_page(xdp.data));
		consume_skb(skb);
		xdp.rxq->mem = rq->xdp_mem;
		if (xdp_do_redirect(rq->dev, &xdp, xdp_prog))
			goto err_xdp;
		*xdp_xmit |= VETH_XDP_REDIR;
		rcu_read_unlock();
		goto xdp_xmit;
	default:
		bpf_warn_invalid_xdp_action(act);
		/* fall through */
	case XDP_ABORTED:
		trace_xdp_exception(rq->dev, xdp_prog, act);
		/* fall through */
	case XDP_DROP:
		goto drop;
	}
	rcu_read_unlock();

	delta = orig_data - xdp.data;
	off = mac_len + delta;
	if (off > 0)
		__skb_push(skb, off);
	else if (off < 0)
		__skb_pull(skb, -off);
	skb->mac_header -= delta;
	off = xdp.data_end - orig_data_end;
	if (off != 0)
		__skb_put(skb, off);
	skb->protocol = eth_type_trans(skb, rq->dev);

	metalen = xdp.data - xdp.data_meta;
	if (metalen)
		skb_metadata_set(skb, metalen);
out:
	return skb;
drop:
	rcu_read_unlock();
	kfree_skb(skb);
	return NULL;
err_xdp:
	rcu_read_unlock();
	page_frag_free(xdp.data);
xdp_xmit:
	return NULL;
}

static int veth_xdp_rcv(struct veth_rq *rq, int budget, unsigned int *xdp_xmit,
			struct veth_xdp_tx_bq *bq)
{
	int i, done = 0, drops = 0, bytes = 0;

	for (i = 0; i < budget; i++) {
		void *ptr = __ptr_ring_consume(&rq->xdp_ring);
		unsigned int xdp_xmit_one = 0;
		struct sk_buff *skb;

		if (!ptr)
			break;

		if (veth_is_xdp_frame(ptr)) {
			struct xdp_frame *frame = veth_ptr_to_xdp(ptr);

			bytes += frame->len;
			skb = veth_xdp_rcv_one(rq, frame, &xdp_xmit_one, bq);
		} else {
			skb = ptr;
			bytes += skb->len;
			skb = veth_xdp_rcv_skb(rq, skb, &xdp_xmit_one, bq);
		}
		*xdp_xmit |= xdp_xmit_one;

		if (skb)
			napi_gro_receive(&rq->xdp_napi, skb);
		else if (!xdp_xmit_one)
			drops++;

		done++;
	}

	u64_stats_update_begin(&rq->stats.syncp);
	rq->stats.xdp_packets += done;
	rq->stats.xdp_bytes += bytes;
	rq->stats.xdp_drops += drops;
	u64_stats_update_end(&rq->stats.syncp);

	return done;
}

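/* NAPI handler. Consumes up to @budget entries from the xdp_ring, then
 * re-arms by clearing rx_notify_masked; the ring is re-checked afterwards so
 * a producer racing with napi_complete_done() cannot leave entries stranded.
 * Pending XDP_TX frames and redirects are flushed before returning.
 */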
static int veth_poll(struct napi_struct *napi, int budget)
{
	struct veth_rq *rq =
		container_of(napi, struct veth_rq, xdp_napi);
	unsigned int xdp_xmit = 0;
	struct veth_xdp_tx_bq bq;
	int done;

	bq.count = 0;

	xdp_set_return_frame_no_direct();
	done = veth_xdp_rcv(rq, budget, &xdp_xmit, &bq);

	if (done < budget && napi_complete_done(napi, done)) {
		/* Write rx_notify_masked before reading ptr_ring */
		smp_store_mb(rq->rx_notify_masked, false);
		if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) {
			rq->rx_notify_masked = true;
			napi_schedule(&rq->xdp_napi);
		}
	}

	if (xdp_xmit & VETH_XDP_TX)
		veth_xdp_flush(rq->dev, &bq);
	if (xdp_xmit & VETH_XDP_REDIR)
		xdp_do_flush_map();
	xdp_clear_return_frame_no_direct();

	return done;
}

static int veth_napi_add(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
		if (err)
			goto err_xdp_ring;
	}

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
		napi_enable(&rq->xdp_napi);
	}

	return 0;
err_xdp_ring:
	for (i--; i >= 0; i--)
		ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free);

	return err;
}

static void veth_napi_del(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		napi_disable(&rq->xdp_napi);
		napi_hash_del(&rq->xdp_napi);
	}
	synchronize_net();

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		netif_napi_del(&rq->xdp_napi);
		rq->rx_notify_masked = false;
		ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
	}
}

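/* Called when an XDP program is attached while the device is up, and from
 * veth_open() when a program is already set. The first invocation registers
 * an xdp_rxq_info for every rx queue and enables NAPI; later invocations only
 * publish the new program pointer. veth_disable_xdp() undoes this in reverse
 * order.
 */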
static int veth_enable_xdp(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) {
		for (i = 0; i < dev->real_num_rx_queues; i++) {
			struct veth_rq *rq = &priv->rq[i];

			err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i);
			if (err < 0)
				goto err_rxq_reg;

			err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
							 MEM_TYPE_PAGE_SHARED,
							 NULL);
			if (err < 0)
				goto err_reg_mem;

			/* Save original mem info as it can be overwritten */
			rq->xdp_mem = rq->xdp_rxq.mem;
		}

		err = veth_napi_add(dev);
		if (err)
			goto err_rxq_reg;
	}

	for (i = 0; i < dev->real_num_rx_queues; i++)
		rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog);

	return 0;
err_reg_mem:
	xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
err_rxq_reg:
	for (i--; i >= 0; i--)
		xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);

	return err;
}

static void veth_disable_xdp(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	for (i = 0; i < dev->real_num_rx_queues; i++)
		rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);
	veth_napi_del(dev);
	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		rq->xdp_rxq.mem = rq->xdp_mem;
		xdp_rxq_info_unreg(&rq->xdp_rxq);
	}
}

static int veth_open(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);
	int err;

	if (!peer)
		return -ENOTCONN;

	if (priv->_xdp_prog) {
		err = veth_enable_xdp(dev);
		if (err)
			return err;
	}

	if (peer->flags & IFF_UP) {
		netif_carrier_on(dev);
		netif_carrier_on(peer);
	}

	return 0;
}

static int veth_close(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);

	netif_carrier_off(dev);
	if (peer)
		netif_carrier_off(peer);

	if (priv->_xdp_prog)
		veth_disable_xdp(dev);

	return 0;
}

static int is_valid_veth_mtu(int mtu)
{
	return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU;
}

static int veth_alloc_queues(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL);
	if (!priv->rq)
		return -ENOMEM;

	for (i = 0; i < dev->num_rx_queues; i++) {
		priv->rq[i].dev = dev;
		u64_stats_init(&priv->rq[i].stats.syncp);
	}

	return 0;
}

static void veth_free_queues(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);

	kfree(priv->rq);
}

static int veth_dev_init(struct net_device *dev)
{
	int err;

	dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
	if (!dev->lstats)
		return -ENOMEM;

	err = veth_alloc_queues(dev);
	if (err) {
		free_percpu(dev->lstats);
		return err;
	}

	return 0;
}

static void veth_dev_free(struct net_device *dev)
{
	veth_free_queues(dev);
	free_percpu(dev->lstats);
}

#ifdef CONFIG_NET_POLL_CONTROLLER
static void veth_poll_controller(struct net_device *dev)
{
	/* veth only receives frames when its peer sends one
	 * Since it has nothing to do with disabling irqs, we are guaranteed
	 * never to have pending data when we poll for it so
	 * there is nothing to do here.
	 *
	 * We need this though so netpoll recognizes us as an interface that
	 * supports polling, which enables bridge devices in virt setups to
	 * still use netconsole
	 */
}
#endif	/* CONFIG_NET_POLL_CONTROLLER */

static int veth_get_iflink(const struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;
	int iflink;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	iflink = peer ? peer->ifindex : 0;
	rcu_read_unlock();

	return iflink;
}

static netdev_features_t veth_fix_features(struct net_device *dev,
					   netdev_features_t features)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;

	peer = rtnl_dereference(priv->peer);
	if (peer) {
		struct veth_priv *peer_priv = netdev_priv(peer);

		if (peer_priv->_xdp_prog)
			features &= ~NETIF_F_GSO_SOFTWARE;
	}

	return features;
}

static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
{
	struct veth_priv *peer_priv, *priv = netdev_priv(dev);
	struct net_device *peer;

	if (new_hr < 0)
		new_hr = 0;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	if (unlikely(!peer))
		goto out;

	peer_priv = netdev_priv(peer);
	priv->requested_headroom = new_hr;
	new_hr = max(priv->requested_headroom, peer_priv->requested_headroom);
	dev->needed_headroom = new_hr;
	peer->needed_headroom = new_hr;

out:
	rcu_read_unlock();
}

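/* Install or remove the XDP program. Attaching requires that the peer exists,
 * that the peer's MTU leaves room for XDP headroom plus skb_shared_info
 * within one page, and that this device has at least as many rx queues as the
 * peer has tx queues. While a program is attached, software GSO is disabled
 * on the peer and its MTU is capped so every frame fits in a single page.
 */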
static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
			struct netlink_ext_ack *extack)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct bpf_prog *old_prog;
	struct net_device *peer;
	unsigned int max_mtu;
	int err;

	old_prog = priv->_xdp_prog;
	priv->_xdp_prog = prog;
	peer = rtnl_dereference(priv->peer);

	if (prog) {
		if (!peer) {
			NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached");
			err = -ENOTCONN;
			goto err;
		}

		max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM -
			  peer->hard_header_len -
			  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
		if (peer->mtu > max_mtu) {
			NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP");
			err = -ERANGE;
			goto err;
		}

		if (dev->real_num_rx_queues < peer->real_num_tx_queues) {
			NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues");
			err = -ENOSPC;
			goto err;
		}

		if (dev->flags & IFF_UP) {
			err = veth_enable_xdp(dev);
			if (err) {
				NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed");
				goto err;
			}
		}

		if (!old_prog) {
			peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
			peer->max_mtu = max_mtu;
		}
	}

	if (old_prog) {
		if (!prog) {
			if (dev->flags & IFF_UP)
				veth_disable_xdp(dev);

			if (peer) {
				peer->hw_features |= NETIF_F_GSO_SOFTWARE;
				peer->max_mtu = ETH_MAX_MTU;
			}
		}
		bpf_prog_put(old_prog);
	}

	if ((!!old_prog ^ !!prog) && peer)
		netdev_update_features(peer);

	return 0;
err:
	priv->_xdp_prog = old_prog;

	return err;
}

static u32 veth_xdp_query(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	const struct bpf_prog *xdp_prog;

	xdp_prog = priv->_xdp_prog;
	if (xdp_prog)
		return xdp_prog->aux->id;

	return 0;
}

static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return veth_xdp_set(dev, xdp->prog, xdp->extack);
	case XDP_QUERY_PROG:
		xdp->prog_id = veth_xdp_query(dev);
		return 0;
	default:
		return -EINVAL;
	}
}

static const struct net_device_ops veth_netdev_ops = {
	.ndo_init = veth_dev_init,
	.ndo_open = veth_open,
	.ndo_stop = veth_close,
	.ndo_start_xmit = veth_xmit,
	.ndo_get_stats64 = veth_get_stats64,
	.ndo_set_rx_mode = veth_set_multicast_list,
	.ndo_set_mac_address = eth_mac_addr,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller = veth_poll_controller,
#endif
	.ndo_get_iflink = veth_get_iflink,
	.ndo_fix_features = veth_fix_features,
	.ndo_features_check = passthru_features_check,
	.ndo_set_rx_headroom = veth_set_rx_headroom,
	.ndo_bpf = veth_xdp,
	.ndo_xdp_xmit = veth_xdp_xmit,
};

#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
		       NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \
		       NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \
		       NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \
		       NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX)

static void veth_setup(struct net_device *dev)
{
	ether_setup(dev);

	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
	dev->priv_flags |= IFF_NO_QUEUE;
	dev->priv_flags |= IFF_PHONY_HEADROOM;

	dev->netdev_ops = &veth_netdev_ops;
	dev->ethtool_ops = &veth_ethtool_ops;
	dev->features |= NETIF_F_LLTX;
	dev->features |= VETH_FEATURES;
	dev->vlan_features = dev->features &
			     ~(NETIF_F_HW_VLAN_CTAG_TX |
			       NETIF_F_HW_VLAN_STAG_TX |
			       NETIF_F_HW_VLAN_CTAG_RX |
			       NETIF_F_HW_VLAN_STAG_RX);
	dev->needs_free_netdev = true;
	dev->priv_destructor = veth_dev_free;
	dev->max_mtu = ETH_MAX_MTU;

	dev->hw_features = VETH_FEATURES;
	dev->hw_enc_features = VETH_FEATURES;
	dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
}

/*
 * netlink interface
 */

static int veth_validate(struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}
	if (tb[IFLA_MTU]) {
		if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU])))
			return -EINVAL;
	}
	return 0;
}

static struct rtnl_link_ops veth_link_ops;

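/* Create a veth pair. The peer is described by the optional VETH_INFO_PEER
 * attribute (possibly placing it in a different netns); it is created and
 * registered first, then the requesting device is registered and the two
 * priv->peer pointers are tied together under RTNL.
 */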
static int veth_newlink(struct net *src_net, struct net_device *dev,
			struct nlattr *tb[], struct nlattr *data[],
			struct netlink_ext_ack *extack)
{
	int err;
	struct net_device *peer;
	struct veth_priv *priv;
	char ifname[IFNAMSIZ];
	struct nlattr *peer_tb[IFLA_MAX + 1], **tbp;
	unsigned char name_assign_type;
	struct ifinfomsg *ifmp;
	struct net *net;

	/*
	 * create and register peer first
	 */
	if (data != NULL && data[VETH_INFO_PEER] != NULL) {
		struct nlattr *nla_peer;

		nla_peer = data[VETH_INFO_PEER];
		ifmp = nla_data(nla_peer);
		err = rtnl_nla_parse_ifla(peer_tb,
					  nla_data(nla_peer) + sizeof(struct ifinfomsg),
					  nla_len(nla_peer) - sizeof(struct ifinfomsg),
					  NULL);
		if (err < 0)
			return err;

		err = veth_validate(peer_tb, NULL, extack);
		if (err < 0)
			return err;

		tbp = peer_tb;
	} else {
		ifmp = NULL;
		tbp = tb;
	}

	if (ifmp && tbp[IFLA_IFNAME]) {
		nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
		name_assign_type = NET_NAME_USER;
	} else {
		snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d");
		name_assign_type = NET_NAME_ENUM;
	}

	net = rtnl_link_get_net(src_net, tbp);
	if (IS_ERR(net))
		return PTR_ERR(net);

	peer = rtnl_create_link(net, ifname, name_assign_type,
				&veth_link_ops, tbp, extack);
	if (IS_ERR(peer)) {
		put_net(net);
		return PTR_ERR(peer);
	}

	if (!ifmp || !tbp[IFLA_ADDRESS])
		eth_hw_addr_random(peer);

	if (ifmp && (dev->ifindex != 0))
		peer->ifindex = ifmp->ifi_index;

	peer->gso_max_size = dev->gso_max_size;
	peer->gso_max_segs = dev->gso_max_segs;

	err = register_netdevice(peer);
	put_net(net);
	net = NULL;
	if (err < 0)
		goto err_register_peer;

	netif_carrier_off(peer);

	err = rtnl_configure_link(peer, ifmp);
	if (err < 0)
		goto err_configure_peer;

	/*
	 * register dev last
	 *
	 * note, that since we've registered new device the dev's name
	 * should be re-allocated
	 */
	if (tb[IFLA_ADDRESS] == NULL)
		eth_hw_addr_random(dev);

	if (tb[IFLA_IFNAME])
		nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
	else
		snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d");

	err = register_netdevice(dev);
	if (err < 0)
		goto err_register_dev;

	netif_carrier_off(dev);

	/*
	 * tie the devices together
	 */
	priv = netdev_priv(dev);
	rcu_assign_pointer(priv->peer, peer);

	priv = netdev_priv(peer);
	rcu_assign_pointer(priv->peer, dev);

	return 0;

err_register_dev:
	/* nothing to do */
err_configure_peer:
	unregister_netdevice(peer);
	return err;

err_register_peer:
	free_netdev(peer);
	return err;
}

static void veth_dellink(struct net_device *dev, struct list_head *head)
{
	struct veth_priv *priv;
	struct net_device *peer;

	priv = netdev_priv(dev);
	peer = rtnl_dereference(priv->peer);

	/* Note : dellink() is called from default_device_exit_batch(),
	 * before a rcu_synchronize() point. The devices are guaranteed
	 * not being freed before one RCU grace period.
	 */
	RCU_INIT_POINTER(priv->peer, NULL);
	unregister_netdevice_queue(dev, head);

	if (peer) {
		priv = netdev_priv(peer);
		RCU_INIT_POINTER(priv->peer, NULL);
		unregister_netdevice_queue(peer, head);
	}
}

static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = {
	[VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) },
};

static struct net *veth_get_link_net(const struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);

	return peer ? dev_net(peer) : dev_net(dev);
}

static struct rtnl_link_ops veth_link_ops = {
	.kind = DRV_NAME,
	.priv_size = sizeof(struct veth_priv),
	.setup = veth_setup,
	.validate = veth_validate,
	.newlink = veth_newlink,
	.dellink = veth_dellink,
	.policy = veth_policy,
	.maxtype = VETH_INFO_MAX,
	.get_link_net = veth_get_link_net,
};

/*
 * init/fini
 */

static __init int veth_init(void)
{
	return rtnl_link_register(&veth_link_ops);
}

static __exit void veth_exit(void)
{
	rtnl_link_unregister(&veth_link_ops);
}

module_init(veth_init);
module_exit(veth_exit);

MODULE_DESCRIPTION("Virtual Ethernet Tunnel");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_RTNL_LINK(DRV_NAME);