fs.c 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083
  1. /*-
  2. * SPDX-License-Identifier: BSD-2-Clause
  3. *
  4. * Copyright (c) 2022 The FreeBSD Foundation
  5. *
  6. * This software was developed by Mark Johnston under sponsorship from
  7. * the FreeBSD Foundation.
  8. *
  9. * Redistribution and use in source and binary forms, with or without
  10. * modification, are permitted provided that the following conditions are
  11. * met:
  12. * 1. Redistributions of source code must retain the above copyright
  13. * notice, this list of conditions and the following disclaimer.
  14. * 2. Redistributions in binary form must reproduce the above copyright
  15. * notice, this list of conditions and the following disclaimer in
  16. * the documentation and/or other materials provided with the distribution.
  17. *
  18. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  19. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  20. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  21. * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  22. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  23. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  24. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  25. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  26. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  27. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  28. * SUCH DAMAGE.
  29. */
  30. #include <sys/stat.h>
  31. #include <assert.h>
  32. #include <dirent.h>
  33. #include <fcntl.h>
  34. #include <stdlib.h>
  35. #include <string.h>
  36. #include <unistd.h>
  37. #include <util.h>
  38. #include "makefs.h"
  39. #include "zfs.h"
  40. typedef struct {
  41. const char *name;
  42. unsigned int id;
  43. uint16_t size;
  44. sa_bswap_type_t bs;
  45. } zfs_sattr_t;
  46. typedef struct zfs_fs {
  47. zfs_objset_t *os;
  48. /* Offset table for system attributes, indexed by a zpl_attr_t. */
  49. uint16_t *saoffs;
  50. size_t sacnt;
  51. const zfs_sattr_t *satab;
  52. } zfs_fs_t;
  /*
   * ZPL system attribute numbers.
   *
   * The ordering carries no meaning of its own; it is simply the one that
   * OpenZFS hard-codes, as seen in a zdb dump of the SA_REGISTRY table.
   */
  typedef enum zpl_attr {
      ZPL_ATIME = 0,
      ZPL_MTIME,
      ZPL_CTIME,
      ZPL_CRTIME,
      ZPL_GEN,
      ZPL_MODE,
      ZPL_SIZE,
      ZPL_PARENT,
      ZPL_LINKS,
      ZPL_XATTR,
      ZPL_RDEV,
      ZPL_FLAGS,
      ZPL_UID,
      ZPL_GID,
      ZPL_PAD,
      ZPL_ZNODE_ACL,
      ZPL_DACL_COUNT,
      ZPL_SYMLINK,
      ZPL_SCANSTAMP,
      ZPL_DACL_ACES,
      ZPL_DXATTR,
      ZPL_PROJID,
  } zpl_attr_t;
  81. /*
  82. * This table must be kept in sync with zpl_attr_layout[] and zpl_attr_t.
  83. */
  84. static const zfs_sattr_t zpl_attrs[] = {
  85. #define _ZPL_ATTR(n, s, b) { .name = #n, .id = n, .size = s, .bs = b }
  86. _ZPL_ATTR(ZPL_ATIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
  87. _ZPL_ATTR(ZPL_MTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
  88. _ZPL_ATTR(ZPL_CTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
  89. _ZPL_ATTR(ZPL_CRTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
  90. _ZPL_ATTR(ZPL_GEN, sizeof(uint64_t), SA_UINT64_ARRAY),
  91. _ZPL_ATTR(ZPL_MODE, sizeof(uint64_t), SA_UINT64_ARRAY),
  92. _ZPL_ATTR(ZPL_SIZE, sizeof(uint64_t), SA_UINT64_ARRAY),
  93. _ZPL_ATTR(ZPL_PARENT, sizeof(uint64_t), SA_UINT64_ARRAY),
  94. _ZPL_ATTR(ZPL_LINKS, sizeof(uint64_t), SA_UINT64_ARRAY),
  95. _ZPL_ATTR(ZPL_XATTR, sizeof(uint64_t), SA_UINT64_ARRAY),
  96. _ZPL_ATTR(ZPL_RDEV, sizeof(uint64_t), SA_UINT64_ARRAY),
  97. _ZPL_ATTR(ZPL_FLAGS, sizeof(uint64_t), SA_UINT64_ARRAY),
  98. _ZPL_ATTR(ZPL_UID, sizeof(uint64_t), SA_UINT64_ARRAY),
  99. _ZPL_ATTR(ZPL_GID, sizeof(uint64_t), SA_UINT64_ARRAY),
  100. _ZPL_ATTR(ZPL_PAD, sizeof(uint64_t), SA_UINT64_ARRAY),
  101. _ZPL_ATTR(ZPL_ZNODE_ACL, 88, SA_UINT64_ARRAY),
  102. _ZPL_ATTR(ZPL_DACL_COUNT, sizeof(uint64_t), SA_UINT64_ARRAY),
  103. _ZPL_ATTR(ZPL_SYMLINK, 0, SA_UINT8_ARRAY),
  104. _ZPL_ATTR(ZPL_SCANSTAMP, sizeof(uint64_t) * 4, SA_UINT8_ARRAY),
  105. _ZPL_ATTR(ZPL_DACL_ACES, 0, SA_ACL),
  106. _ZPL_ATTR(ZPL_DXATTR, 0, SA_UINT8_ARRAY),
  107. _ZPL_ATTR(ZPL_PROJID, sizeof(uint64_t), SA_UINT64_ARRAY),
  108. #undef ZPL_ATTR
  109. };
  110. /*
  111. * This layout matches that of a filesystem created using OpenZFS on FreeBSD.
  112. * It need not match in general, but FreeBSD's loader doesn't bother parsing the
  113. * layout and just hard-codes attribute offsets.
  114. */
  115. static const sa_attr_type_t zpl_attr_layout[] = {
  116. ZPL_MODE,
  117. ZPL_SIZE,
  118. ZPL_GEN,
  119. ZPL_UID,
  120. ZPL_GID,
  121. ZPL_PARENT,
  122. ZPL_FLAGS,
  123. ZPL_ATIME,
  124. ZPL_MTIME,
  125. ZPL_CTIME,
  126. ZPL_CRTIME,
  127. ZPL_LINKS,
  128. ZPL_DACL_COUNT,
  129. ZPL_DACL_ACES,
  130. ZPL_SYMLINK,
  131. };
  /*
   * Keys under which the ZPL attribute layouts are stored in the SA layout
   * ZAP.  Indices 0 and 1 are reserved for the legacy attribute encoding.
   */
  enum {
      SA_LAYOUT_INDEX_DEFAULT = 2,    /* layout without ZPL_SYMLINK */
      SA_LAYOUT_INDEX_SYMLINK = 3     /* layout ending in ZPL_SYMLINK */
  };
  138. struct fs_populate_dir {
  139. SLIST_ENTRY(fs_populate_dir) next;
  140. int dirfd;
  141. uint64_t objid;
  142. zfs_zap_t *zap;
  143. };
  144. struct fs_populate_arg {
  145. zfs_opt_t *zfs;
  146. zfs_fs_t *fs; /* owning filesystem */
  147. uint64_t rootdirid; /* root directory dnode ID */
  148. int rootdirfd; /* root directory fd */
  149. SLIST_HEAD(, fs_populate_dir) dirs; /* stack of directories */
  150. };
  151. static void fs_build_one(zfs_opt_t *, zfs_dsl_dir_t *, fsnode *, int);
  /*
   * close(2) wrapper that treats failure as fatal.
   */
  static void
  eclose(int fd)
  {
      int rv;

      rv = close(fd);
      if (rv == 0)
          return;
      err(1, "close");
  }
  158. static bool
  159. fsnode_isroot(const fsnode *cur)
  160. {
  161. return (strcmp(cur->name, ".") == 0);
  162. }
  163. /*
  164. * Visit each node in a directory hierarchy, in pre-order depth-first order.
  165. */
  166. static void
  167. fsnode_foreach(fsnode *root, int (*cb)(fsnode *, void *), void *arg)
  168. {
  169. assert(root->type == S_IFDIR);
  170. for (fsnode *cur = root; cur != NULL; cur = cur->next) {
  171. assert(cur->type == S_IFREG || cur->type == S_IFDIR ||
  172. cur->type == S_IFLNK);
  173. if (cb(cur, arg) == 0)
  174. continue;
  175. if (cur->type == S_IFDIR && cur->child != NULL)
  176. fsnode_foreach(cur->child, cb, arg);
  177. }
  178. }
  179. static void
  180. fs_populate_dirent(struct fs_populate_arg *arg, fsnode *cur, uint64_t dnid)
  181. {
  182. struct fs_populate_dir *dir;
  183. uint64_t type;
  184. switch (cur->type) {
  185. case S_IFREG:
  186. type = DT_REG;
  187. break;
  188. case S_IFDIR:
  189. type = DT_DIR;
  190. break;
  191. case S_IFLNK:
  192. type = DT_LNK;
  193. break;
  194. default:
  195. assert(0);
  196. }
  197. dir = SLIST_FIRST(&arg->dirs);
  198. zap_add_uint64(dir->zap, cur->name, ZFS_DIRENT_MAKE(type, dnid));
  199. }
  200. static void
  201. fs_populate_attr(zfs_fs_t *fs, char *attrbuf, const void *val, uint16_t ind,
  202. size_t *szp)
  203. {
  204. assert(ind < fs->sacnt);
  205. assert(fs->saoffs[ind] != 0xffff);
  206. memcpy(attrbuf + fs->saoffs[ind], val, fs->satab[ind].size);
  207. *szp += fs->satab[ind].size;
  208. }
  209. static void
  210. fs_populate_varszattr(zfs_fs_t *fs, char *attrbuf, const void *val,
  211. size_t valsz, size_t varoff, uint16_t ind, size_t *szp)
  212. {
  213. assert(ind < fs->sacnt);
  214. assert(fs->saoffs[ind] != 0xffff);
  215. assert(fs->satab[ind].size == 0);
  216. memcpy(attrbuf + fs->saoffs[ind] + varoff, val, valsz);
  217. *szp += valsz;
  218. }
  219. /*
  220. * Derive the relative fd/path combo needed to access a file. Ideally we'd
  221. * always be able to use relative lookups (i.e., use the *at() system calls),
  222. * since they require less path translation and are more amenable to sandboxing,
  223. * but the handling of multiple staging directories makes that difficult. To
  224. * make matters worse, we have no choice but to use relative lookups when
  225. * dealing with an mtree manifest, so both mechanisms are implemented.
  226. */
  227. static void
  228. fs_populate_path(const fsnode *cur, struct fs_populate_arg *arg,
  229. char *path, size_t sz, int *dirfdp)
  230. {
  231. if (cur->contents != NULL) {
  232. size_t n;
  233. *dirfdp = AT_FDCWD;
  234. n = strlcpy(path, cur->contents, sz);
  235. assert(n < sz);
  236. } else if (cur->root == NULL) {
  237. size_t n;
  238. *dirfdp = SLIST_FIRST(&arg->dirs)->dirfd;
  239. n = strlcpy(path, cur->name, sz);
  240. assert(n < sz);
  241. } else {
  242. int n;
  243. *dirfdp = AT_FDCWD;
  244. n = snprintf(path, sz, "%s/%s/%s",
  245. cur->root, cur->path, cur->name);
  246. assert(n >= 0);
  247. assert((size_t)n < sz);
  248. }
  249. }
  250. static int
  251. fs_open(const fsnode *cur, struct fs_populate_arg *arg, int flags)
  252. {
  253. char path[PATH_MAX];
  254. int fd;
  255. fs_populate_path(cur, arg, path, sizeof(path), &fd);
  256. fd = openat(fd, path, flags);
  257. if (fd < 0)
  258. err(1, "openat(%s)", path);
  259. return (fd);
  260. }
  261. static int
  262. fs_open_can_fail(const fsnode *cur, struct fs_populate_arg *arg, int flags)
  263. {
  264. int fd;
  265. char path[PATH_MAX];
  266. fs_populate_path(cur, arg, path, sizeof(path), &fd);
  267. return (openat(fd, path, flags));
  268. }
  269. static void
  270. fs_readlink(const fsnode *cur, struct fs_populate_arg *arg,
  271. char *buf, size_t bufsz)
  272. {
  273. char path[PATH_MAX];
  274. int fd;
  275. if (cur->symlink != NULL) {
  276. size_t n;
  277. n = strlcpy(buf, cur->symlink, bufsz);
  278. assert(n < bufsz);
  279. } else {
  280. ssize_t n;
  281. fs_populate_path(cur, arg, path, sizeof(path), &fd);
  282. n = readlinkat(fd, path, buf, bufsz - 1);
  283. if (n == -1)
  284. err(1, "readlinkat(%s)", cur->name);
  285. buf[n] = '\0';
  286. }
  287. }
  288. static void
  289. fs_populate_time(zfs_fs_t *fs, char *attrbuf, struct timespec *ts,
  290. uint16_t ind, size_t *szp)
  291. {
  292. uint64_t timebuf[2];
  293. assert(ind < fs->sacnt);
  294. assert(fs->saoffs[ind] != 0xffff);
  295. assert(fs->satab[ind].size == sizeof(timebuf));
  296. timebuf[0] = ts->tv_sec;
  297. timebuf[1] = ts->tv_nsec;
  298. fs_populate_attr(fs, attrbuf, timebuf, ind, szp);
  299. }
  300. static void
  301. fs_populate_sattrs(struct fs_populate_arg *arg, const fsnode *cur,
  302. dnode_phys_t *dnode)
  303. {
  304. char target[PATH_MAX];
  305. zfs_fs_t *fs;
  306. zfs_ace_hdr_t aces[3];
  307. struct stat *sb;
  308. sa_hdr_phys_t *sahdr;
  309. uint64_t daclcount, flags, gen, gid, links, mode, parent, objsize, uid;
  310. char *attrbuf;
  311. size_t bonussz, hdrsz;
  312. int layout;
  313. assert(dnode->dn_bonustype == DMU_OT_SA);
  314. assert(dnode->dn_nblkptr == 1);
  315. fs = arg->fs;
  316. sb = &cur->inode->st;
  317. switch (cur->type) {
  318. case S_IFREG:
  319. layout = SA_LAYOUT_INDEX_DEFAULT;
  320. links = cur->inode->nlink;
  321. objsize = sb->st_size;
  322. parent = SLIST_FIRST(&arg->dirs)->objid;
  323. break;
  324. case S_IFDIR:
  325. layout = SA_LAYOUT_INDEX_DEFAULT;
  326. links = 1; /* .. */
  327. objsize = 1; /* .. */
  328. /*
  329. * The size of a ZPL directory is the number of entries
  330. * (including "." and ".."), and the link count is the number of
  331. * entries which are directories (including "." and "..").
  332. */
  333. for (fsnode *c = fsnode_isroot(cur) ? cur->next : cur->child;
  334. c != NULL; c = c->next) {
  335. if (c->type == S_IFDIR)
  336. links++;
  337. objsize++;
  338. }
  339. /* The root directory is its own parent. */
  340. parent = SLIST_EMPTY(&arg->dirs) ?
  341. arg->rootdirid : SLIST_FIRST(&arg->dirs)->objid;
  342. break;
  343. case S_IFLNK:
  344. fs_readlink(cur, arg, target, sizeof(target));
  345. layout = SA_LAYOUT_INDEX_SYMLINK;
  346. links = 1;
  347. objsize = strlen(target);
  348. parent = SLIST_FIRST(&arg->dirs)->objid;
  349. break;
  350. default:
  351. assert(0);
  352. }
  353. daclcount = nitems(aces);
  354. flags = ZFS_ACL_TRIVIAL | ZFS_ACL_AUTO_INHERIT | ZFS_ARCHIVE |
  355. ZFS_AV_MODIFIED;
  356. gen = 1;
  357. gid = sb->st_gid;
  358. mode = sb->st_mode;
  359. uid = sb->st_uid;
  360. memset(aces, 0, sizeof(aces));
  361. aces[0].z_flags = ACE_OWNER;
  362. aces[0].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
  363. aces[0].z_access_mask = ACE_WRITE_ATTRIBUTES | ACE_WRITE_OWNER |
  364. ACE_WRITE_ACL | ACE_WRITE_NAMED_ATTRS | ACE_READ_ACL |
  365. ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
  366. if ((mode & S_IRUSR) != 0)
  367. aces[0].z_access_mask |= ACE_READ_DATA;
  368. if ((mode & S_IWUSR) != 0)
  369. aces[0].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
  370. if ((mode & S_IXUSR) != 0)
  371. aces[0].z_access_mask |= ACE_EXECUTE;
  372. aces[1].z_flags = ACE_GROUP | ACE_IDENTIFIER_GROUP;
  373. aces[1].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
  374. aces[1].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES |
  375. ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
  376. if ((mode & S_IRGRP) != 0)
  377. aces[1].z_access_mask |= ACE_READ_DATA;
  378. if ((mode & S_IWGRP) != 0)
  379. aces[1].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
  380. if ((mode & S_IXGRP) != 0)
  381. aces[1].z_access_mask |= ACE_EXECUTE;
  382. aces[2].z_flags = ACE_EVERYONE;
  383. aces[2].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
  384. aces[2].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES |
  385. ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
  386. if ((mode & S_IROTH) != 0)
  387. aces[2].z_access_mask |= ACE_READ_DATA;
  388. if ((mode & S_IWOTH) != 0)
  389. aces[2].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
  390. if ((mode & S_IXOTH) != 0)
  391. aces[2].z_access_mask |= ACE_EXECUTE;
  392. switch (layout) {
  393. case SA_LAYOUT_INDEX_DEFAULT:
  394. /* At most one variable-length attribute. */
  395. hdrsz = sizeof(uint64_t);
  396. break;
  397. case SA_LAYOUT_INDEX_SYMLINK:
  398. /* At most five variable-length attributes. */
  399. hdrsz = sizeof(uint64_t) * 2;
  400. break;
  401. default:
  402. assert(0);
  403. }
  404. sahdr = (sa_hdr_phys_t *)DN_BONUS(dnode);
  405. sahdr->sa_magic = SA_MAGIC;
  406. SA_HDR_LAYOUT_INFO_ENCODE(sahdr->sa_layout_info, layout, hdrsz);
  407. bonussz = SA_HDR_SIZE(sahdr);
  408. attrbuf = (char *)sahdr + SA_HDR_SIZE(sahdr);
  409. fs_populate_attr(fs, attrbuf, &daclcount, ZPL_DACL_COUNT, &bonussz);
  410. fs_populate_attr(fs, attrbuf, &flags, ZPL_FLAGS, &bonussz);
  411. fs_populate_attr(fs, attrbuf, &gen, ZPL_GEN, &bonussz);
  412. fs_populate_attr(fs, attrbuf, &gid, ZPL_GID, &bonussz);
  413. fs_populate_attr(fs, attrbuf, &links, ZPL_LINKS, &bonussz);
  414. fs_populate_attr(fs, attrbuf, &mode, ZPL_MODE, &bonussz);
  415. fs_populate_attr(fs, attrbuf, &parent, ZPL_PARENT, &bonussz);
  416. fs_populate_attr(fs, attrbuf, &objsize, ZPL_SIZE, &bonussz);
  417. fs_populate_attr(fs, attrbuf, &uid, ZPL_UID, &bonussz);
  418. /*
  419. * We deliberately set atime = mtime here to ensure that images are
  420. * reproducible.
  421. */
  422. fs_populate_time(fs, attrbuf, &sb->st_mtim, ZPL_ATIME, &bonussz);
  423. fs_populate_time(fs, attrbuf, &sb->st_ctim, ZPL_CTIME, &bonussz);
  424. fs_populate_time(fs, attrbuf, &sb->st_mtim, ZPL_MTIME, &bonussz);
  425. #ifdef __linux__
  426. /* Linux has no st_birthtim; approximate with st_ctim */
  427. fs_populate_time(fs, attrbuf, &sb->st_ctim, ZPL_CRTIME, &bonussz);
  428. #else
  429. fs_populate_time(fs, attrbuf, &sb->st_birthtim, ZPL_CRTIME, &bonussz);
  430. #endif
  431. fs_populate_varszattr(fs, attrbuf, aces, sizeof(aces), 0,
  432. ZPL_DACL_ACES, &bonussz);
  433. sahdr->sa_lengths[0] = sizeof(aces);
  434. if (cur->type == S_IFLNK) {
  435. assert(layout == SA_LAYOUT_INDEX_SYMLINK);
  436. /* Need to use a spill block pointer if the target is long. */
  437. assert(bonussz + objsize <= DN_OLD_MAX_BONUSLEN);
  438. fs_populate_varszattr(fs, attrbuf, target, objsize,
  439. sahdr->sa_lengths[0], ZPL_SYMLINK, &bonussz);
  440. sahdr->sa_lengths[1] = (uint16_t)objsize;
  441. }
  442. dnode->dn_bonuslen = bonussz;
  443. }
  444. static void
  445. fs_populate_file(fsnode *cur, struct fs_populate_arg *arg)
  446. {
  447. struct dnode_cursor *c;
  448. dnode_phys_t *dnode;
  449. zfs_opt_t *zfs;
  450. char *buf;
  451. uint64_t dnid;
  452. ssize_t n;
  453. size_t bufsz;
  454. off_t nbytes, reqbytes, size;
  455. int fd;
  456. assert(cur->type == S_IFREG);
  457. assert((cur->inode->flags & FI_ROOT) == 0);
  458. zfs = arg->zfs;
  459. assert(cur->inode->ino != 0);
  460. if ((cur->inode->flags & FI_ALLOCATED) != 0) {
  461. /*
  462. * This is a hard link of an existing file.
  463. *
  464. * XXX-MJ need to check whether it crosses datasets, add a test
  465. * case for that
  466. */
  467. fs_populate_dirent(arg, cur, cur->inode->ino);
  468. return;
  469. }
  470. dnode = objset_dnode_bonus_alloc(arg->fs->os,
  471. DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid);
  472. cur->inode->ino = dnid;
  473. cur->inode->flags |= FI_ALLOCATED;
  474. fd = fs_open(cur, arg, O_RDONLY);
  475. buf = zfs->filebuf;
  476. bufsz = sizeof(zfs->filebuf);
  477. size = cur->inode->st.st_size;
  478. c = dnode_cursor_init(zfs, arg->fs->os, dnode, size, 0);
  479. for (off_t foff = 0; foff < size; foff += nbytes) {
  480. off_t loc, sofar;
  481. /*
  482. * Fill up our buffer, handling partial reads.
  483. */
  484. sofar = 0;
  485. nbytes = MIN(size - foff, (off_t)bufsz);
  486. do {
  487. n = read(fd, buf + sofar, nbytes);
  488. if (n < 0)
  489. err(1, "reading from '%s'", cur->name);
  490. if (n == 0)
  491. errx(1, "unexpected EOF reading '%s'",
  492. cur->name);
  493. sofar += n;
  494. } while (sofar < nbytes);
  495. if (nbytes < (off_t)bufsz)
  496. memset(buf + nbytes, 0, bufsz - nbytes);
  497. reqbytes = foff == 0 ? nbytes : MAXBLOCKSIZE;
  498. loc = objset_space_alloc(zfs, arg->fs->os, &reqbytes);
  499. vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, buf, reqbytes, loc,
  500. dnode_cursor_next(zfs, c, foff));
  501. }
  502. eclose(fd);
  503. dnode_cursor_finish(zfs, c);
  504. fs_populate_sattrs(arg, cur, dnode);
  505. fs_populate_dirent(arg, cur, dnid);
  506. }
  507. static void
  508. fs_populate_dir(fsnode *cur, struct fs_populate_arg *arg)
  509. {
  510. dnode_phys_t *dnode;
  511. zfs_objset_t *os;
  512. uint64_t dnid;
  513. int dirfd;
  514. assert(cur->type == S_IFDIR);
  515. assert((cur->inode->flags & FI_ALLOCATED) == 0);
  516. os = arg->fs->os;
  517. dnode = objset_dnode_bonus_alloc(os, DMU_OT_DIRECTORY_CONTENTS,
  518. DMU_OT_SA, 0, &dnid);
  519. /*
  520. * Add an entry to the parent directory and open this directory.
  521. */
  522. if (!SLIST_EMPTY(&arg->dirs)) {
  523. fs_populate_dirent(arg, cur, dnid);
  524. /*
  525. * We only need the directory fd if we're finding files in
  526. * it. If it's just there for other directories or
  527. * files using contents= we don't need to succeed here.
  528. */
  529. dirfd = fs_open_can_fail(cur, arg, O_DIRECTORY | O_RDONLY);
  530. } else {
  531. arg->rootdirid = dnid;
  532. dirfd = arg->rootdirfd;
  533. arg->rootdirfd = -1;
  534. }
  535. /*
  536. * Set ZPL attributes.
  537. */
  538. fs_populate_sattrs(arg, cur, dnode);
  539. /*
  540. * If this is a root directory, then its children belong to a different
  541. * dataset and this directory remains empty in the current objset.
  542. */
  543. if ((cur->inode->flags & FI_ROOT) == 0) {
  544. struct fs_populate_dir *dir;
  545. dir = ecalloc(1, sizeof(*dir));
  546. dir->dirfd = dirfd;
  547. dir->objid = dnid;
  548. dir->zap = zap_alloc(os, dnode);
  549. SLIST_INSERT_HEAD(&arg->dirs, dir, next);
  550. } else {
  551. zap_write(arg->zfs, zap_alloc(os, dnode));
  552. fs_build_one(arg->zfs, cur->inode->param, cur->child, dirfd);
  553. }
  554. }
  555. static void
  556. fs_populate_symlink(fsnode *cur, struct fs_populate_arg *arg)
  557. {
  558. dnode_phys_t *dnode;
  559. uint64_t dnid;
  560. assert(cur->type == S_IFLNK);
  561. assert((cur->inode->flags & (FI_ALLOCATED | FI_ROOT)) == 0);
  562. dnode = objset_dnode_bonus_alloc(arg->fs->os,
  563. DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid);
  564. fs_populate_dirent(arg, cur, dnid);
  565. fs_populate_sattrs(arg, cur, dnode);
  566. }
  567. static int
  568. fs_foreach_populate(fsnode *cur, void *_arg)
  569. {
  570. struct fs_populate_arg *arg;
  571. struct fs_populate_dir *dir;
  572. int ret;
  573. arg = _arg;
  574. switch (cur->type) {
  575. case S_IFREG:
  576. fs_populate_file(cur, arg);
  577. break;
  578. case S_IFDIR:
  579. if (fsnode_isroot(cur))
  580. break;
  581. fs_populate_dir(cur, arg);
  582. break;
  583. case S_IFLNK:
  584. fs_populate_symlink(cur, arg);
  585. break;
  586. default:
  587. assert(0);
  588. }
  589. ret = (cur->inode->flags & FI_ROOT) != 0 ? 0 : 1;
  590. if (cur->next == NULL &&
  591. (cur->child == NULL || (cur->inode->flags & FI_ROOT) != 0)) {
  592. /*
  593. * We reached a terminal node in a subtree. Walk back up and
  594. * write out directories. We're done once we hit the root of a
  595. * dataset or find a level where we're not on the edge of the
  596. * tree.
  597. */
  598. do {
  599. dir = SLIST_FIRST(&arg->dirs);
  600. SLIST_REMOVE_HEAD(&arg->dirs, next);
  601. zap_write(arg->zfs, dir->zap);
  602. if (dir->dirfd != -1)
  603. eclose(dir->dirfd);
  604. free(dir);
  605. cur = cur->parent;
  606. } while (cur != NULL && cur->next == NULL &&
  607. (cur->inode->flags & FI_ROOT) == 0);
  608. }
  609. return (ret);
  610. }
  611. static void
  612. fs_add_zpl_attr_layout(zfs_zap_t *zap, unsigned int index,
  613. const sa_attr_type_t layout[], size_t sacnt)
  614. {
  615. char ti[16];
  616. assert(sizeof(layout[0]) == 2);
  617. snprintf(ti, sizeof(ti), "%u", index);
  618. zap_add(zap, ti, sizeof(sa_attr_type_t), sacnt,
  619. (const uint8_t *)layout);
  620. }
  621. /*
  622. * Initialize system attribute tables.
  623. *
  624. * There are two elements to this. First, we write the zpl_attrs[] and
  625. * zpl_attr_layout[] tables to disk. Then we create a lookup table which
  626. * allows us to set file attributes quickly.
  627. */
  628. static uint64_t
  629. fs_set_zpl_attrs(zfs_opt_t *zfs, zfs_fs_t *fs)
  630. {
  631. zfs_zap_t *sazap, *salzap, *sarzap;
  632. zfs_objset_t *os;
  633. dnode_phys_t *saobj, *salobj, *sarobj;
  634. uint64_t saobjid, salobjid, sarobjid;
  635. uint16_t offset;
  636. os = fs->os;
  637. /*
  638. * The on-disk tables are stored in two ZAP objects, the registry object
  639. * and the layout object. Individual attributes are described by
  640. * entries in the registry object; for example, the value for the
  641. * "ZPL_SIZE" key gives the size and encoding of the ZPL_SIZE attribute.
  642. * The attributes of a file are ordered according to one of the layouts
  643. * defined in the layout object. The master node object is simply used
  644. * to locate the registry and layout objects.
  645. */
  646. saobj = objset_dnode_alloc(os, DMU_OT_SA_MASTER_NODE, &saobjid);
  647. salobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_LAYOUTS, &salobjid);
  648. sarobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_REGISTRATION, &sarobjid);
  649. sarzap = zap_alloc(os, sarobj);
  650. for (size_t i = 0; i < nitems(zpl_attrs); i++) {
  651. const zfs_sattr_t *sa;
  652. uint64_t attr;
  653. attr = 0;
  654. sa = &zpl_attrs[i];
  655. SA_ATTR_ENCODE(attr, (uint64_t)i, sa->size, sa->bs);
  656. zap_add_uint64(sarzap, sa->name, attr);
  657. }
  658. zap_write(zfs, sarzap);
  659. /*
  660. * Layouts are arrays of indices into the registry. We define two
  661. * layouts for use by the ZPL, one for non-symlinks and one for
  662. * symlinks. They are identical except that the symlink layout includes
  663. * ZPL_SYMLINK as its final attribute.
  664. */
  665. salzap = zap_alloc(os, salobj);
  666. assert(zpl_attr_layout[nitems(zpl_attr_layout) - 1] == ZPL_SYMLINK);
  667. fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_DEFAULT,
  668. zpl_attr_layout, nitems(zpl_attr_layout) - 1);
  669. fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_SYMLINK,
  670. zpl_attr_layout, nitems(zpl_attr_layout));
  671. zap_write(zfs, salzap);
  672. sazap = zap_alloc(os, saobj);
  673. zap_add_uint64(sazap, SA_LAYOUTS, salobjid);
  674. zap_add_uint64(sazap, SA_REGISTRY, sarobjid);
  675. zap_write(zfs, sazap);
  676. /* Sanity check. */
  677. for (size_t i = 0; i < nitems(zpl_attrs); i++)
  678. assert(i == zpl_attrs[i].id);
  679. /*
  680. * Build the offset table used when setting file attributes. File
  681. * attributes are stored in the object's bonus buffer; this table
  682. * provides the buffer offset of attributes referenced by the layout
  683. * table.
  684. */
  685. fs->sacnt = nitems(zpl_attrs);
  686. fs->saoffs = ecalloc(fs->sacnt, sizeof(*fs->saoffs));
  687. for (size_t i = 0; i < fs->sacnt; i++)
  688. fs->saoffs[i] = 0xffff;
  689. offset = 0;
  690. for (size_t i = 0; i < nitems(zpl_attr_layout); i++) {
  691. uint16_t size;
  692. assert(zpl_attr_layout[i] < fs->sacnt);
  693. fs->saoffs[zpl_attr_layout[i]] = offset;
  694. size = zpl_attrs[zpl_attr_layout[i]].size;
  695. offset += size;
  696. }
  697. fs->satab = zpl_attrs;
  698. return (saobjid);
  699. }
  700. static void
  701. fs_layout_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg)
  702. {
  703. char *mountpoint, *origmountpoint, *name, *next;
  704. fsnode *cur, *root;
  705. uint64_t canmount;
  706. if (!dsl_dir_has_dataset(dsldir))
  707. return;
  708. if (dsl_dir_get_canmount(dsldir, &canmount) == 0 && canmount == 0)
  709. return;
  710. mountpoint = dsl_dir_get_mountpoint(zfs, dsldir);
  711. if (mountpoint == NULL)
  712. return;
  713. /*
  714. * If we were asked to specify a bootfs, set it here.
  715. */
  716. if (zfs->bootfs != NULL && strcmp(zfs->bootfs,
  717. dsl_dir_fullname(dsldir)) == 0) {
  718. zap_add_uint64(zfs->poolprops, "bootfs",
  719. dsl_dir_dataset_id(dsldir));
  720. }
  721. origmountpoint = mountpoint;
  722. /*
  723. * Figure out which fsnode corresponds to our mountpoint.
  724. */
  725. root = arg;
  726. cur = root;
  727. if (strcmp(mountpoint, zfs->rootpath) != 0) {
  728. mountpoint += strlen(zfs->rootpath);
  729. /*
  730. * Look up the directory in the staged tree. For example, if
  731. * the dataset's mount point is /foo/bar/baz, we'll search the
  732. * root directory for "foo", search "foo" for "baz", and so on.
  733. * Each intermediate name must refer to a directory; the final
  734. * component need not exist.
  735. */
  736. cur = root;
  737. for (next = name = mountpoint; next != NULL;) {
  738. for (; *next == '/'; next++)
  739. ;
  740. name = strsep(&next, "/");
  741. for (; cur != NULL && strcmp(cur->name, name) != 0;
  742. cur = cur->next)
  743. ;
  744. if (cur == NULL) {
  745. if (next == NULL)
  746. break;
  747. errx(1, "missing mountpoint directory for `%s'",
  748. dsl_dir_fullname(dsldir));
  749. }
  750. if (cur->type != S_IFDIR) {
  751. errx(1,
  752. "mountpoint for `%s' is not a directory",
  753. dsl_dir_fullname(dsldir));
  754. }
  755. if (next != NULL)
  756. cur = cur->child;
  757. }
  758. }
  759. if (cur != NULL) {
  760. assert(cur->type == S_IFDIR);
  761. /*
  762. * Multiple datasets shouldn't share a mountpoint. It's
  763. * technically allowed, but it's not clear what makefs should do
  764. * in that case.
  765. */
  766. assert((cur->inode->flags & FI_ROOT) == 0);
  767. if (cur != root)
  768. cur->inode->flags |= FI_ROOT;
  769. assert(cur->inode->param == NULL);
  770. cur->inode->param = dsldir;
  771. }
  772. free(origmountpoint);
  773. }
  774. static int
  775. fs_foreach_mark(fsnode *cur, void *arg)
  776. {
  777. uint64_t *countp;
  778. countp = arg;
  779. if (cur->type == S_IFDIR && fsnode_isroot(cur))
  780. return (1);
  781. if (cur->inode->ino == 0) {
  782. cur->inode->ino = ++(*countp);
  783. cur->inode->nlink = 1;
  784. } else {
  785. cur->inode->nlink++;
  786. }
  787. return ((cur->inode->flags & FI_ROOT) != 0 ? 0 : 1);
  788. }
  789. /*
  790. * Create a filesystem dataset. More specifically:
  791. * - create an object set for the dataset,
  792. * - add required metadata (SA tables, property definitions, etc.) to that
  793. * object set,
  794. * - optionally populate the object set with file objects, using "root" as the
  795. * root directory.
  796. *
  797. * "dirfd" is a directory descriptor for the directory referenced by "root". It
  798. * is closed before returning.
  799. */
  800. static void
  801. fs_build_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, fsnode *root, int dirfd)
  802. {
  803. struct fs_populate_arg arg;
  804. zfs_fs_t fs;
  805. zfs_zap_t *masterzap;
  806. zfs_objset_t *os;
  807. dnode_phys_t *deleteq, *masterobj;
  808. uint64_t deleteqid, dnodecount, moid, rootdirid, saobjid;
  809. bool fakedroot;
  810. /*
  811. * This dataset's mountpoint doesn't exist in the staging tree, or the
  812. * dataset doesn't have a mountpoint at all. In either case we still
  813. * need a root directory. Fake up a root fsnode to handle this case.
  814. */
  815. fakedroot = root == NULL;
  816. if (fakedroot) {
  817. struct stat *stp;
  818. assert(dirfd == -1);
  819. root = ecalloc(1, sizeof(*root));
  820. root->inode = ecalloc(1, sizeof(*root->inode));
  821. root->name = estrdup(".");
  822. root->type = S_IFDIR;
  823. stp = &root->inode->st;
  824. stp->st_uid = 0;
  825. stp->st_gid = 0;
  826. stp->st_mode = S_IFDIR | 0755;
  827. }
  828. assert(root->type == S_IFDIR);
  829. assert(fsnode_isroot(root));
  830. /*
  831. * Initialize the object set for this dataset.
  832. */
  833. os = objset_alloc(zfs, DMU_OST_ZFS);
  834. masterobj = objset_dnode_alloc(os, DMU_OT_MASTER_NODE, &moid);
  835. assert(moid == MASTER_NODE_OBJ);
  836. memset(&fs, 0, sizeof(fs));
  837. fs.os = os;
  838. /*
  839. * Create the ZAP SA layout now since filesystem object dnodes will
  840. * refer to those attributes.
  841. */
  842. saobjid = fs_set_zpl_attrs(zfs, &fs);
  843. /*
  844. * Make a pass over the staged directory to detect hard links and assign
  845. * virtual dnode numbers.
  846. */
  847. dnodecount = 1; /* root directory */
  848. fsnode_foreach(root, fs_foreach_mark, &dnodecount);
  849. /*
  850. * Make a second pass to populate the dataset with files from the
  851. * staged directory. Most of our runtime is spent here.
  852. */
  853. arg.rootdirfd = dirfd;
  854. arg.zfs = zfs;
  855. arg.fs = &fs;
  856. SLIST_INIT(&arg.dirs);
  857. fs_populate_dir(root, &arg);
  858. assert(!SLIST_EMPTY(&arg.dirs));
  859. fsnode_foreach(root, fs_foreach_populate, &arg);
  860. assert(SLIST_EMPTY(&arg.dirs));
  861. rootdirid = arg.rootdirid;
  862. /*
  863. * Create an empty delete queue. We don't do anything with it, but
  864. * OpenZFS will refuse to mount filesystems that don't have one.
  865. */
  866. deleteq = objset_dnode_alloc(os, DMU_OT_UNLINKED_SET, &deleteqid);
  867. zap_write(zfs, zap_alloc(os, deleteq));
  868. /*
  869. * Populate and write the master node object. This is a ZAP object
  870. * containing various dataset properties and the object IDs of the root
  871. * directory and delete queue.
  872. */
  873. masterzap = zap_alloc(os, masterobj);
  874. zap_add_uint64(masterzap, ZFS_ROOT_OBJ, rootdirid);
  875. zap_add_uint64(masterzap, ZFS_UNLINKED_SET, deleteqid);
  876. zap_add_uint64(masterzap, ZFS_SA_ATTRS, saobjid);
  877. zap_add_uint64(masterzap, ZPL_VERSION_OBJ, 5 /* ZPL_VERSION_SA */);
  878. zap_add_uint64(masterzap, "normalization", 0 /* off */);
  879. zap_add_uint64(masterzap, "utf8only", 0 /* off */);
  880. zap_add_uint64(masterzap, "casesensitivity", 0 /* case sensitive */);
  881. zap_add_uint64(masterzap, "acltype", 2 /* NFSv4 */);
  882. zap_write(zfs, masterzap);
  883. /*
  884. * All finished with this object set, we may as well write it now.
  885. * The DSL layer will sum up the bytes consumed by each dataset using
  886. * information stored in the object set, so it can't be freed just yet.
  887. */
  888. dsl_dir_dataset_write(zfs, os, dsldir);
  889. if (fakedroot) {
  890. free(root->inode);
  891. free(root->name);
  892. free(root);
  893. }
  894. free(fs.saoffs);
  895. }
  896. /*
  897. * Create an object set for each DSL directory which has a dataset and doesn't
  898. * already have an object set.
  899. */
  900. static void
  901. fs_build_unmounted(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg __unused)
  902. {
  903. if (dsl_dir_has_dataset(dsldir) && !dsl_dir_dataset_has_objset(dsldir))
  904. fs_build_one(zfs, dsldir, NULL, -1);
  905. }
  906. /*
  907. * Create our datasets and populate them with files.
  908. */
  909. void
  910. fs_build(zfs_opt_t *zfs, int dirfd, fsnode *root)
  911. {
  912. /*
  913. * Run through our datasets and find the root fsnode for each one. Each
  914. * root fsnode is flagged so that we can figure out which dataset it
  915. * belongs to.
  916. */
  917. dsl_dir_foreach(zfs, zfs->rootdsldir, fs_layout_one, root);
  918. /*
  919. * Did we find our boot filesystem?
  920. */
  921. if (zfs->bootfs != NULL && !zap_entry_exists(zfs->poolprops, "bootfs"))
  922. errx(1, "no mounted dataset matches bootfs property `%s'",
  923. zfs->bootfs);
  924. /*
  925. * Traverse the file hierarchy starting from the root fsnode. One
  926. * dataset, not necessarily the root dataset, must "own" the root
  927. * directory by having its mountpoint be equal to the root path.
  928. *
  929. * As roots of other datasets are encountered during the traversal,
  930. * fs_build_one() recursively creates the corresponding object sets and
  931. * populates them. Once this function has returned, all datasets will
  932. * have been fully populated.
  933. */
  934. fs_build_one(zfs, root->inode->param, root, dirfd);
  935. /*
  936. * Now create object sets for datasets whose mountpoints weren't found
  937. * in the staging directory, either because there is no mountpoint, or
  938. * because the mountpoint doesn't correspond to an existing directory.
  939. */
  940. dsl_dir_foreach(zfs, zfs->rootdsldir, fs_build_unmounted, NULL);
  941. }