read_write.c 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330
  1. /*
  2. * linux/fs/read_write.c
  3. *
  4. * Copyright (C) 1991, 1992 Linus Torvalds
  5. */
  6. #include <linux/slab.h>
  7. #include <linux/stat.h>
  8. #include <linux/fcntl.h>
  9. #include <linux/file.h>
  10. #include <linux/uio.h>
  11. #include <linux/fsnotify.h>
  12. #include <linux/security.h>
  13. #include <linux/export.h>
  14. #include <linux/syscalls.h>
  15. #include <linux/pagemap.h>
  16. #include <linux/splice.h>
  17. #include <linux/compat.h>
  18. #include "internal.h"
  19. #include <asm/uaccess.h>
  20. #include <asm/unistd.h>
  21. typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
  22. typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *);
  23. const struct file_operations generic_ro_fops = {
  24. .llseek = generic_file_llseek,
  25. .read_iter = generic_file_read_iter,
  26. .mmap = generic_file_readonly_mmap,
  27. .splice_read = generic_file_splice_read,
  28. };
  29. EXPORT_SYMBOL(generic_ro_fops);
  30. static inline int unsigned_offsets(struct file *file)
  31. {
  32. return file->f_mode & FMODE_UNSIGNED_OFFSET;
  33. }
  34. /**
  35. * vfs_setpos - update the file offset for lseek
  36. * @file: file structure in question
  37. * @offset: file offset to seek to
  38. * @maxsize: maximum file size
  39. *
  40. * This is a low-level filesystem helper for updating the file offset to
  41. * the value specified by @offset if the given offset is valid and it is
  42. * not equal to the current file offset.
  43. *
  44. * Return the specified offset on success and -EINVAL on invalid offset.
  45. */
  46. loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
  47. {
  48. if (offset < 0 && !unsigned_offsets(file))
  49. return -EINVAL;
  50. if (offset > maxsize)
  51. return -EINVAL;
  52. if (offset != file->f_pos) {
  53. file->f_pos = offset;
  54. file->f_version = 0;
  55. }
  56. return offset;
  57. }
  58. EXPORT_SYMBOL(vfs_setpos);
  59. /**
  60. * generic_file_llseek_size - generic llseek implementation for regular files
  61. * @file: file structure to seek on
  62. * @offset: file offset to seek to
  63. * @whence: type of seek
  64. * @size: max size of this file in file system
  65. * @eof: offset used for SEEK_END position
  66. *
  67. * This is a variant of generic_file_llseek that allows passing in a custom
  68. * maximum file size and a custom EOF position, for e.g. hashed directories
  69. *
  70. * Synchronization:
  71. * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
  72. * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
  73. * read/writes behave like SEEK_SET against seeks.
  74. */
  75. loff_t
  76. generic_file_llseek_size(struct file *file, loff_t offset, int whence,
  77. loff_t maxsize, loff_t eof)
  78. {
  79. switch (whence) {
  80. case SEEK_END:
  81. offset += eof;
  82. break;
  83. case SEEK_CUR:
  84. /*
  85. * Here we special-case the lseek(fd, 0, SEEK_CUR)
  86. * position-querying operation. Avoid rewriting the "same"
  87. * f_pos value back to the file because a concurrent read(),
  88. * write() or lseek() might have altered it
  89. */
  90. if (offset == 0)
  91. return file->f_pos;
  92. /*
  93. * f_lock protects against read/modify/write race with other
  94. * SEEK_CURs. Note that parallel writes and reads behave
  95. * like SEEK_SET.
  96. */
  97. spin_lock(&file->f_lock);
  98. offset = vfs_setpos(file, file->f_pos + offset, maxsize);
  99. spin_unlock(&file->f_lock);
  100. return offset;
  101. case SEEK_DATA:
  102. /*
  103. * In the generic case the entire file is data, so as long as
  104. * offset isn't at the end of the file then the offset is data.
  105. */
  106. if (offset >= eof)
  107. return -ENXIO;
  108. break;
  109. case SEEK_HOLE:
  110. /*
  111. * There is a virtual hole at the end of the file, so as long as
  112. * offset isn't i_size or larger, return i_size.
  113. */
  114. if (offset >= eof)
  115. return -ENXIO;
  116. offset = eof;
  117. break;
  118. }
  119. return vfs_setpos(file, offset, maxsize);
  120. }
  121. EXPORT_SYMBOL(generic_file_llseek_size);
  122. /**
  123. * generic_file_llseek - generic llseek implementation for regular files
  124. * @file: file structure to seek on
  125. * @offset: file offset to seek to
  126. * @whence: type of seek
  127. *
  128. * This is a generic implemenation of ->llseek useable for all normal local
  129. * filesystems. It just updates the file offset to the value specified by
  130. * @offset and @whence.
  131. */
  132. loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
  133. {
  134. struct inode *inode = file->f_mapping->host;
  135. return generic_file_llseek_size(file, offset, whence,
  136. inode->i_sb->s_maxbytes,
  137. i_size_read(inode));
  138. }
  139. EXPORT_SYMBOL(generic_file_llseek);
  140. /**
  141. * fixed_size_llseek - llseek implementation for fixed-sized devices
  142. * @file: file structure to seek on
  143. * @offset: file offset to seek to
  144. * @whence: type of seek
  145. * @size: size of the file
  146. *
  147. */
  148. loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
  149. {
  150. switch (whence) {
  151. case SEEK_SET: case SEEK_CUR: case SEEK_END:
  152. return generic_file_llseek_size(file, offset, whence,
  153. size, size);
  154. default:
  155. return -EINVAL;
  156. }
  157. }
  158. EXPORT_SYMBOL(fixed_size_llseek);
  159. /**
  160. * noop_llseek - No Operation Performed llseek implementation
  161. * @file: file structure to seek on
  162. * @offset: file offset to seek to
  163. * @whence: type of seek
  164. *
  165. * This is an implementation of ->llseek useable for the rare special case when
  166. * userspace expects the seek to succeed but the (device) file is actually not
  167. * able to perform the seek. In this case you use noop_llseek() instead of
  168. * falling back to the default implementation of ->llseek.
  169. */
  170. loff_t noop_llseek(struct file *file, loff_t offset, int whence)
  171. {
  172. return file->f_pos;
  173. }
  174. EXPORT_SYMBOL(noop_llseek);
  175. loff_t no_llseek(struct file *file, loff_t offset, int whence)
  176. {
  177. return -ESPIPE;
  178. }
  179. EXPORT_SYMBOL(no_llseek);
  180. loff_t default_llseek(struct file *file, loff_t offset, int whence)
  181. {
  182. struct inode *inode = file_inode(file);
  183. loff_t retval;
  184. mutex_lock(&inode->i_mutex);
  185. switch (whence) {
  186. case SEEK_END:
  187. offset += i_size_read(inode);
  188. break;
  189. case SEEK_CUR:
  190. if (offset == 0) {
  191. retval = file->f_pos;
  192. goto out;
  193. }
  194. offset += file->f_pos;
  195. break;
  196. case SEEK_DATA:
  197. /*
  198. * In the generic case the entire file is data, so as
  199. * long as offset isn't at the end of the file then the
  200. * offset is data.
  201. */
  202. if (offset >= inode->i_size) {
  203. retval = -ENXIO;
  204. goto out;
  205. }
  206. break;
  207. case SEEK_HOLE:
  208. /*
  209. * There is a virtual hole at the end of the file, so
  210. * as long as offset isn't i_size or larger, return
  211. * i_size.
  212. */
  213. if (offset >= inode->i_size) {
  214. retval = -ENXIO;
  215. goto out;
  216. }
  217. offset = inode->i_size;
  218. break;
  219. }
  220. retval = -EINVAL;
  221. if (offset >= 0 || unsigned_offsets(file)) {
  222. if (offset != file->f_pos) {
  223. file->f_pos = offset;
  224. file->f_version = 0;
  225. }
  226. retval = offset;
  227. }
  228. out:
  229. mutex_unlock(&inode->i_mutex);
  230. return retval;
  231. }
  232. EXPORT_SYMBOL(default_llseek);
  233. loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
  234. {
  235. loff_t (*fn)(struct file *, loff_t, int);
  236. fn = no_llseek;
  237. if (file->f_mode & FMODE_LSEEK) {
  238. if (file->f_op->llseek)
  239. fn = file->f_op->llseek;
  240. }
  241. return fn(file, offset, whence);
  242. }
  243. EXPORT_SYMBOL(vfs_llseek);
  244. static inline struct fd fdget_pos(int fd)
  245. {
  246. return __to_fd(__fdget_pos(fd));
  247. }
  248. static inline void fdput_pos(struct fd f)
  249. {
  250. if (f.flags & FDPUT_POS_UNLOCK)
  251. mutex_unlock(&f.file->f_pos_lock);
  252. fdput(f);
  253. }
  254. SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
  255. {
  256. off_t retval;
  257. struct fd f = fdget_pos(fd);
  258. if (!f.file)
  259. return -EBADF;
  260. retval = -EINVAL;
  261. if (whence <= SEEK_MAX) {
  262. loff_t res = vfs_llseek(f.file, offset, whence);
  263. retval = res;
  264. if (res != (loff_t)retval)
  265. retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
  266. }
  267. fdput_pos(f);
  268. return retval;
  269. }
  #ifdef CONFIG_COMPAT
  /* Compat lseek(2): same arguments, forwarded unchanged to sys_lseek(). */
  COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset,
  		       unsigned int, whence)
  {
  	return sys_lseek(fd, offset, whence);
  }
  #endif
  #ifdef __ARCH_WANT_SYS_LLSEEK
  /*
   * llseek(2): seek to the 64-bit offset given as two halves and store the
   * resulting position in the user-space *@result.
   *
   * Returns 0 on success, -EBADF for an invalid descriptor, -EINVAL when
   * @whence is above SEEK_MAX, the (int-truncated) error from vfs_llseek(),
   * or -EFAULT when the result cannot be copied out.
   */
  SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
  		unsigned long, offset_low, loff_t __user *, result,
  		unsigned int, whence)
  {
  	struct fd f = fdget_pos(fd);
  	int retval = -EINVAL;
  	loff_t target;
  	loff_t pos;

  	if (!f.file)
  		return -EBADF;

  	if (whence <= SEEK_MAX) {
  		target = ((loff_t) offset_high << 32) | offset_low;
  		pos = vfs_llseek(f.file, target, whence);

  		if (pos < 0)
  			retval = (int)pos;
  		else if (copy_to_user(result, &pos, sizeof(pos)))
  			retval = -EFAULT;
  		else
  			retval = 0;
  	}

  	fdput_pos(f);
  	return retval;
  }
  #endif
  302. ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
  303. {
  304. struct kiocb kiocb;
  305. ssize_t ret;
  306. if (!file->f_op->read_iter)
  307. return -EINVAL;
  308. init_sync_kiocb(&kiocb, file);
  309. kiocb.ki_pos = *ppos;
  310. iter->type |= READ;
  311. ret = file->f_op->read_iter(&kiocb, iter);
  312. BUG_ON(ret == -EIOCBQUEUED);
  313. if (ret > 0)
  314. *ppos = kiocb.ki_pos;
  315. return ret;
  316. }
  317. EXPORT_SYMBOL(vfs_iter_read);
  318. ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
  319. {
  320. struct kiocb kiocb;
  321. ssize_t ret;
  322. if (!file->f_op->write_iter)
  323. return -EINVAL;
  324. init_sync_kiocb(&kiocb, file);
  325. kiocb.ki_pos = *ppos;
  326. iter->type |= WRITE;
  327. ret = file->f_op->write_iter(&kiocb, iter);
  328. BUG_ON(ret == -EIOCBQUEUED);
  329. if (ret > 0)
  330. *ppos = kiocb.ki_pos;
  331. return ret;
  332. }
  333. EXPORT_SYMBOL(vfs_iter_write);
  334. /*
  335. * rw_verify_area doesn't like huge counts. We limit
  336. * them to something that fits in "int" so that others
  337. * won't have to do range checks all the time.
  338. */
  339. int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
  340. {
  341. struct inode *inode;
  342. loff_t pos;
  343. int retval = -EINVAL;
  344. inode = file_inode(file);
  345. if (unlikely((ssize_t) count < 0))
  346. return retval;
  347. pos = *ppos;
  348. if (unlikely(pos < 0)) {
  349. if (!unsigned_offsets(file))
  350. return retval;
  351. if (count >= -pos) /* both values are in 0..LLONG_MAX */
  352. return -EOVERFLOW;
  353. } else if (unlikely((loff_t) (pos + count) < 0)) {
  354. if (!unsigned_offsets(file))
  355. return retval;
  356. }
  357. if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
  358. retval = locks_mandatory_area(
  359. read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
  360. inode, file, pos, count);
  361. if (retval < 0)
  362. return retval;
  363. }
  364. retval = security_file_permission(file,
  365. read_write == READ ? MAY_READ : MAY_WRITE);
  366. if (retval)
  367. return retval;
  368. return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
  369. }
  370. static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
  371. {
  372. struct iovec iov = { .iov_base = buf, .iov_len = len };
  373. struct kiocb kiocb;
  374. struct iov_iter iter;
  375. ssize_t ret;
  376. init_sync_kiocb(&kiocb, filp);
  377. kiocb.ki_pos = *ppos;
  378. iov_iter_init(&iter, READ, &iov, 1, len);
  379. ret = filp->f_op->read_iter(&kiocb, &iter);
  380. BUG_ON(ret == -EIOCBQUEUED);
  381. *ppos = kiocb.ki_pos;
  382. return ret;
  383. }
  384. ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
  385. loff_t *pos)
  386. {
  387. if (file->f_op->read)
  388. return file->f_op->read(file, buf, count, pos);
  389. else if (file->f_op->read_iter)
  390. return new_sync_read(file, buf, count, pos);
  391. else
  392. return -EINVAL;
  393. }
  394. EXPORT_SYMBOL(__vfs_read);
  395. ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
  396. {
  397. ssize_t ret;
  398. if (!(file->f_mode & FMODE_READ))
  399. return -EBADF;
  400. if (!(file->f_mode & FMODE_CAN_READ))
  401. return -EINVAL;
  402. if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
  403. return -EFAULT;
  404. ret = rw_verify_area(READ, file, pos, count);
  405. if (ret >= 0) {
  406. count = ret;
  407. ret = __vfs_read(file, buf, count, pos);
  408. if (ret > 0) {
  409. fsnotify_access(file);
  410. add_rchar(current, ret);
  411. }
  412. inc_syscr(current);
  413. }
  414. return ret;
  415. }
  416. EXPORT_SYMBOL(vfs_read);
  417. static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
  418. {
  419. struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
  420. struct kiocb kiocb;
  421. struct iov_iter iter;
  422. ssize_t ret;
  423. init_sync_kiocb(&kiocb, filp);
  424. kiocb.ki_pos = *ppos;
  425. iov_iter_init(&iter, WRITE, &iov, 1, len);
  426. ret = filp->f_op->write_iter(&kiocb, &iter);
  427. BUG_ON(ret == -EIOCBQUEUED);
  428. if (ret > 0)
  429. *ppos = kiocb.ki_pos;
  430. return ret;
  431. }
  432. ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
  433. loff_t *pos)
  434. {
  435. if (file->f_op->write)
  436. return file->f_op->write(file, p, count, pos);
  437. else if (file->f_op->write_iter)
  438. return new_sync_write(file, p, count, pos);
  439. else
  440. return -EINVAL;
  441. }
  442. EXPORT_SYMBOL(__vfs_write);
  443. ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
  444. {
  445. mm_segment_t old_fs;
  446. const char __user *p;
  447. ssize_t ret;
  448. if (!(file->f_mode & FMODE_CAN_WRITE))
  449. return -EINVAL;
  450. old_fs = get_fs();
  451. set_fs(get_ds());
  452. p = (__force const char __user *)buf;
  453. if (count > MAX_RW_COUNT)
  454. count = MAX_RW_COUNT;
  455. ret = __vfs_write(file, p, count, pos);
  456. set_fs(old_fs);
  457. if (ret > 0) {
  458. fsnotify_modify(file);
  459. add_wchar(current, ret);
  460. }
  461. inc_syscw(current);
  462. return ret;
  463. }
  464. EXPORT_SYMBOL(__kernel_write);
  465. ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
  466. {
  467. ssize_t ret;
  468. if (!(file->f_mode & FMODE_WRITE))
  469. return -EBADF;
  470. if (!(file->f_mode & FMODE_CAN_WRITE))
  471. return -EINVAL;
  472. if (unlikely(!access_ok(VERIFY_READ, buf, count)))
  473. return -EFAULT;
  474. ret = rw_verify_area(WRITE, file, pos, count);
  475. if (ret >= 0) {
  476. count = ret;
  477. file_start_write(file);
  478. ret = __vfs_write(file, buf, count, pos);
  479. if (ret > 0) {
  480. fsnotify_modify(file);
  481. add_wchar(current, ret);
  482. }
  483. inc_syscw(current);
  484. file_end_write(file);
  485. }
  486. return ret;
  487. }
  488. EXPORT_SYMBOL(vfs_write);
  489. static inline loff_t file_pos_read(struct file *file)
  490. {
  491. return file->f_pos;
  492. }
  493. static inline void file_pos_write(struct file *file, loff_t pos)
  494. {
  495. file->f_pos = pos;
  496. }
  497. SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
  498. {
  499. struct fd f = fdget_pos(fd);
  500. ssize_t ret = -EBADF;
  501. if (f.file) {
  502. loff_t pos = file_pos_read(f.file);
  503. ret = vfs_read(f.file, buf, count, &pos);
  504. if (ret >= 0)
  505. file_pos_write(f.file, pos);
  506. fdput_pos(f);
  507. }
  508. return ret;
  509. }
  510. SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
  511. size_t, count)
  512. {
  513. struct fd f = fdget_pos(fd);
  514. ssize_t ret = -EBADF;
  515. if (f.file) {
  516. loff_t pos = file_pos_read(f.file);
  517. ret = vfs_write(f.file, buf, count, &pos);
  518. if (ret >= 0)
  519. file_pos_write(f.file, pos);
  520. fdput_pos(f);
  521. }
  522. return ret;
  523. }
  524. SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
  525. size_t, count, loff_t, pos)
  526. {
  527. struct fd f;
  528. ssize_t ret = -EBADF;
  529. if (pos < 0)
  530. return -EINVAL;
  531. f = fdget(fd);
  532. if (f.file) {
  533. ret = -ESPIPE;
  534. if (f.file->f_mode & FMODE_PREAD)
  535. ret = vfs_read(f.file, buf, count, &pos);
  536. fdput(f);
  537. }
  538. return ret;
  539. }
  540. SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
  541. size_t, count, loff_t, pos)
  542. {
  543. struct fd f;
  544. ssize_t ret = -EBADF;
  545. if (pos < 0)
  546. return -EINVAL;
  547. f = fdget(fd);
  548. if (f.file) {
  549. ret = -ESPIPE;
  550. if (f.file->f_mode & FMODE_PWRITE)
  551. ret = vfs_write(f.file, buf, count, &pos);
  552. fdput(f);
  553. }
  554. return ret;
  555. }
  /*
   * Trim an iovec array in place so that it describes at most @to bytes.
   * The segment in which the running total reaches @to has its length cut
   * to the remainder; later segments are left untouched.
   *
   * Returns the number of segments that remain in use (all @nr_segs when the
   * array is shorter than @to).
   */
  unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
  {
  	unsigned long used;
  	size_t consumed = 0;

  	for (used = 0; used < nr_segs; used++) {
  		struct iovec *cur = &iov[used];

  		if (consumed + cur->iov_len >= to) {
  			cur->iov_len = to - consumed;
  			return used + 1;
  		}
  		consumed += cur->iov_len;
  	}
  	return used;
  }
  574. EXPORT_SYMBOL(iov_shorten);
  575. static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
  576. loff_t *ppos, iter_fn_t fn)
  577. {
  578. struct kiocb kiocb;
  579. ssize_t ret;
  580. init_sync_kiocb(&kiocb, filp);
  581. kiocb.ki_pos = *ppos;
  582. ret = fn(&kiocb, iter);
  583. BUG_ON(ret == -EIOCBQUEUED);
  584. *ppos = kiocb.ki_pos;
  585. return ret;
  586. }
  587. /* Do it by hand, with file-ops */
  588. static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
  589. loff_t *ppos, io_fn_t fn)
  590. {
  591. ssize_t ret = 0;
  592. while (iov_iter_count(iter)) {
  593. struct iovec iovec = iov_iter_iovec(iter);
  594. ssize_t nr;
  595. nr = fn(filp, iovec.iov_base, iovec.iov_len, ppos);
  596. if (nr < 0) {
  597. if (!ret)
  598. ret = nr;
  599. break;
  600. }
  601. ret += nr;
  602. if (nr != iovec.iov_len)
  603. break;
  604. iov_iter_advance(iter, nr);
  605. }
  606. return ret;
  607. }
  608. /* A write operation does a read from user space and vice versa */
  609. #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
  610. ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
  611. unsigned long nr_segs, unsigned long fast_segs,
  612. struct iovec *fast_pointer,
  613. struct iovec **ret_pointer)
  614. {
  615. unsigned long seg;
  616. ssize_t ret;
  617. struct iovec *iov = fast_pointer;
  618. /*
  619. * SuS says "The readv() function *may* fail if the iovcnt argument
  620. * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
  621. * traditionally returned zero for zero segments, so...
  622. */
  623. if (nr_segs == 0) {
  624. ret = 0;
  625. goto out;
  626. }
  627. /*
  628. * First get the "struct iovec" from user memory and
  629. * verify all the pointers
  630. */
  631. if (nr_segs > UIO_MAXIOV) {
  632. ret = -EINVAL;
  633. goto out;
  634. }
  635. if (nr_segs > fast_segs) {
  636. iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
  637. if (iov == NULL) {
  638. ret = -ENOMEM;
  639. goto out;
  640. }
  641. }
  642. if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
  643. ret = -EFAULT;
  644. goto out;
  645. }
  646. /*
  647. * According to the Single Unix Specification we should return EINVAL
  648. * if an element length is < 0 when cast to ssize_t or if the
  649. * total length would overflow the ssize_t return value of the
  650. * system call.
  651. *
  652. * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
  653. * overflow case.
  654. */
  655. ret = 0;
  656. for (seg = 0; seg < nr_segs; seg++) {
  657. void __user *buf = iov[seg].iov_base;
  658. ssize_t len = (ssize_t)iov[seg].iov_len;
  659. /* see if we we're about to use an invalid len or if
  660. * it's about to overflow ssize_t */
  661. if (len < 0) {
  662. ret = -EINVAL;
  663. goto out;
  664. }
  665. if (type >= 0
  666. && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
  667. ret = -EFAULT;
  668. goto out;
  669. }
  670. if (len > MAX_RW_COUNT - ret) {
  671. len = MAX_RW_COUNT - ret;
  672. iov[seg].iov_len = len;
  673. }
  674. ret += len;
  675. }
  676. out:
  677. *ret_pointer = iov;
  678. return ret;
  679. }
  680. static ssize_t do_readv_writev(int type, struct file *file,
  681. const struct iovec __user * uvector,
  682. unsigned long nr_segs, loff_t *pos)
  683. {
  684. size_t tot_len;
  685. struct iovec iovstack[UIO_FASTIOV];
  686. struct iovec *iov = iovstack;
  687. struct iov_iter iter;
  688. ssize_t ret;
  689. io_fn_t fn;
  690. iter_fn_t iter_fn;
  691. ret = import_iovec(type, uvector, nr_segs,
  692. ARRAY_SIZE(iovstack), &iov, &iter);
  693. if (ret < 0)
  694. return ret;
  695. tot_len = iov_iter_count(&iter);
  696. if (!tot_len)
  697. goto out;
  698. ret = rw_verify_area(type, file, pos, tot_len);
  699. if (ret < 0)
  700. goto out;
  701. if (type == READ) {
  702. fn = file->f_op->read;
  703. iter_fn = file->f_op->read_iter;
  704. } else {
  705. fn = (io_fn_t)file->f_op->write;
  706. iter_fn = file->f_op->write_iter;
  707. file_start_write(file);
  708. }
  709. if (iter_fn)
  710. ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
  711. else
  712. ret = do_loop_readv_writev(file, &iter, pos, fn);
  713. if (type != READ)
  714. file_end_write(file);
  715. out:
  716. kfree(iov);
  717. if ((ret + (type == READ)) > 0) {
  718. if (type == READ)
  719. fsnotify_access(file);
  720. else
  721. fsnotify_modify(file);
  722. }
  723. return ret;
  724. }
  725. ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
  726. unsigned long vlen, loff_t *pos)
  727. {
  728. if (!(file->f_mode & FMODE_READ))
  729. return -EBADF;
  730. if (!(file->f_mode & FMODE_CAN_READ))
  731. return -EINVAL;
  732. return do_readv_writev(READ, file, vec, vlen, pos);
  733. }
  734. EXPORT_SYMBOL(vfs_readv);
  735. ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
  736. unsigned long vlen, loff_t *pos)
  737. {
  738. if (!(file->f_mode & FMODE_WRITE))
  739. return -EBADF;
  740. if (!(file->f_mode & FMODE_CAN_WRITE))
  741. return -EINVAL;
  742. return do_readv_writev(WRITE, file, vec, vlen, pos);
  743. }
  744. EXPORT_SYMBOL(vfs_writev);
  745. SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
  746. unsigned long, vlen)
  747. {
  748. struct fd f = fdget_pos(fd);
  749. ssize_t ret = -EBADF;
  750. if (f.file) {
  751. loff_t pos = file_pos_read(f.file);
  752. ret = vfs_readv(f.file, vec, vlen, &pos);
  753. if (ret >= 0)
  754. file_pos_write(f.file, pos);
  755. fdput_pos(f);
  756. }
  757. if (ret > 0)
  758. add_rchar(current, ret);
  759. inc_syscr(current);
  760. return ret;
  761. }
  762. SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
  763. unsigned long, vlen)
  764. {
  765. struct fd f = fdget_pos(fd);
  766. ssize_t ret = -EBADF;
  767. if (f.file) {
  768. loff_t pos = file_pos_read(f.file);
  769. ret = vfs_writev(f.file, vec, vlen, &pos);
  770. if (ret >= 0)
  771. file_pos_write(f.file, pos);
  772. fdput_pos(f);
  773. }
  774. if (ret > 0)
  775. add_wchar(current, ret);
  776. inc_syscw(current);
  777. return ret;
  778. }
  779. static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
  780. {
  781. #define HALF_LONG_BITS (BITS_PER_LONG / 2)
  782. return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
  783. }
  784. SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
  785. unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
  786. {
  787. loff_t pos = pos_from_hilo(pos_h, pos_l);
  788. struct fd f;
  789. ssize_t ret = -EBADF;
  790. if (pos < 0)
  791. return -EINVAL;
  792. f = fdget(fd);
  793. if (f.file) {
  794. ret = -ESPIPE;
  795. if (f.file->f_mode & FMODE_PREAD)
  796. ret = vfs_readv(f.file, vec, vlen, &pos);
  797. fdput(f);
  798. }
  799. if (ret > 0)
  800. add_rchar(current, ret);
  801. inc_syscr(current);
  802. return ret;
  803. }
  804. SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
  805. unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
  806. {
  807. loff_t pos = pos_from_hilo(pos_h, pos_l);
  808. struct fd f;
  809. ssize_t ret = -EBADF;
  810. if (pos < 0)
  811. return -EINVAL;
  812. f = fdget(fd);
  813. if (f.file) {
  814. ret = -ESPIPE;
  815. if (f.file->f_mode & FMODE_PWRITE)
  816. ret = vfs_writev(f.file, vec, vlen, &pos);
  817. fdput(f);
  818. }
  819. if (ret > 0)
  820. add_wchar(current, ret);
  821. inc_syscw(current);
  822. return ret;
  823. }
  824. #ifdef CONFIG_COMPAT
  825. static ssize_t compat_do_readv_writev(int type, struct file *file,
  826. const struct compat_iovec __user *uvector,
  827. unsigned long nr_segs, loff_t *pos)
  828. {
  829. compat_ssize_t tot_len;
  830. struct iovec iovstack[UIO_FASTIOV];
  831. struct iovec *iov = iovstack;
  832. struct iov_iter iter;
  833. ssize_t ret;
  834. io_fn_t fn;
  835. iter_fn_t iter_fn;
  836. ret = compat_import_iovec(type, uvector, nr_segs,
  837. UIO_FASTIOV, &iov, &iter);
  838. if (ret < 0)
  839. return ret;
  840. tot_len = iov_iter_count(&iter);
  841. if (!tot_len)
  842. goto out;
  843. ret = rw_verify_area(type, file, pos, tot_len);
  844. if (ret < 0)
  845. goto out;
  846. if (type == READ) {
  847. fn = file->f_op->read;
  848. iter_fn = file->f_op->read_iter;
  849. } else {
  850. fn = (io_fn_t)file->f_op->write;
  851. iter_fn = file->f_op->write_iter;
  852. file_start_write(file);
  853. }
  854. if (iter_fn)
  855. ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
  856. else
  857. ret = do_loop_readv_writev(file, &iter, pos, fn);
  858. if (type != READ)
  859. file_end_write(file);
  860. out:
  861. kfree(iov);
  862. if ((ret + (type == READ)) > 0) {
  863. if (type == READ)
  864. fsnotify_access(file);
  865. else
  866. fsnotify_modify(file);
  867. }
  868. return ret;
  869. }
  870. static size_t compat_readv(struct file *file,
  871. const struct compat_iovec __user *vec,
  872. unsigned long vlen, loff_t *pos)
  873. {
  874. ssize_t ret = -EBADF;
  875. if (!(file->f_mode & FMODE_READ))
  876. goto out;
  877. ret = -EINVAL;
  878. if (!(file->f_mode & FMODE_CAN_READ))
  879. goto out;
  880. ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
  881. out:
  882. if (ret > 0)
  883. add_rchar(current, ret);
  884. inc_syscr(current);
  885. return ret;
  886. }
  887. COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
  888. const struct compat_iovec __user *,vec,
  889. compat_ulong_t, vlen)
  890. {
  891. struct fd f = fdget_pos(fd);
  892. ssize_t ret;
  893. loff_t pos;
  894. if (!f.file)
  895. return -EBADF;
  896. pos = f.file->f_pos;
  897. ret = compat_readv(f.file, vec, vlen, &pos);
  898. if (ret >= 0)
  899. f.file->f_pos = pos;
  900. fdput_pos(f);
  901. return ret;
  902. }
  903. static long __compat_sys_preadv64(unsigned long fd,
  904. const struct compat_iovec __user *vec,
  905. unsigned long vlen, loff_t pos)
  906. {
  907. struct fd f;
  908. ssize_t ret;
  909. if (pos < 0)
  910. return -EINVAL;
  911. f = fdget(fd);
  912. if (!f.file)
  913. return -EBADF;
  914. ret = -ESPIPE;
  915. if (f.file->f_mode & FMODE_PREAD)
  916. ret = compat_readv(f.file, vec, vlen, &pos);
  917. fdput(f);
  918. return ret;
  919. }
  #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
  /*
   * compat preadv64: the offset arrives as a single loff_t argument, so the
   * call is forwarded unchanged to __compat_sys_preadv64().  Only built when
   * the architecture defines __ARCH_WANT_COMPAT_SYS_PREADV64.
   */
  COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
  		const struct compat_iovec __user *,vec,
  		unsigned long, vlen, loff_t, pos)
  {
  	long ret;

  	ret = __compat_sys_preadv64(fd, vec, vlen, pos);
  	return ret;
  }
  #endif
  928. COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
  929. const struct compat_iovec __user *,vec,
  930. compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
  931. {
  932. loff_t pos = ((loff_t)pos_high << 32) | pos_low;
  933. return __compat_sys_preadv64(fd, vec, vlen, pos);
  934. }
  935. static size_t compat_writev(struct file *file,
  936. const struct compat_iovec __user *vec,
  937. unsigned long vlen, loff_t *pos)
  938. {
  939. ssize_t ret = -EBADF;
  940. if (!(file->f_mode & FMODE_WRITE))
  941. goto out;
  942. ret = -EINVAL;
  943. if (!(file->f_mode & FMODE_CAN_WRITE))
  944. goto out;
  945. ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
  946. out:
  947. if (ret > 0)
  948. add_wchar(current, ret);
  949. inc_syscw(current);
  950. return ret;
  951. }
  952. COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
  953. const struct compat_iovec __user *, vec,
  954. compat_ulong_t, vlen)
  955. {
  956. struct fd f = fdget_pos(fd);
  957. ssize_t ret;
  958. loff_t pos;
  959. if (!f.file)
  960. return -EBADF;
  961. pos = f.file->f_pos;
  962. ret = compat_writev(f.file, vec, vlen, &pos);
  963. if (ret >= 0)
  964. f.file->f_pos = pos;
  965. fdput_pos(f);
  966. return ret;
  967. }
  968. static long __compat_sys_pwritev64(unsigned long fd,
  969. const struct compat_iovec __user *vec,
  970. unsigned long vlen, loff_t pos)
  971. {
  972. struct fd f;
  973. ssize_t ret;
  974. if (pos < 0)
  975. return -EINVAL;
  976. f = fdget(fd);
  977. if (!f.file)
  978. return -EBADF;
  979. ret = -ESPIPE;
  980. if (f.file->f_mode & FMODE_PWRITE)
  981. ret = compat_writev(f.file, vec, vlen, &pos);
  982. fdput(f);
  983. return ret;
  984. }
  #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
  /*
   * compat pwritev64: the offset arrives as a single loff_t argument, so the
   * call is forwarded unchanged to __compat_sys_pwritev64().  Only built when
   * the architecture defines __ARCH_WANT_COMPAT_SYS_PWRITEV64.
   */
  COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
  		const struct compat_iovec __user *,vec,
  		unsigned long, vlen, loff_t, pos)
  {
  	long ret;

  	ret = __compat_sys_pwritev64(fd, vec, vlen, pos);
  	return ret;
  }
  #endif
  993. COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
  994. const struct compat_iovec __user *,vec,
  995. compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
  996. {
  997. loff_t pos = ((loff_t)pos_high << 32) | pos_low;
  998. return __compat_sys_pwritev64(fd, vec, vlen, pos);
  999. }
  1000. #endif
  1001. static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
  1002. size_t count, loff_t max)
  1003. {
  1004. struct fd in, out;
  1005. struct inode *in_inode, *out_inode;
  1006. loff_t pos;
  1007. loff_t out_pos;
  1008. ssize_t retval;
  1009. int fl;
  1010. /*
  1011. * Get input file, and verify that it is ok..
  1012. */
  1013. retval = -EBADF;
  1014. in = fdget(in_fd);
  1015. if (!in.file)
  1016. goto out;
  1017. if (!(in.file->f_mode & FMODE_READ))
  1018. goto fput_in;
  1019. retval = -ESPIPE;
  1020. if (!ppos) {
  1021. pos = in.file->f_pos;
  1022. } else {
  1023. pos = *ppos;
  1024. if (!(in.file->f_mode & FMODE_PREAD))
  1025. goto fput_in;
  1026. }
  1027. retval = rw_verify_area(READ, in.file, &pos, count);
  1028. if (retval < 0)
  1029. goto fput_in;
  1030. count = retval;
  1031. /*
  1032. * Get output file, and verify that it is ok..
  1033. */
  1034. retval = -EBADF;
  1035. out = fdget(out_fd);
  1036. if (!out.file)
  1037. goto fput_in;
  1038. if (!(out.file->f_mode & FMODE_WRITE))
  1039. goto fput_out;
  1040. retval = -EINVAL;
  1041. in_inode = file_inode(in.file);
  1042. out_inode = file_inode(out.file);
  1043. out_pos = out.file->f_pos;
  1044. retval = rw_verify_area(WRITE, out.file, &out_pos, count);
  1045. if (retval < 0)
  1046. goto fput_out;
  1047. count = retval;
  1048. if (!max)
  1049. max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
  1050. if (unlikely(pos + count > max)) {
  1051. retval = -EOVERFLOW;
  1052. if (pos >= max)
  1053. goto fput_out;
  1054. count = max - pos;
  1055. }
  1056. fl = 0;
  1057. #if 0
  1058. /*
  1059. * We need to debate whether we can enable this or not. The
  1060. * man page documents EAGAIN return for the output at least,
  1061. * and the application is arguably buggy if it doesn't expect
  1062. * EAGAIN on a non-blocking file descriptor.
  1063. */
  1064. if (in.file->f_flags & O_NONBLOCK)
  1065. fl = SPLICE_F_NONBLOCK;
  1066. #endif
  1067. file_start_write(out.file);
  1068. retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
  1069. file_end_write(out.file);
  1070. if (retval > 0) {
  1071. add_rchar(current, retval);
  1072. add_wchar(current, retval);
  1073. fsnotify_access(in.file);
  1074. fsnotify_modify(out.file);
  1075. out.file->f_pos = out_pos;
  1076. if (ppos)
  1077. *ppos = pos;
  1078. else
  1079. in.file->f_pos = pos;
  1080. }
  1081. inc_syscr(current);
  1082. inc_syscw(current);
  1083. if (pos > max)
  1084. retval = -EOVERFLOW;
  1085. fput_out:
  1086. fdput(out);
  1087. fput_in:
  1088. fdput(in);
  1089. out:
  1090. return retval;
  1091. }
  1092. SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
  1093. {
  1094. loff_t pos;
  1095. off_t off;
  1096. ssize_t ret;
  1097. if (offset) {
  1098. if (unlikely(get_user(off, offset)))
  1099. return -EFAULT;
  1100. pos = off;
  1101. ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
  1102. if (unlikely(put_user(pos, offset)))
  1103. return -EFAULT;
  1104. return ret;
  1105. }
  1106. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1107. }
  1108. SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
  1109. {
  1110. loff_t pos;
  1111. ssize_t ret;
  1112. if (offset) {
  1113. if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
  1114. return -EFAULT;
  1115. ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
  1116. if (unlikely(put_user(pos, offset)))
  1117. return -EFAULT;
  1118. return ret;
  1119. }
  1120. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1121. }
  1122. #ifdef CONFIG_COMPAT
  1123. COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
  1124. compat_off_t __user *, offset, compat_size_t, count)
  1125. {
  1126. loff_t pos;
  1127. off_t off;
  1128. ssize_t ret;
  1129. if (offset) {
  1130. if (unlikely(get_user(off, offset)))
  1131. return -EFAULT;
  1132. pos = off;
  1133. ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
  1134. if (unlikely(put_user(pos, offset)))
  1135. return -EFAULT;
  1136. return ret;
  1137. }
  1138. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1139. }
  1140. COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
  1141. compat_loff_t __user *, offset, compat_size_t, count)
  1142. {
  1143. loff_t pos;
  1144. ssize_t ret;
  1145. if (offset) {
  1146. if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
  1147. return -EFAULT;
  1148. ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
  1149. if (unlikely(put_user(pos, offset)))
  1150. return -EFAULT;
  1151. return ret;
  1152. }
  1153. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1154. }
  1155. #endif