Skip to content

Commit 2973293

Browse files
Zach BrownAl Viro
Zach Brown
authored and
Al Viro
committed
vfs: add copy_file_range syscall and vfs helper
Add a copy_file_range() system call for offloading copies between regular files. This gives an interface to underlying layers of the storage stack which can copy without reading and writing all the data. There are a few candidates that should support copy offloading in the nearer term: - btrfs shares extent references with its clone ioctl - NFS has patches to add a COPY command which copies on the server - SCSI has a family of XCOPY commands which copy in the device. This system call avoids the complexity of also accelerating the creation of the destination file by operating on an existing destination file descriptor, not a path. Currently the high level vfs entry point limits copy offloading to files on the same mount and super (and not in the same file). This can be relaxed if we get implementations which can copy between file systems safely. Signed-off-by: Zach Brown <zab@redhat.com> [Anna Schumaker: Change -EINVAL to -EBADF during file verification, Change flags parameter from int to unsigned int, Add function to include/linux/syscalls.h, Check copy len after file open mode, Don't forbid ranges inside the same file, Use rw_verify_area() to verify ranges, Use file_out rather than file_in, Add COPY_FR_REFLINK flag] Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
1 parent 31ade3b commit 2973293

File tree

5 files changed

+130
-1
lines changed

5 files changed

+130
-1
lines changed

fs/read_write.c

+120
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <linux/pagemap.h>
1717
#include <linux/splice.h>
1818
#include <linux/compat.h>
19+
#include <linux/mount.h>
1920
#include "internal.h"
2021

2122
#include <asm/uaccess.h>
@@ -1327,3 +1328,122 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
13271328
return do_sendfile(out_fd, in_fd, NULL, count, 0);
13281329
}
13291330
#endif
1331+
1332+
/*
1333+
* copy_file_range() differs from regular file read and write in that it
1334+
* specifically allows return partial success. When it does so is up to
1335+
* the copy_file_range method.
1336+
*/
1337+
ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
1338+
struct file *file_out, loff_t pos_out,
1339+
size_t len, unsigned int flags)
1340+
{
1341+
struct inode *inode_in = file_inode(file_in);
1342+
struct inode *inode_out = file_inode(file_out);
1343+
ssize_t ret;
1344+
1345+
if (flags != 0)
1346+
return -EINVAL;
1347+
1348+
/* copy_file_range allows full ssize_t len, ignoring MAX_RW_COUNT */
1349+
ret = rw_verify_area(READ, file_in, &pos_in, len);
1350+
if (ret >= 0)
1351+
ret = rw_verify_area(WRITE, file_out, &pos_out, len);
1352+
if (ret < 0)
1353+
return ret;
1354+
1355+
if (!(file_in->f_mode & FMODE_READ) ||
1356+
!(file_out->f_mode & FMODE_WRITE) ||
1357+
(file_out->f_flags & O_APPEND) ||
1358+
!file_out->f_op->copy_file_range)
1359+
return -EBADF;
1360+
1361+
/* this could be relaxed once a method supports cross-fs copies */
1362+
if (inode_in->i_sb != inode_out->i_sb)
1363+
return -EXDEV;
1364+
1365+
if (len == 0)
1366+
return 0;
1367+
1368+
ret = mnt_want_write_file(file_out);
1369+
if (ret)
1370+
return ret;
1371+
1372+
ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out,
1373+
len, flags);
1374+
if (ret > 0) {
1375+
fsnotify_access(file_in);
1376+
add_rchar(current, ret);
1377+
fsnotify_modify(file_out);
1378+
add_wchar(current, ret);
1379+
}
1380+
inc_syscr(current);
1381+
inc_syscw(current);
1382+
1383+
mnt_drop_write_file(file_out);
1384+
1385+
return ret;
1386+
}
1387+
EXPORT_SYMBOL(vfs_copy_file_range);
1388+
1389+
/*
 * copy_file_range() system call.
 *
 * For each side, a non-NULL offset pointer supplies the starting position
 * from user memory and receives the advanced position afterwards; a NULL
 * pointer means the file's own f_pos is read and, after a successful copy,
 * updated instead.
 *
 * Returns the result of vfs_copy_file_range(), -EBADF when either
 * descriptor cannot be resolved, or -EFAULT when an offset cannot be read
 * from or written back to user memory.
 */
SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
		int, fd_out, loff_t __user *, off_out,
		size_t, len, unsigned int, flags)
{
	struct fd src;
	struct fd dst;
	loff_t src_pos;
	loff_t dst_pos;
	ssize_t ret;

	src = fdget(fd_in);
	if (!src.file)
		return -EBADF;

	ret = -EBADF;
	dst = fdget(fd_out);
	if (!dst.file)
		goto put_src;

	/* Every failure while fetching the start offsets is -EFAULT. */
	ret = -EFAULT;
	if (!off_in)
		src_pos = src.file->f_pos;
	else if (copy_from_user(&src_pos, off_in, sizeof(loff_t)))
		goto put_dst;

	if (!off_out)
		dst_pos = dst.file->f_pos;
	else if (copy_from_user(&dst_pos, off_out, sizeof(loff_t)))
		goto put_dst;

	ret = vfs_copy_file_range(src.file, src_pos, dst.file, dst_pos,
				  len, flags);
	if (ret <= 0)
		goto put_dst;

	/*
	 * Advance both positions before touching ret again: a failed
	 * write-back below overwrites ret with -EFAULT.
	 */
	src_pos += ret;
	dst_pos += ret;

	if (!off_in)
		src.file->f_pos = src_pos;
	else if (copy_to_user(off_in, &src_pos, sizeof(loff_t)))
		ret = -EFAULT;

	/* The output side is still updated even if the input write-back failed. */
	if (!off_out)
		dst.file->f_pos = dst_pos;
	else if (copy_to_user(off_out, &dst_pos, sizeof(loff_t)))
		ret = -EFAULT;

put_dst:
	fdput(dst);
put_src:
	fdput(src);
	return ret;
}

include/linux/fs.h

+3
Original file line numberDiff line numberDiff line change
@@ -1629,6 +1629,7 @@ struct file_operations {
16291629
#ifndef CONFIG_MMU
16301630
unsigned (*mmap_capabilities)(struct file *);
16311631
#endif
1632+
ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int);
16321633
};
16331634

16341635
struct inode_operations {
@@ -1680,6 +1681,8 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
16801681
unsigned long, loff_t *);
16811682
extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
16821683
unsigned long, loff_t *);
1684+
/* Copy up to len bytes from file_in at pos_in to file_out at pos_out. */
extern ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
				   struct file *file_out, loff_t pos_out,
				   size_t len, unsigned int flags);
16831686

16841687
struct super_operations {
16851688
struct inode *(*alloc_inode)(struct super_block *sb);

include/linux/syscalls.h

+3
Original file line numberDiff line numberDiff line change
@@ -886,6 +886,9 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename,
886886
const char __user *const __user *envp, int flags);
887887

888888
asmlinkage long sys_membarrier(int cmd, int flags);
889+
asmlinkage long sys_copy_file_range(int fd_in, loff_t __user *off_in,
890+
int fd_out, loff_t __user *off_out,
891+
size_t len, unsigned int flags);
889892

890893
asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
891894

include/uapi/asm-generic/unistd.h

+3-1
Original file line numberDiff line numberDiff line change
@@ -715,9 +715,11 @@ __SYSCALL(__NR_userfaultfd, sys_userfaultfd)
715715
__SYSCALL(__NR_membarrier, sys_membarrier)
716716
#define __NR_mlock2 284
717717
__SYSCALL(__NR_mlock2, sys_mlock2)
718+
#define __NR_copy_file_range 285
719+
__SYSCALL(__NR_copy_file_range, sys_copy_file_range)
718720

719721
#undef __NR_syscalls
720-
#define __NR_syscalls 285
722+
#define __NR_syscalls 286
721723

722724
/*
723725
* All syscalls below here should go away really,

kernel/sys_ni.c

+1
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ cond_syscall(sys_setfsuid);
174174
cond_syscall(sys_setfsgid);
175175
cond_syscall(sys_capget);
176176
cond_syscall(sys_capset);
177+
cond_syscall(sys_copy_file_range);
177178

178179
/* arch-specific weak syscall entries */
179180
cond_syscall(sys_pciconfig_read);

0 commit comments

Comments
 (0)