movefile: support in-kernel file copying on Linux (bug 607868)

Perform in-kernel file copying when possible, and also support
reflinks and sparse files. If the optimized implementation
fails at runtime, gracefully fall back to a plain read/write
loop.

Compile-time and run-time fallbacks are implemented, so that
any incompatibilities will be handled gracefully. For example,
if the code is compiled on a system that supports the
copy_file_range syscall, but at run-time an older kernel that
does not support this syscall is detected, it will be handled
gracefully. There are similar fallbacks for lack of lseek
SEEK_DATA and sendfile support.

X-Gentoo-Bug: 607868
X-Gentoo-Bug-Url: https://bugs.gentoo.org/show_bug.cgi?id=607868
Acked-by: Brian Dolbec <dolsen@gentoo.org>
diff --git a/pym/portage/tests/util/file_copy/__init__.py b/pym/portage/tests/util/file_copy/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/pym/portage/tests/util/file_copy/__init__.py
diff --git a/pym/portage/tests/util/file_copy/__test__.py b/pym/portage/tests/util/file_copy/__test__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/pym/portage/tests/util/file_copy/__test__.py
diff --git a/pym/portage/tests/util/file_copy/test_copyfile.py b/pym/portage/tests/util/file_copy/test_copyfile.py
new file mode 100644
index 0000000..b900fde
--- /dev/null
+++ b/pym/portage/tests/util/file_copy/test_copyfile.py
@@ -0,0 +1,71 @@
+# Copyright 2017 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
+
+import shutil
+import tempfile
+
+from portage import os
+from portage.tests import TestCase
+from portage.checksum import perform_md5
+from portage.util.file_copy import copyfile
+
+
+class CopyFileTestCase(TestCase):
+	"""Exercise copyfile with a small, fully populated regular file."""
+
+	def testCopyFile(self):
+		"""The copy must have the same md5 digest as its source."""
+		workdir = tempfile.mkdtemp()
+		try:
+			source = os.path.join(workdir, 'src')
+			target = os.path.join(workdir, 'dest')
+
+			with open(source, 'wb') as src_file:
+				src_file.write(b'foo')
+
+			copyfile(source, target)
+			expected_digest = perform_md5(source)
+			self.assertEqual(expected_digest, perform_md5(target))
+		finally:
+			shutil.rmtree(workdir)
+
+
+class CopyFileSparseTestCase(TestCase):
+
+	def testCopyFileSparse(self):
+		"""Copy a file with holes; compare content, then block usage."""
+		tempdir = tempfile.mkdtemp()
+		try:
+			src_path = os.path.join(tempdir, 'src')
+			dest_path = os.path.join(tempdir, 'dest')
+			content = b'foo'
+
+			# Use seek to create some sparse blocks. Don't make these
+			# files too big, in case the filesystem doesn't support
+			# sparse files.
+			with open(src_path, 'wb') as f:
+				f.write(content)
+				f.seek(2**17, 1)
+				f.write(content)
+				f.seek(2**18, 1)
+				f.write(content)
+				# End with a hole: seek alone never grows the file, so
+				# truncate() at the new position creates the trailing hole.
+				f.seek(2**17, 1)
+				f.truncate()
+
+			copyfile(src_path, dest_path)
+			self.assertEqual(perform_md5(src_path), perform_md5(dest_path))
+
+			# This last part of the test is expected to fail when sparse
+			# copy is not implemented, so set the todo flag in order
+			# to tolerate failures.
+			self.todo = True
+
+			# If sparse blocks were preserved, then both files should
+			# consume the same number of blocks.
+			self.assertEqual(
+				os.stat(src_path).st_blocks,
+				os.stat(dest_path).st_blocks)
+		finally:
+			shutil.rmtree(tempdir)
diff --git a/pym/portage/util/file_copy/__init__.py b/pym/portage/util/file_copy/__init__.py
new file mode 100644
index 0000000..3d9b745
--- /dev/null
+++ b/pym/portage/util/file_copy/__init__.py
@@ -0,0 +1,36 @@
+# Copyright 2017 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
+
+import os
+import shutil
+import tempfile
+
+try:
+	from portage.util.file_copy.reflink_linux import file_copy as _file_copy
+except ImportError:
+	_file_copy = None
+
+
+def _optimized_copyfile(src, dst):
+	"""
+	Duplicate the data (not the metadata) of the file at path src
+	into the file at path dst.
+
+	The copy is delegated to the compiled file_copy helper, which
+	performs it in the kernel where possible and can take advantage
+	of reflinks. Holes in sparse files are carried over as well.
+
+	@param src: path of source file
+	@type src: str
+	@param dst: path of destination file
+	@type dst: str
+	"""
+	with open(src, 'rb', buffering=0) as reader:
+		with open(dst, 'wb', buffering=0) as writer:
+			_file_copy(reader.fileno(), writer.fileno())
+
+
+# Use the compiled helper whenever it could be imported.
+copyfile = (
+	shutil.copyfile if _file_copy is None else _optimized_copyfile
+)
diff --git a/pym/portage/util/movefile.py b/pym/portage/util/movefile.py
index 4be1c3b..37c809e 100644
--- a/pym/portage/util/movefile.py
+++ b/pym/portage/util/movefile.py
@@ -8,7 +8,6 @@
 import errno
 import fnmatch
 import os as _os
-import shutil as _shutil
 import stat
 import sys
 import textwrap
@@ -23,6 +22,8 @@
 from portage.process import spawn
 from portage.util import writemsg
 from portage.util._xattr import xattr
+from portage.util.file_copy import copyfile
+
 
 def _apply_stat(src_stat, dest):
 	_os.chown(dest, src_stat.st_uid, src_stat.st_gid)
@@ -114,7 +115,7 @@
 		_copyfile = selinux.copyfile
 		_rename = selinux.rename
 	else:
-		_copyfile = _shutil.copyfile
+		_copyfile = copyfile
 		_rename = _os.rename
 
 	lchown = _unicode_func_wrapper(portage.data.lchown, encoding=encoding)
diff --git a/setup.py b/setup.py
index a346bd4..b624767 100755
--- a/setup.py
+++ b/setup.py
@@ -23,6 +23,7 @@
 import glob
 import os
 import os.path
+import platform
 import re
 import subprocess
 import sys
@@ -54,6 +55,14 @@
 	],
 }
 
+# The reflink-aware copy helper depends on Linux-only interfaces,
+# so it is only built there.
+if platform.system() == 'Linux':
+	x_c_helpers['portage.util.file_copy.reflink_linux'] = [
+		'src/portage_util_file_copy_reflink_linux.c',
+	]
+
+
 class x_build(build):
 	""" Build command with extra build_man call. """
 
diff --git a/src/portage_util_file_copy_reflink_linux.c b/src/portage_util_file_copy_reflink_linux.c
new file mode 100644
index 0000000..b031d96
--- /dev/null
+++ b/src/portage_util_file_copy_reflink_linux.c
@@ -0,0 +1,385 @@
+/* Copyright 2017 Gentoo Foundation
+ * Distributed under the terms of the GNU General Public License v2
+ */
+
+#include <Python.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <sys/sendfile.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+static PyObject * _reflink_linux_file_copy(PyObject *, PyObject *);
+/* Method table: file_copy is the only function exported to Python. */
+static PyMethodDef reflink_linuxMethods[] = {
+    {
+            "file_copy",
+            _reflink_linux_file_copy,
+            METH_VARARGS,
+            "Copy between two file descriptors, "
+            "with reflink and sparse file support."
+    },
+    {NULL, NULL, 0, NULL}
+};
+
+#if PY_MAJOR_VERSION >= 3
+static struct PyModuleDef moduledef = {
+    PyModuleDef_HEAD_INIT,
+    "reflink_linux",                                /* m_name */
+    "Module for reflink_linux copy operations",     /* m_doc */
+    -1,                                             /* m_size */
+    reflink_linuxMethods,                           /* m_methods */
+    NULL,                                           /* m_reload */
+    NULL,                                           /* m_traverse */
+    NULL,                                           /* m_clear */
+    NULL,                                           /* m_free */
+};
+/* Python 3 entry point: create the module from moduledef. */
+PyMODINIT_FUNC
+PyInit_reflink_linux(void)
+{
+    PyObject *m;
+    m = PyModule_Create(&moduledef);
+    return m;
+}
+#else /* Python 2 entry point */
+PyMODINIT_FUNC
+initreflink_linux(void)
+{
+    Py_InitModule("reflink_linux", reflink_linuxMethods);
+}
+#endif
+
+
+/**
+ * cfr_wrapper - A copy_file_range syscall wrapper function, having a
+ * function signature that is compatible with sendfile.
+ * @fd_out: output file descriptor
+ * @fd_in: input file descriptor (no explicit input offset is passed)
+ * @off_out: offset of the output file
+ * @len: number of bytes to copy between the file descriptors
+ *
+ * Return: Number of bytes written to fd_out on success, -1 on failure
+ * (errno is set appropriately).
+ */
+static ssize_t
+cfr_wrapper(int fd_out, int fd_in, loff_t *off_out, size_t len)
+{
+#ifdef __NR_copy_file_range
+    return syscall(__NR_copy_file_range, fd_in, NULL, fd_out,
+                   off_out, len, 0);
+#else
+    /* No syscall number at build time: report ENOSYS to the caller. */
+    errno = ENOSYS;
+    return -1;
+#endif
+}
+
+/**
+ * do_lseek_data - Adjust file offsets to the next location containing
+ * data, creating sparse empty blocks in the output file as needed.
+ * @fd_out: output file descriptor
+ * @fd_in: input file descriptor
+ * @off_out: offset of the output file
+ *
+ * Use lseek SEEK_DATA to adjust the fd_in file offset to the next
+ * location containing data. If that skips a hole, the fd_out file
+ * offset and off_out are moved to the same location (creating sparse
+ * empty blocks as needed). On success, the fd_in file offset is
+ * exactly equal to the value that off_out points to.
+ *
+ * Return: On success, the number of bytes to copy before the next hole,
+ * and -1 on failure (errno is set appropriately). Returns 0 when fd_in
+ * reaches EOF.
+ */
+static off_t
+do_lseek_data(int fd_out, int fd_in, loff_t *off_out) {
+#ifdef SEEK_DATA
+    /* Use lseek SEEK_DATA/SEEK_HOLE for sparse file support,
+     * as suggested in the copy_file_range man page.
+     */
+    off_t offset_data, offset_hole;
+
+    offset_data = lseek(fd_in, *off_out, SEEK_DATA);
+    if (offset_data < 0) {
+        if (errno == ENXIO) {
+            /* EOF - If the file ends with a hole, then use lseek SEEK_END
+             * to find the end offset, and create sparse empty blocks in
+             * the output file. It's the caller's responsibility to
+             * truncate the file.
+             */
+            offset_hole = lseek(fd_in, 0, SEEK_END);
+            if (offset_hole < 0) {
+                return -1;
+            } else if (offset_hole != *off_out) {
+                if (lseek(fd_out, offset_hole, SEEK_SET) < 0) {
+                    return -1;
+                }
+                *off_out = offset_hole;
+            }
+            return 0;
+        }
+        return -1;
+    }
+
+    /* Create sparse empty blocks in the output file, up
+     * until the next location that will contain data.
+     */
+    if (offset_data != *off_out) {
+        if (lseek(fd_out, offset_data, SEEK_SET) < 0) {
+            return -1;
+        }
+        *off_out = offset_data;
+    }
+
+    /* Locate the next hole, so that we know when to
+     * stop copying. There is an implicit hole at the
+     * end of the file. This should never result in ENXIO
+     * after SEEK_DATA has succeeded above.
+     */
+    offset_hole = lseek(fd_in, offset_data, SEEK_HOLE);
+    if (offset_hole < 0) {
+        return -1;
+    }
+
+    /* Revert SEEK_HOLE offset change, since we're going
+     * to copy the data that comes before the hole.
+     */
+    if (lseek(fd_in, offset_data, SEEK_SET) < 0) {
+        return -1;
+    }
+
+    return offset_hole - offset_data;
+#else
+    /* This is how it fails at runtime when lseek SEEK_DATA is not supported. */
+    errno = EINVAL;
+    return -1;
+#endif
+}
+
+
+/**
+ * _reflink_linux_file_copy - Copy between two file descriptors, with
+ * reflink and sparse file support.
+ * @fd_in: input file descriptor
+ * @fd_out: output file descriptor
+ *
+ * When supported, this uses copy_file_range for reflink support,
+ * and lseek SEEK_DATA for sparse file support. It has graceful
+ * fallbacks when support is unavailable for copy_file_range, lseek
+ * SEEK_DATA, or sendfile operations. When all else fails, it uses
+ * a plain read/write loop that works in any kernel version.
+ *
+ * If a syscall is interrupted by a signal, copying resumes at the
+ * location tracked by offset_out, unless a Python signal handler
+ * raised an exception, in which case that exception is propagated.
+ *
+ * Return: The length of the output file on success. Raise OSError
+ * (carrying the errno of the failed call) on failure.
+ */
+static PyObject *
+_reflink_linux_file_copy(PyObject *self, PyObject *args)
+{
+    int eintr_retry, error, fd_in, fd_out, stat_in_acquired, stat_out_acquired;
+    int lseek_works, sendfile_works;
+    off_t offset_out, len;
+    ssize_t buf_bytes, buf_offset, copyfunc_ret;
+    struct stat stat_in, stat_out;
+    char* buf;
+    ssize_t (*copyfunc)(int, int, loff_t *, size_t);
+
+    if (!PyArg_ParseTuple(args, "ii", &fd_in, &fd_out))
+        return NULL;
+
+    eintr_retry = 1;
+    offset_out = 0;
+    stat_in_acquired = 0;
+    stat_out_acquired = 0;
+    buf = NULL;
+    buf_bytes = 0;
+    buf_offset = 0;
+    copyfunc = cfr_wrapper;
+    lseek_works = 1;
+    sendfile_works = 1;
+
+    while (eintr_retry) {
+        Py_BEGIN_ALLOW_THREADS
+        /* Linux 3.1 and later support SEEK_DATA (for sparse file support).
+         * This code uses copy_file_range if possible, and falls back to
+         * sendfile for cross-device or when the copy_file_range syscall
+         * is not available (less than Linux 4.5). This will fail for
+         * Linux less than 3.1, which does not support the lseek SEEK_DATA
+         * parameter.
+         */
+        if (sendfile_works && lseek_works) {
+            error = 0;
+
+            while (1) {
+                len = do_lseek_data(fd_out, fd_in, &offset_out);
+                if (!len) {
+                    /* EOF */
+                    break;
+                } else if (len < 0) {
+                    error = errno;
+                    if (errno == EINVAL && !offset_out) {
+                        lseek_works = 0;
+                    }
+                    break;
+                }
+
+                /* For the copyfunc call, the fd_in file offset must be
+                 * exactly equal to offset_out. The above do_lseek_data
+                 * function guarantees correct state.
+                 */
+                copyfunc_ret = copyfunc(fd_out,
+                                        fd_in,
+                                        &offset_out,
+                                        len);
+                if (copyfunc_ret < 0) {
+                    error = errno;
+                    if ((errno == EXDEV || errno == ENOSYS) &&
+                        copyfunc == cfr_wrapper) {
+                        /* Use sendfile instead of copy_file_range for
+                         * cross-device copies, or when the copy_file_range
+                         * syscall is not available (less than Linux 4.5).
+                         */
+                        error = 0;
+                        copyfunc = sendfile;
+                        copyfunc_ret = copyfunc(fd_out,
+                                                fd_in,
+                                                &offset_out,
+                                                len);
+                        if (copyfunc_ret < 0) {
+                            error = errno;
+                            /* On Linux, if lseek succeeded above, then
+                             * sendfile should have worked here too, so
+                             * don't bother to fallback for EINVAL here.
+                             */
+                            break;
+                        }
+                    } else {
+                        break;
+                    }
+                }
+            }
+        }
+
+        /* Less than Linux 3.1 does not support SEEK_DATA or copy_file_range,
+         * so just use sendfile for in-kernel copy. This will fail for Linux
+         * versions from 2.6.0 to 2.6.32, because sendfile does not support
+         * writing to regular files.
+         */
+        if (sendfile_works && !lseek_works) {
+            error = 0;
+
+            if (!stat_in_acquired && fstat(fd_in, &stat_in) < 0) {
+                error = errno;
+            } else {
+                stat_in_acquired = 1;
+
+                /* For the sendfile call, the fd_in file offset must be
+                 * exactly equal to offset_out. Use lseek to ensure
+                 * correct state, in case an EINTR retry caused it to
+                 * get out of sync somehow.
+                 */
+                if (lseek(fd_in, offset_out, SEEK_SET) < 0) {
+                    error = errno;
+                } else {
+                    while (offset_out < stat_in.st_size) {
+                        copyfunc_ret = sendfile(fd_out,
+                                                fd_in,
+                                                &offset_out,
+                                                stat_in.st_size - offset_out);
+                        if (copyfunc_ret < 0) {
+                            error = errno;
+                            if (errno == EINVAL && !offset_out) {
+                                sendfile_works = 0;
+                            }
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+
+        /* This implementation will work on any kernel. */
+        if (!sendfile_works) {
+            error = 0;
+
+            if (!stat_out_acquired && fstat(fd_in, &stat_out) < 0) {
+                error = errno;
+            } else {
+                stat_out_acquired = 1;
+                if (buf == NULL)
+                    buf = malloc(stat_out.st_blksize);
+                if (buf == NULL) {
+                    error = errno;
+
+                /* For the read call, the fd_in file offset must equal
+                 * offset_out plus buf_bytes (data already read into buf
+                 * but not yet written). Use lseek to ensure correct
+                 * state, in case an EINTR retry left it out of sync.
+                 */
+                } else if (lseek(fd_in, offset_out + buf_bytes, SEEK_SET) < 0) {
+                    error = errno;
+                } else {
+                    while (1) {
+                        /* Some bytes may still be buffered from the
+                         * previous iteration of the outer loop.
+                         */
+                        if (!buf_bytes) {
+                            buf_offset = 0;
+                            buf_bytes = read(fd_in, buf, stat_out.st_blksize);
+
+                            if (!buf_bytes) {
+                                /* EOF */
+                                break;
+                            } else if (buf_bytes < 0) {
+                                error = errno;
+                                buf_bytes = 0; /* nothing is buffered */
+                                break;
+                            }
+                        }
+
+                        copyfunc_ret = write(fd_out,
+                                             buf + buf_offset,
+                                             buf_bytes);
+                        if (copyfunc_ret < 0) {
+                            error = errno;
+                            break;
+                        }
+
+                        buf_bytes -= copyfunc_ret;
+                        buf_offset += copyfunc_ret;
+                        offset_out += copyfunc_ret;
+                    }
+                }
+            }
+        }
+
+        if (!error && ftruncate(fd_out, offset_out) < 0)
+            error = errno;
+
+        Py_END_ALLOW_THREADS
+
+        if (error != EINTR)
+            eintr_retry = 0;
+        else if (PyErr_CheckSignals() != 0)
+            break;
+    }
+
+    free(buf);
+
+    /* A signal handler raised: keep its exception instead of OSError. */
+    if (PyErr_Occurred())
+        return NULL;
+    if (error) {
+        errno = error;
+        return PyErr_SetFromErrno(PyExc_OSError);
+    }
+
+    return Py_BuildValue("L", (long long)offset_out);
+}