/* Copyright 2017 Gentoo Foundation
 * Distributed under the terms of the GNU General Public License v2
 */

#include <Python.h>
#include <errno.h>
#include <stdlib.h>
#include <ctype.h>
#include <sys/sendfile.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

static PyObject * _reflink_linux_file_copy(PyObject *, PyObject *);

static PyMethodDef reflink_linuxMethods[] = {
    {
            "file_copy",
            _reflink_linux_file_copy,
            METH_VARARGS,
            "Copy between two file descriptors, "
            "with reflink and sparse file support."
    },
    {NULL, NULL, 0, NULL}
};

#if PY_MAJOR_VERSION >= 3
static struct PyModuleDef moduledef = {
    PyModuleDef_HEAD_INIT,
    "reflink_linux",                                /* m_name */
    "Module for reflink_linux copy operations",     /* m_doc */
    -1,                                             /* m_size */
    reflink_linuxMethods,                           /* m_methods */
    NULL,                                           /* m_reload */
    NULL,                                           /* m_traverse */
    NULL,                                           /* m_clear */
    NULL,                                           /* m_free */
};

PyMODINIT_FUNC
PyInit_reflink_linux(void)
{
    PyObject *m;
    m = PyModule_Create(&moduledef);
    return m;
}
#else
PyMODINIT_FUNC
initreflink_linux(void)
{
    Py_InitModule("reflink_linux", reflink_linuxMethods);
}
#endif


/**
 * cfr_wrapper - A copy_file_range syscall wrapper function, having a
 * function signature that is compatible with sendfile.
 * @fd_out: output file descriptor
 * @fd_in: input file descriptor
 * @off_out: offset of the output file
 * @len: number of bytes to copy between the file descriptors
 *
 * Return: Number of bytes written to out_fd on success, -1 on failure
 * (errno is set appropriately).
 */
static ssize_t
cfr_wrapper(int fd_out, int fd_in, off_t *off_out, size_t len)
{
#ifdef __NR_copy_file_range
    return syscall(__NR_copy_file_range, fd_in, NULL, fd_out,
                   off_out, len, 0);
#else
    /* This is how it fails at runtime when the syscall is not supported. */
    errno = ENOSYS;
    return -1;
#endif
}

/**
 * do_lseek_data - Adjust file offsets to the next location containing
 * data, creating sparse empty blocks in the output file as needed.
 * @fd_in: input file descriptor
 * @fd_out: output file descriptor
 * @off_out: offset of the output file
 *
 * Use lseek SEEK_DATA to adjust the fd_in file offset to the next
 * location containing data, and adjust the fd_in file offset and
 * off_out to the same location (creating sparse empty blocks as
 * needed). On success, both fd_in and fd_out file offsets are
 * guaranteed to be exactly equal to the value that off_out points to.
 * 
 * Return: On success, the number of bytes to copy before the next hole,
 * and -1 on failure (errno is set appropriately). Returns 0 when fd_in
 * reaches EOF.
 */
static off_t
do_lseek_data(int fd_out, int fd_in, off_t *off_out) {
#ifdef SEEK_DATA
    /* Use lseek SEEK_DATA/SEEK_HOLE for sparse file support,
     * as suggested in the copy_file_range man page.
     */
    off_t offset_data, offset_hole;

    offset_data = lseek(fd_in, *off_out, SEEK_DATA);
    if (offset_data < 0) {
        if (errno == ENXIO) {
            /* EOF - If the file ends with a hole, then use lseek SEEK_END
             * to find the end offset, and create sparse empty blocks in
             * the output file. It's the caller's responsibility to
             * truncate the file.
             */
            offset_hole = lseek(fd_in, 0, SEEK_END);
            if (offset_hole < 0) {
                return -1;
            } else if (offset_hole != *off_out) {
                if (lseek(fd_out, offset_hole, SEEK_SET) < 0) {
                    return -1;
                }
                *off_out = offset_hole;
            }
            return 0;
        }
        return -1;
    }

    /* Create sparse empty blocks in the output file, up
     * until the next location that will contain data.
     */
    if (offset_data != *off_out) {
        if (lseek(fd_out, offset_data, SEEK_SET) < 0) {
            return -1;
        }
        *off_out = offset_data;
    }

    /* Locate the next hole, so that we know when to
     * stop copying. There is an implicit hole at the
     * end of the file. This should never result in ENXIO
     * after SEEK_DATA has succeeded above.
     */
    offset_hole = lseek(fd_in, offset_data, SEEK_HOLE);
    if (offset_hole < 0) {
        return -1;
    }

    /* Revert SEEK_HOLE offset change, since we're going
     * to copy the data that comes before the hole.
     */
    if (lseek(fd_in, offset_data, SEEK_SET) < 0) {
        return -1;
    }

    return offset_hole - offset_data;
#else
    /* This is how it fails at runtime when lseek SEEK_DATA is not supported. */
    errno = EINVAL;
    return -1;
#endif
}


/**
 * _reflink_linux_file_copy - Copy between two file descriptors, with
 * reflink and sparse file support.
 * @fd_in: input file descriptor
 * @fd_out: output file descriptor
 *
 * When supported, this uses copy_file_range for reflink support,
 * and lseek SEEK_DATA for sparse file support. It has graceful
 * fallbacks when support is unavailable for copy_file_range, lseek
 * SEEK_DATA, or sendfile operations. When all else fails, it uses
 * a plain read/write loop that works in any kernel version.
 *
 * If a syscall is interrupted by a signal, then the function will
 * automatically resume copying a the appropriate location which is
 * tracked internally by the offset_out variable.
 * 
 * Return: The length of the output file on success. Raise OSError
 * on failure.
 */
static PyObject *
_reflink_linux_file_copy(PyObject *self, PyObject *args)
{
    int eintr_retry, error, fd_in, fd_out, stat_in_acquired, stat_out_acquired;
    int lseek_works, sendfile_works;
    off_t offset_out, len;
    ssize_t buf_bytes, buf_offset, copyfunc_ret;
    struct stat stat_in, stat_out;
    char* buf;
    ssize_t (*copyfunc)(int, int, off_t *, size_t);

    if (!PyArg_ParseTuple(args, "ii", &fd_in, &fd_out))
        return NULL;

    eintr_retry = 1;
    offset_out = 0;
    stat_in_acquired = 0;
    stat_out_acquired = 0;
    buf = NULL;
    buf_bytes = 0;
    buf_offset = 0;
    copyfunc = cfr_wrapper;
    lseek_works = 1;
    sendfile_works = 1;

    while (eintr_retry) {

        Py_BEGIN_ALLOW_THREADS

        /* Linux 3.1 and later support SEEK_DATA (for sparse file support).
         * This code uses copy_file_range if possible, and falls back to
         * sendfile for cross-device or when the copy_file_range syscall
         * is not available (less than Linux 4.5). This will fail for
         * Linux less than 3.1, which does not support the lseek SEEK_DATA
         * parameter.
         */
        if (sendfile_works && lseek_works) {
            error = 0;

            while (1) {
                len = do_lseek_data(fd_out, fd_in, &offset_out);
                if (!len) {
                    /* EOF */
                    break;
                } else if (len < 0) {
                    error = errno;
                    if (errno == EINVAL && !offset_out) {
                        lseek_works = 0;
                    }
                    break;
                }

                /* For the copyfunc call, the fd_in file offset must be
                 * exactly equal to offset_out. The above do_lseek_data
                 * function guarantees correct state.
                 */
                copyfunc_ret = copyfunc(fd_out,
                                        fd_in,
                                        &offset_out,
                                        len);

                if (copyfunc_ret < 0) {
                    error = errno;
                    if ((errno == EXDEV || errno == ENOSYS) &&
                        copyfunc == cfr_wrapper) {
                        /* Use sendfile instead of copy_file_range for
                         * cross-device copies, or when the copy_file_range
                         * syscall is not available (less than Linux 4.5).
                         */
                        error = 0;
                        copyfunc = sendfile;
                        copyfunc_ret = copyfunc(fd_out,
                                                fd_in,
                                                &offset_out,
                                                len);

                        if (copyfunc_ret < 0) {
                            error = errno;
                            /* On Linux, if lseek succeeded above, then
                             * sendfile should have worked here too, so
                             * don't bother to fallback for EINVAL here.
                             */
                            break;
                        }
                    } else {
                        break;
                    }
                }
            }
        }

        /* Less than Linux 3.1 does not support SEEK_DATA or copy_file_range,
         * so just use sendfile for in-kernel copy. This will fail for Linux
         * versions from 2.6.0 to 2.6.32, because sendfile does not support
         * writing to regular files.
         */
        if (sendfile_works && !lseek_works) {
            error = 0;

            if (!stat_in_acquired && fstat(fd_in, &stat_in) < 0) {
                error = errno;
            } else {
                stat_in_acquired = 1;

                /* For the sendfile call, the fd_in file offset must be
                 * exactly equal to offset_out. Use lseek to ensure
                 * correct state, in case an EINTR retry caused it to
                 * get out of sync somewhow.
                 */
                if (lseek(fd_in, offset_out, SEEK_SET) < 0) {
                    error = errno;
                } else {
                    while (offset_out < stat_in.st_size) {
                        copyfunc_ret = sendfile(fd_out,
                                                fd_in,
                                                &offset_out,
                                                stat_in.st_size - offset_out);

                        if (copyfunc_ret < 0) {
                            error = errno;
                            if (errno == EINVAL && !offset_out) {
                                sendfile_works = 0;
                            }
                            break;
                        }
                    }
                }
            }
        }

        /* This implementation will work on any kernel. */
        if (!sendfile_works) {
            error = 0;

            if (!stat_out_acquired && fstat(fd_in, &stat_out) < 0) {
                error = errno;
            } else {
                stat_out_acquired = 1;
                if (buf == NULL)
                    buf = malloc(stat_out.st_blksize);
                if (buf == NULL) {
                    error = errno;

                /* For the read call, the fd_in file offset must be exactly
                 * equal to offset_out + buf_bytes, where buf_bytes is the
                 * amount of buffered data that has not been written to
                 * to the output file yet. Use lseek to ensure correct state,
                 * in case an EINTR retry caused it to get out of sync
                 * somewhow.
                 */
                } else if (lseek(fd_in, offset_out + buf_bytes, SEEK_SET) < 0) {
                    error = errno;
                } else {
                    while (1) {
                        /* Some bytes may still be buffered from the
                         * previous iteration of the outer loop.
                         */
                        if (!buf_bytes) {
                            buf_offset = 0;
                            buf_bytes = read(fd_in, buf, stat_out.st_blksize);

                            if (!buf_bytes) {
                                /* EOF */
                                break;

                            } else if (buf_bytes < 0) {
                                error = errno;
                                buf_bytes = 0;
                                break;
                            }
                        }

                        copyfunc_ret = write(fd_out,
                                             buf + buf_offset,
                                             buf_bytes);

                        if (copyfunc_ret < 0) {
                            error = errno;
                            break;
                        }

                        buf_bytes -= copyfunc_ret;
                        buf_offset += copyfunc_ret;
                        offset_out += copyfunc_ret;
                    }
                }
            }
        }

        if (!error && ftruncate(fd_out, offset_out) < 0)
            error = errno;

        Py_END_ALLOW_THREADS

        if (!(error == EINTR && PyErr_CheckSignals() == 0))
            eintr_retry = 0;
    }

    if (buf != NULL)
        free(buf);

    if (error)
        return PyErr_SetFromErrno(PyExc_OSError);

    return Py_BuildValue("i", offset_out);
}
