From 946fdc6a4078e6dcaf8c2b87b5466583e2c18882 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Thu, 13 Feb 2020 21:43:57 +0100
Subject: Implement basic working LXC+LVM snapshot functionality.

Still missing:
- proper exception handling
- stats
---
 nsmount.c | 233 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 233 insertions(+)
 create mode 100644 nsmount.c

(limited to 'nsmount.c')
diff --git a/nsmount.c b/nsmount.c
new file mode 100644
index 0000000..9d1533a
--- /dev/null
+++ b/nsmount.c
@@ -0,0 +1,233 @@
+/**
+ *  nsmount - mount a block device into a mount/pid namespace
+ *  Copyright (C) 2019  Anton Khirnov <anton@khirnov.net>
+ *
+ *  nsmount is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  nsmount is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with nsmount.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define _XOPEN_SOURCE 700
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/mount.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+enum {
+    OP_MOUNT,  /* selected when argv[1] starts with 'm': mount a block device */
+    OP_UMOUNT, /* selected when argv[1] starts with 'u': unmount a mountpoint */
+};
+
+/* Print the synopsis of both invocation forms to stderr, naming the program as argv[0]. */
+static void print_usage(int argc, const char * const *argv)
+{
+    const char *prog = argv[0];
+    fprintf(stderr, "%s: mount/unmount a block device in a mount/PID namespace\n\n"
+            "Usage:\n"
+            "   %s m <PID> <mountpoint> <blkdev_path> <fstype>\n"
+            "   %s u <PID> <mountpoint>\n\n"
+            " <PID>: PID (in the namespace in which this program is executed)"
+                " of the process whose namespaces are to be entered into\n"
+            " <mountpoint>: path (in the destination mount namespace) to be mounted"
+                " or unmounted\n"
+            " <blkdev_path>: path (in the namespace in which this program is executed)"
+                " to the block device that shall be mounted\n"
+            " <fstype>: type of the filesystem to be mounted\n", prog, prog, prog);
+}
+
+/**
+ * Open /proc/<pid>/ns/<entry> close-on-exec; label names the namespace in
+ * diagnostics. Returns the descriptor, or -1 after printing an error.
+ */
+static int open_ns(pid_t pid, const char *entry, const char *label)
+{
+    char path[128];
+    int fd, len;
+
+    len = snprintf(path, sizeof(path), "/proc/%ld/ns/%s", (long)pid, entry);
+    if (len < 0 || (size_t)len >= sizeof(path)) {
+        fprintf(stderr, "Error constructing the %s namespace path\n", label);
+        return -1;
+    }
+
+    fd = open(path, O_RDONLY | O_CLOEXEC);
+    if (fd == -1)
+        fprintf(stderr, "Error opening %s: %s\n", path, strerror(errno));
+    return fd;
+}
+
+/**
+ * Enter the PID and mount namespaces of the target process and mount or
+ * unmount from a forked child, whose exit status becomes ours.
+ * Exit status: 0 success, 1 bad arguments, 2 open failure, 3 setns failure,
+ * 4 fork/wait/path failure or abnormal child exit, 5 mount/umount failure.
+ */
+int main(int argc, const char * const *argv)
+{
+    char pathbuf[128], *endp;
+    int blockdev_fd = -1, pidns_fd = -1, mountns_fd = -1;
+    const char *blockdev = NULL, *mountpoint, *fstype = NULL;
+    pid_t tgt_pid, child_pid;
+    long pid_val;
+    int op, status, ret;
+
+    /* parse the commandline */
+    if (argc < 2) {
+        print_usage(argc, argv);
+        return 1;
+    }
+
+    if (argv[1][0] == 'm') {
+        op = OP_MOUNT;
+    } else if (argv[1][0] == 'u') {
+        op = OP_UMOUNT;
+    } else {
+        fprintf(stderr, "Invalid operation: %s\n", argv[1]);
+        print_usage(argc, argv);
+        return 1;
+    }
+
+    if (argc < (op == OP_MOUNT ? 6 : 4)) {
+        print_usage(argc, argv);
+        return 1;
+    }
+
+    /* reject empty input, trailing garbage and PIDs outside (0, pid_t max] */
+    errno = 0;
+    pid_val = strtol(argv[2], &endp, 0);
+    tgt_pid = (pid_t)pid_val;
+    if (errno || endp == argv[2] || *endp || pid_val <= 0 || tgt_pid != pid_val) {
+        fprintf(stderr, "Invalid PID: %s\n", argv[2]);
+        return 1;
+    }
+
+    /* open the files; the device path is resolved before changing namespaces */
+    mountpoint = argv[3];
+    if (op == OP_MOUNT) {
+        blockdev = argv[4];
+        fstype   = argv[5];
+        blockdev_fd = open(blockdev, O_RDONLY);
+        if (blockdev_fd == -1) {
+            fprintf(stderr, "Error opening %s: %s\n", blockdev, strerror(errno));
+            return 2;
+        }
+    }
+
+    pidns_fd = open_ns(tgt_pid, "pid", "PID");
+    if (pidns_fd != -1)
+        mountns_fd = open_ns(tgt_pid, "mnt", "mount");
+    if (mountns_fd == -1) {
+        ret = 2;
+        goto finish;
+    }
+
+    /* enter the namespaces */
+    if (setns(pidns_fd, CLONE_NEWPID) == -1) {
+        fprintf(stderr, "Error entering the PID namespace: %s\n", strerror(errno));
+        ret = 3;
+        goto finish;
+    }
+
+    if (setns(mountns_fd, CLONE_NEWNS) == -1) {
+        fprintf(stderr, "Error entering the mount namespace: %s\n", strerror(errno));
+        ret = 3;
+        goto finish;
+    }
+
+    /* fork to actually enter the PID namespace */
+    child_pid = fork();
+    if (child_pid == -1) {
+        fprintf(stderr, "fork() failed: %s\n", strerror(errno));
+        ret = 4;
+        goto finish;
+    }
+
+    if (child_pid) {
+        /* we are the parent: propagate the child's exit status */
+        do {
+            ret = waitpid(child_pid, &status, 0);
+        } while (ret == -1 && errno == EINTR);
+        if (ret == -1) {
+            fprintf(stderr, "Error waiting for the child: %s\n", strerror(errno));
+            ret = 4;
+        } else if (WIFEXITED(status)) {
+            ret = WEXITSTATUS(status);
+        } else {
+            fprintf(stderr, "The child terminated abnormally\n");
+            ret = 4;
+        }
+        goto finish;
+    }
+
+    /* we are the child */
+    if (op == OP_MOUNT) {
+        /* we use /proc/self/fd to mount the device
+         * Since the container controls its own filesystem hierarchy, it could
+         * trick us into mounting an arbitrary node located in the filesystem.
+         * This is not considered a major security problem, since
+         *  - the container should not have access to mknod() or nodes that
+         *    it is not meant to read
+         *  - we mount the filesystem read-only, with nosuid flag
+         *  - since the container will typically live in its own user
+         *    namespace, it will not have the right permissions to access a
+         *    filesystem that is not intended for it
+         *
+         *  Ideally, there would be something like a mountfd() syscall that
+         *  would allow mounting an fd.
+         */
+        ret = snprintf(pathbuf, sizeof(pathbuf), "/proc/self/fd/%d", blockdev_fd);
+        if (ret < 0 || (size_t)ret >= sizeof(pathbuf)) {
+            fprintf(stderr, "Error constructing the mount path\n");
+            ret = 4;
+            goto finish;
+        }
+
+        if (mount(pathbuf, mountpoint, fstype, MS_RDONLY | MS_NOSUID, NULL) == -1) {
+            fprintf(stderr, "mount(%s, %s) failed: %s\n",
+                    pathbuf, mountpoint, strerror(errno));
+            ret = 5;
+            goto finish;
+        }
+    } else {
+        /**
+         * As above, a malicious container can trick us into unmounting a
+         * filesystem in its tree. This should not cause any issues other
+         * than disrupting the container (which a compromised container can
+         * already do without our help).
+         */
+        if (umount(mountpoint) == -1) {
+            fprintf(stderr, "umount() failed: %s\n", strerror(errno));
+            ret = 5;
+            goto finish;
+        }
+    }
+
+    ret = 0;
+finish:
+    if (blockdev_fd >= 0)
+        close(blockdev_fd);
+    if (pidns_fd >= 0)
+        close(pidns_fd);
+    if (mountns_fd >= 0)
+        close(mountns_fd);
+
+    return ret;
+}
-- 
cgit v1.2.3