/**
 * nsmount - mount a block device into a mount/pid namespace
 * Copyright (C) 2019 Anton Khirnov
 *
 * nsmount is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * nsmount is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with nsmount.  If not, see <https://www.gnu.org/licenses/>.
 */

#define _XOPEN_SOURCE 700
#define _GNU_SOURCE

#include <errno.h>
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

/* Operation selected by the first commandline argument. */
enum {
    OP_MOUNT,
    OP_UMOUNT,
};

/**
 * Print the commandline synopsis to stderr.
 *
 * @param argc number of entries in argv (unused, kept for the interface)
 * @param argv program arguments; only argv[0] (the program name) is read
 */
static void print_usage(int argc, const char * const *argv)
{
    (void)argc;

    fprintf(stderr,
            "%s: mount/unmount a block device in a mount/PID namespace\n\n"
            "Usage:\n"
            "  %s m <pid> <mountpoint> <blockdev> <fstype>\n"
            "  %s u <pid> <mountpoint>\n\n"
            "  <pid>: PID (in the namespace in which this program is executed)"
            " of the process whose namespaces are to be entered into\n"
            "  <mountpoint>: path (in the destination mount namespace) to be mounted"
            " or unmounted\n"
            "  <blockdev>: path (in the namespace in which this program is executed)"
            " to the block device that shall be mounted\n"
            "  <fstype>: type of the filesystem to be mounted\n",
            argv[0], argv[0], argv[0]);
}

/**
 * Parse a strictly positive PID from a string.
 *
 * The whole string must be consumed by strtol() (base auto-detected, as
 * before) and the value must fit into pid_t.
 *
 * @param str string to parse
 * @param pid where the parsed PID is stored on success
 * @return 0 on success, -1 if str is not a valid PID
 */
static int parse_pid(const char *str, pid_t *pid)
{
    char *end;
    long val;

    errno = 0;
    val = strtol(str, &end, 0);
    if (errno || end == str || *end != '\0')
        return -1;
    /* the round-trip through pid_t rejects values that do not fit */
    if (val <= 0 || (long)(pid_t)val != val)
        return -1;

    *pid = (pid_t)val;
    return 0;
}

/**
 * Open /proc/<pid>/ns/<ns_file> read-only with O_CLOEXEC.
 *
 * A diagnostic is printed to stderr on failure.
 *
 * @param pid     process whose namespace file is opened
 * @param ns_file name of the entry under /proc/<pid>/ns/ ("pid", "mnt")
 * @param ns_desc human-readable namespace name used in error messages
 * @return the opened file descriptor, or -1 on failure
 */
static int open_ns(pid_t pid, const char *ns_file, const char *ns_desc)
{
    char path[128];
    int len, fd;

    len = snprintf(path, sizeof(path), "/proc/%ld/ns/%s", (long)pid, ns_file);
    if (len < 0 || (size_t)len >= sizeof(path)) {
        fprintf(stderr, "Error constructing the %s namespace path\n", ns_desc);
        return -1;
    }

    fd = open(path, O_RDONLY | O_CLOEXEC);
    if (fd == -1)
        fprintf(stderr, "Error opening %s: %s\n", path, strerror(errno));

    return fd;
}

/**
 * Enter the PID and mount namespaces of a target process and mount or
 * unmount a filesystem there.
 *
 * Exit status:
 *   0 - success
 *   1 - invalid commandline
 *   2 - a required file (block device, namespace file) could not be opened
 *   3 - entering a namespace failed
 *   4 - fork/wait failure, abnormal child termination or internal path error
 *   5 - mount()/umount() failed
 * The status of the forked child is propagated by the parent.
 */
int main(int argc, const char * const *argv)
{
    char pathbuf[128];
    int blockdev_fd = -1, pidns_fd = -1, mountns_fd = -1;
    const char *blockdev = NULL, *mountpoint, *fstype = NULL;
    pid_t tgt_pid, child_pid;
    int op;
    int ret;

    /* parse the commandline */
    if (argc < 2) {
        print_usage(argc, argv);
        return 1;
    }

    if (argv[1][0] == 'm') {
        op = OP_MOUNT;
        if (argc < 6) {
            print_usage(argc, argv);
            return 1;
        }
    } else if (argv[1][0] == 'u') {
        op = OP_UMOUNT;
        if (argc < 4) {
            print_usage(argc, argv);
            return 1;
        }
    } else {
        fprintf(stderr, "Invalid operation: %s\n", argv[1]);
        print_usage(argc, argv);
        return 1;
    }

    if (parse_pid(argv[2], &tgt_pid) < 0) {
        fprintf(stderr, "Invalid PID: %s\n", argv[2]);
        print_usage(argc, argv);
        return 1;
    }
    mountpoint = argv[3];
    if (op == OP_MOUNT) {
        blockdev = argv[4];
        fstype   = argv[5];
    }

    /* open the files; this must happen before the namespaces are entered,
     * since the block device path is resolved in the current namespace */
    if (op == OP_MOUNT) {
        blockdev_fd = open(blockdev, O_RDONLY);
        if (blockdev_fd == -1) {
            fprintf(stderr, "Error opening %s: %s\n", blockdev, strerror(errno));
            return 2;
        }
    }

    pidns_fd = open_ns(tgt_pid, "pid", "PID");
    if (pidns_fd == -1) {
        ret = 2;
        goto finish;
    }

    mountns_fd = open_ns(tgt_pid, "mnt", "mount");
    if (mountns_fd == -1) {
        ret = 2;
        goto finish;
    }

    /* enter the namespaces */
    ret = setns(pidns_fd, CLONE_NEWPID);
    if (ret == -1) {
        fprintf(stderr, "Error entering the PID namespace: %s\n", strerror(errno));
        ret = 3;
        goto finish;
    }

    ret = setns(mountns_fd, CLONE_NEWNS);
    if (ret == -1) {
        fprintf(stderr, "Error entering the mount namespace: %s\n", strerror(errno));
        ret = 3;
        goto finish;
    }

    /* fork to actually enter the PID namespace */
    child_pid = fork();
    if (child_pid == -1) {
        fprintf(stderr, "fork() failed: %s\n", strerror(errno));
        ret = 4;
        goto finish;
    }

    if (child_pid) {
        /* we are the parent: wait for the child and report its result */
        int status;
        pid_t waited;

        do {
            waited = waitpid(child_pid, &status, 0);
        } while (waited == -1 && errno == EINTR);

        if (waited == -1) {
            fprintf(stderr, "Error waiting for the child: %s\n", strerror(errno));
            ret = 4;
        } else if (WIFEXITED(status)) {
            /* the child already printed a diagnostic for any failure */
            ret = WEXITSTATUS(status);
        } else {
            fprintf(stderr, "The child terminated abnormally\n");
            ret = 4;
        }
        goto finish;
    }

    /* we are the child */
    if (op == OP_MOUNT) {
        /* we use /proc/self/fd to mount the device
         * Since the container controls its own filesystem hierarchy, it
         * could trick us into mounting an arbitrary node located in the
         * filesystem. This is not considered a major security problem,
         * since
         * - the container should not have access to mknod() or nodes that
         *   it is not meant to read
         * - we mount the filesystem read-only, with nosuid flag
         * - since the container will typically live in its own user
         *   namespace, it will not have the right permissions to access a
         *   filesystem that is not intended for it
         *
         * Ideally, there would be something like a mountfd() syscall that
         * would allow mounting an fd.
         */
        ret = snprintf(pathbuf, sizeof(pathbuf), "/proc/self/fd/%d", blockdev_fd);
        if (ret < 0 || (size_t)ret >= sizeof(pathbuf)) {
            fprintf(stderr, "Error constructing the mount path\n");
            ret = 4;
            goto finish;
        }

        ret = mount(pathbuf, mountpoint, fstype, MS_RDONLY | MS_NOSUID, NULL);
        if (ret == -1) {
            fprintf(stderr, "mount(%s, %s) failed: %s\n",
                    pathbuf, mountpoint, strerror(errno));
            ret = 5;
            goto finish;
        }
    } else if (op == OP_UMOUNT) {
        /* As above, a malicious container can trick us into unmounting a
         * filesystem in its tree. This should not cause any issues other
         * than disrupting the container (which a compromised container can
         * already do without our help).
         */
        ret = umount(mountpoint);
        if (ret == -1) {
            fprintf(stderr, "umount() failed: %s\n", strerror(errno));
            ret = 5;
            goto finish;
        }
    }

    ret = 0;

finish:
    if (blockdev_fd >= 0)
        close(blockdev_fd);
    if (pidns_fd >= 0)
        close(pidns_fd);
    if (mountns_fd >= 0)
        close(mountns_fd);

    return ret;
}