diff options
author | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2021-08-31 18:48:50 +0000 |
---|---|---|
committer | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2021-08-31 18:48:50 +0000 |
commit | 9c38c3715e4dfacda98a65a0f5606d9059eedf43 (patch) | |
tree | 29c40001dd153d04d0e21c84faa7f093a332536d | |
parent | a25cd900710305660a653c6b63d76286d8d6cdfe (diff) | |
parent | b31ec2c09ec3143af016fd4319502292551f2b90 (diff) | |
download | nsjail-9c38c3715e4dfacda98a65a0f5606d9059eedf43.tar.gz |
Snap for 7691048 from b31ec2c09ec3143af016fd4319502292551f2b90 to build-tools-release
Change-Id: Ib509a1f522a1d1d5ea7ef29e7aade71e9a8c0370
40 files changed, 1679 insertions, 466 deletions
diff --git a/.github/workflows/dockerpush.yml b/.github/workflows/dockerpush.yml new file mode 100644 index 0000000..5898f5b --- /dev/null +++ b/.github/workflows/dockerpush.yml @@ -0,0 +1,66 @@ +name: Docker + +on: + push: + # Publish `master` as Docker `latest` image. + branches: + - master + + # Publish `v1.2.3` tags as releases. + tags: + - v* + + # Run tests for any PRs. + pull_request: + +env: + IMAGE_NAME: nsjail + +jobs: + # Run tests. + # See also https://docs.docker.com/docker-hub/builds/automated-testing/ + test: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + + - name: Run tests + run: docker build . --file Dockerfile + + # Push image to GitHub Package Registry. + # See also https://docs.docker.com/docker-hub/builds/ + push: + # Ensure test job passes before pushing image. + needs: test + + runs-on: ubuntu-latest + if: github.event_name == 'push' + + steps: + - uses: actions/checkout@v2 + + - name: Build image + run: docker build . --file Dockerfile --tag image + + - name: Log into registry + run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login docker.pkg.github.com -u ${{ github.actor }} --password-stdin + + - name: Push image + run: | + IMAGE_ID=docker.pkg.github.com/${{ github.repository }}/$IMAGE_NAME + + # Strip git ref prefix from version + VERSION=$(echo "${{ github.ref }}" | sed -e 's,.*/\(.*\),\1,') + + # Strip "v" prefix from tag name + [[ "${{ github.ref }}" == "refs/tags/"* ]] && VERSION=$(echo $VERSION | sed -e 's/^v//') + + # Use Docker `latest` tag convention + [ "$VERSION" == "master" ] && VERSION=latest + + echo IMAGE_ID=$IMAGE_ID + echo VERSION=$VERSION + + docker tag image $IMAGE_ID:$VERSION + docker push $IMAGE_ID:$VERSION @@ -12,9 +12,11 @@ cc_binary_host { "-Wno-unused-parameter", ], cppflags: ["-fno-exceptions"], + shared_libs: ["libnl"], srcs: [ "caps.cc", "cgroup.cc", + "cgroup2.cc", "cmdline.cc", "config.cc", "contain.cc", @@ -1,4 +1,4 @@ -FROM ubuntu:16.04 +FROM ubuntu:18.04 RUN apt-get -y update && 
apt-get install -y \ autoconf \ @@ -8,6 +8,7 @@ RUN apt-get -y update && apt-get install -y \ g++ \ git \ libprotobuf-dev \ + libnl-route-3-dev \ libtool \ make \ pkg-config \ @@ -16,7 +16,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -PKG_CONFIG=$(shell which pkg-config) +PKG_CONFIG=$(shell command -v pkg-config 2> /dev/null) ifeq ($(PKG_CONFIG),) $(error "Install pkg-config to make it work") endif @@ -31,13 +31,13 @@ COMMON_FLAGS += -O2 -c \ -Wall -Wextra -Werror \ -Ikafel/include -CXXFLAGS += $(COMMON_FLAGS) $(shell pkg-config --cflags protobuf) \ +CXXFLAGS += $(USER_DEFINES) $(COMMON_FLAGS) $(shell pkg-config --cflags protobuf) \ -std=c++11 -fno-exceptions -Wno-unused -Wno-unused-parameter LDFLAGS += -pie -Wl,-z,noexecstack -lpthread $(shell pkg-config --libs protobuf) BIN = nsjail LIBS = kafel/libkafel.a -SRCS_CXX = caps.cc cgroup.cc cmdline.cc config.cc contain.cc cpu.cc logs.cc mnt.cc net.cc nsjail.cc pid.cc sandbox.cc subproc.cc uts.cc user.cc util.cc +SRCS_CXX = caps.cc cgroup.cc cgroup2.cc cmdline.cc config.cc contain.cc cpu.cc logs.cc mnt.cc net.cc nsjail.cc pid.cc sandbox.cc subproc.cc uts.cc user.cc util.cc SRCS_PROTO = config.proto SRCS_PB_CXX = $(SRCS_PROTO:.proto=.pb.cc) SRCS_PB_H = $(SRCS_PROTO:.proto=.pb.h) @@ -48,14 +48,11 @@ ifdef DEBUG CXXFLAGS += -g -ggdb -gdwarf-4 endif -USE_NL3 ?= yes -ifeq ($(USE_NL3), yes) NL3_EXISTS := $(shell pkg-config --exists libnl-route-3.0 && echo yes) ifeq ($(NL3_EXISTS), yes) - CXXFLAGS += -DNSJAIL_NL3_WITH_MACVLAN $(shell pkg-config --cflags libnl-route-3.0) + CXXFLAGS += $(shell pkg-config --cflags libnl-route-3.0) LDFLAGS += $(shell pkg-config --libs libnl-route-3.0) endif -endif .PHONY: all clean depend indent @@ -66,17 +63,21 @@ all: $(BIN) $(BIN): $(LIBS) $(OBJS) ifneq ($(NL3_EXISTS), yes) - $(warning "==========================================================") - $(warning "No support for libnl3/libnl-route-3; /sbin/ip will be used") - $(warning 
"==========================================================") + $(warning "============================================================") + $(warning "You probably miss libnl3(-dev)/libnl-route-3(-dev) libraries") + $(warning "============================================================") endif $(CXX) -o $(BIN) $(OBJS) $(LIBS) $(LDFLAGS) -kafel/libkafel.a: +.PHONY: kafel_init +kafel_init: ifeq ("$(wildcard kafel/Makefile)","") git submodule update --init endif - $(MAKE) -C kafel + +kafel/include/kafel.h: kafel_init +kafel/libkafel.a: kafel_init + CFLAGS=-fPIE $(MAKE) -C kafel # Sequence of proto deps, which doesn't fit automatic make rules config.o: $(SRCS_PB_O) $(SRCS_PB_H) @@ -104,9 +105,10 @@ indent: caps.o: caps.h nsjail.h logs.h macros.h util.h cgroup.o: cgroup.h nsjail.h logs.h util.h +cgroup2.o: cgroup2.h nsjail.h logs.h util.h cmdline.o: cmdline.h nsjail.h caps.h config.h logs.h macros.h mnt.h user.h cmdline.o: util.h -config.o: caps.h nsjail.h cmdline.h config.h config.pb.h logs.h macros.h +config.o: config.h nsjail.h caps.h cmdline.h config.pb.h logs.h macros.h config.o: mnt.h user.h util.h contain.o: contain.h nsjail.h caps.h cgroup.h cpu.h logs.h macros.h mnt.h contain.o: net.h pid.h user.h util.h uts.h @@ -116,9 +118,9 @@ mnt.o: mnt.h nsjail.h logs.h macros.h subproc.h util.h net.o: net.h nsjail.h logs.h subproc.h nsjail.o: nsjail.h cmdline.h logs.h macros.h net.h sandbox.h subproc.h util.h pid.o: pid.h nsjail.h logs.h subproc.h -sandbox.o: sandbox.h nsjail.h kafel/include/kafel.h logs.h -subproc.o: subproc.h nsjail.h cgroup.h contain.h logs.h macros.h net.h -subproc.o: sandbox.h user.h util.h +sandbox.o: sandbox.h nsjail.h kafel/include/kafel.h logs.h util.h +subproc.o: subproc.h nsjail.h cgroup.h cgroup2.h contain.h logs.h macros.h +subproc.o: net.h sandbox.h user.h util.h uts.o: uts.h nsjail.h logs.h user.o: user.h nsjail.h logs.h macros.h subproc.h util.h util.o: util.h nsjail.h logs.h macros.h @@ -357,9 +357,9 @@ Options: --rw Mount chroot dir (/) 
R/W (default: R/O) --user|-u VALUE - Username/uid of processess inside the jail (default: your current uid). You can also use inside_ns_uid:outside_ns_uid:count convention here. Can be specified multiple times + Username/uid of processes inside the jail (default: your current uid). You can also use inside_ns_uid:outside_ns_uid:count convention here. Can be specified multiple times --group|-g VALUE - Groupname/gid of processess inside the jail (default: your current gid). You can also use inside_ns_gid:global_ns_gid:count convention here. Can be specified multiple times + Groupname/gid of processes inside the jail (default: your current gid). You can also use inside_ns_gid:global_ns_gid:count convention here. Can be specified multiple times --hostname|-H VALUE UTS name (hostname) of the jail (default: 'NSJAIL') --cwd|-D VALUE @@ -368,6 +368,8 @@ Options: TCP port to bind to (enables MODE_LISTEN_TCP) (default: 0) --bindhost VALUE IP address to bind the port to (only in [MODE_LISTEN_TCP]), (default: '::') + --max_conns VALUE + Maximum number of connections across all IPs (only in [MODE_LISTEN_TCP]), (default: 0 (unlimited)) --max_conns_per_ip|-i VALUE Maximum number of connections per one IP (only in [MODE_LISTEN_TCP]), (default: 0 (unlimited)) --log|-l VALUE @@ -489,7 +491,7 @@ Options: --cgroup_cpu_ms_per_sec VALUE Number of milliseconds of CPU time per second that the process group can use (default: '0' - no limit) --cgroup_cpu_mount VALUE - Location of cpu cgroup FS (default: '/sys/fs/cgroup/net_cls') + Location of cpu cgroup FS (default: '/sys/fs/cgroup/cpu') --cgroup_cpu_parent VALUE Which pre-existing cpu cgroup to use as a parent (default: 'NSJAIL') --iface_no_lo @@ -21,6 +21,7 @@ #include "caps.h" +#include <errno.h> #include <linux/capability.h> #include <string.h> #include <sys/prctl.h> @@ -80,6 +81,15 @@ struct { #if defined(CAP_AUDIT_READ) NS_VALSTR_STRUCT(CAP_AUDIT_READ), #endif /* defined(CAP_AUDIT_READ) */ +#if defined(CAP_BPF) + 
NS_VALSTR_STRUCT(CAP_BPF), +#endif /* defined(CAP_BPF) */ +#if defined(CAP_PERFMON) + NS_VALSTR_STRUCT(CAP_PERFMON), +#endif /* defined(CAP_PERFMON) */ +#if defined(CAP_CHECKPOINT_RESTORE) + NS_VALSTR_STRUCT(CAP_CHECKPOINT_RESTORE), +#endif /* defined(CAP_CHECKPOINT_RESTORE) */ }; int nameToVal(const char* name) { @@ -88,7 +98,7 @@ int nameToVal(const char* name) { return cap.val; } } - LOG_W("Uknown capability: '%s'", name); + LOG_W("Unknown capability: '%s'", name); return -1; } @@ -112,7 +122,7 @@ static cap_user_data_t getCaps() { .version = _LINUX_CAPABILITY_VERSION_3, .pid = 0, }; - if (syscall(__NR_capget, &cap_hdr, &cap_data) == -1) { + if (util::syscall(__NR_capget, (uintptr_t)&cap_hdr, (uintptr_t)&cap_data) == -1) { PLOG_W("capget() failed"); return NULL; } @@ -124,7 +134,7 @@ static bool setCaps(const cap_user_data_t cap_data) { .version = _LINUX_CAPABILITY_VERSION_3, .pid = 0, }; - if (syscall(__NR_capset, &cap_hdr, cap_data) == -1) { + if (util::syscall(__NR_capset, (uintptr_t)&cap_hdr, (uintptr_t)cap_data) == -1) { PLOG_W("capset() failed"); return false; } @@ -247,6 +257,11 @@ bool initNs(nsjconf_t* nsjconf) { if (getInheritable(cap_data, i.val)) { continue; } + if (prctl(PR_CAPBSET_READ, (unsigned long)i.val, 0UL, 0UL, 0UL) == -1 && + errno == EINVAL) { + LOG_D("Skipping unsupported capability: %s", i.name); + continue; + } dbgmsg.append(" ").append(i.name); if (prctl(PR_CAPBSET_DROP, (unsigned long)i.val, 0UL, 0UL, 0UL) == -1) { PLOG_W("prctl(PR_CAPBSET_DROP, %s)", i.name); @@ -38,12 +38,11 @@ namespace cgroup { static bool createCgroup(const std::string& cgroup_path, pid_t pid) { - LOG_D("Create '%s' for PID=%d", cgroup_path.c_str(), (int)pid); + LOG_D("Create '%s' for pid=%d", cgroup_path.c_str(), (int)pid); if (mkdir(cgroup_path.c_str(), 0700) == -1 && errno != EEXIST) { PLOG_W("mkdir('%s', 0700) failed", cgroup_path.c_str()); return false; } - return true; } @@ -55,14 +54,13 @@ static bool writeToCgroup( LOG_W("Could not update %s", 
what.c_str()); return false; } - return true; } static bool addPidToTaskList(const std::string& cgroup_path, pid_t pid) { std::string pid_str = std::to_string(pid); std::string tasks_path = cgroup_path + "/tasks"; - LOG_D("Adding PID='%s' to '%s'", pid_str.c_str(), tasks_path.c_str()); + LOG_D("Adding pid='%s' to '%s'", pid_str.c_str(), tasks_path.c_str()); return writeToCgroup(tasks_path, pid_str, "'" + tasks_path + "' task list"); } @@ -136,12 +134,12 @@ static bool initNsFromParentCpu(nsjconf_t* nsjconf, pid_t pid) { "/NSJAIL." + std::to_string(pid); RETURN_ON_FAILURE(createCgroup(cpu_cgroup_path, pid)); - std::string cpu_ms_per_sec_str = std::to_string(nsjconf->cgroup_cpu_ms_per_sec * 1000U); RETURN_ON_FAILURE( - writeToCgroup(cpu_cgroup_path + "/cpu.cfs_quota_us", cpu_ms_per_sec_str, "cpu quota")); + writeToCgroup(cpu_cgroup_path + "/cpu.cfs_period_us", "1000000", "cpu period")); + std::string cpu_ms_per_sec_str = std::to_string(nsjconf->cgroup_cpu_ms_per_sec * 1000U); RETURN_ON_FAILURE( - writeToCgroup(cpu_cgroup_path + "/cpu.cfs_period_us", "1000000", "cpu period")); + writeToCgroup(cpu_cgroup_path + "/cpu.cfs_quota_us", cpu_ms_per_sec_str, "cpu quota")); return addPidToTaskList(cpu_cgroup_path, pid); } diff --git a/cgroup2.cc b/cgroup2.cc new file mode 100644 index 0000000..6b0dc09 --- /dev/null +++ b/cgroup2.cc @@ -0,0 +1,137 @@ +/* + + nsjail - cgroup2 namespacing + ----------------------------------------- + + Copyright 2014 Google Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. + +*/ + +#include "cgroup2.h" + +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <stdarg.h> +#include <stdio.h> +#include <string.h> +#include <sys/stat.h> +#include <unistd.h> + +#include <fstream> +#include <iostream> +#include <sstream> + +#include "logs.h" +#include "util.h" + +namespace cgroup2 { + +static std::string getCgroupPath(nsjconf_t *nsjconf, pid_t pid) { + return nsjconf->cgroupv2_mount + "/NSJAIL." + std::to_string(pid); +} + +static bool createCgroup(const std::string &cgroup_path, pid_t pid) { + LOG_D("Create '%s' for pid=%d", cgroup_path.c_str(), (int)pid); + if (mkdir(cgroup_path.c_str(), 0700) == -1 && errno != EEXIST) { + PLOG_W("mkdir('%s', 0700) failed", cgroup_path.c_str()); + return false; + } + return true; +} + +static bool writeToCgroup( + const std::string &cgroup_path, const std::string &resource, const std::string &value) { + LOG_I("Setting '%s' to '%s'", resource.c_str(), value.c_str()); + + if (!util::writeBufToFile( + (cgroup_path + "/" + resource).c_str(), value.c_str(), value.length(), O_WRONLY)) { + LOG_W("Could not update %s", resource.c_str()); + return false; + } + return true; +} + +static bool addPidToProcList(const std::string &cgroup_path, pid_t pid) { + std::string pid_str = std::to_string(pid); + + LOG_D("Adding pid='%s' to cgroup.procs", pid_str.c_str()); + if (!util::writeBufToFile((cgroup_path + "/cgroup.procs").c_str(), pid_str.c_str(), + pid_str.length(), O_WRONLY)) { + LOG_W("Could not update cgroup.procs"); + return false; + } + return true; +} + +static void removeCgroup(const std::string &cgroup_path) { + LOG_D("Remove '%s'", cgroup_path.c_str()); + if (rmdir(cgroup_path.c_str()) == -1) { + PLOG_W("rmdir('%s') failed", cgroup_path.c_str()); + } +} + +static bool initNsFromParentMem(nsjconf_t *nsjconf, pid_t pid) { + if (nsjconf->cgroup_mem_max == (size_t)0) { + return true; + } + + 
std::string cgroup_path = getCgroupPath(nsjconf, pid); + RETURN_ON_FAILURE(createCgroup(cgroup_path, pid)); + RETURN_ON_FAILURE(addPidToProcList(cgroup_path, pid)); + return writeToCgroup(cgroup_path, "memory.max", std::to_string(nsjconf->cgroup_mem_max)); +} + +static bool initNsFromParentPids(nsjconf_t *nsjconf, pid_t pid) { + if (nsjconf->cgroup_pids_max == 0U) { + return true; + } + std::string cgroup_path = getCgroupPath(nsjconf, pid); + RETURN_ON_FAILURE(createCgroup(cgroup_path, pid)); + RETURN_ON_FAILURE(addPidToProcList(cgroup_path, pid)); + return writeToCgroup(cgroup_path, "pids.max", std::to_string(nsjconf->cgroup_pids_max)); +} + +static bool initNsFromParentCpu(nsjconf_t *nsjconf, pid_t pid) { + if (nsjconf->cgroup_cpu_ms_per_sec == 0U) { + return true; + } + + std::string cgroup_path = getCgroupPath(nsjconf, pid); + RETURN_ON_FAILURE(createCgroup(cgroup_path, pid)); + RETURN_ON_FAILURE(addPidToProcList(cgroup_path, pid)); + + // The maximum bandwidth limit in the format: `$MAX $PERIOD`. + // This indicates that the group may consume up to $MAX in each $PERIOD + // duration. 
+ std::string cpu_ms_per_sec_str = std::to_string(nsjconf->cgroup_cpu_ms_per_sec * 1000U); + cpu_ms_per_sec_str += " 1000000"; + return writeToCgroup(cgroup_path, "cpu.max", cpu_ms_per_sec_str); +} + +bool initNsFromParent(nsjconf_t *nsjconf, pid_t pid) { + RETURN_ON_FAILURE(initNsFromParentMem(nsjconf, pid)); + RETURN_ON_FAILURE(initNsFromParentPids(nsjconf, pid)); + return initNsFromParentCpu(nsjconf, pid); +} + +void finishFromParent(nsjconf_t *nsjconf, pid_t pid) { + if (nsjconf->cgroup_mem_max != (size_t)0 || nsjconf->cgroup_pids_max != 0U || + nsjconf->cgroup_cpu_ms_per_sec != 0U) { + removeCgroup(getCgroupPath(nsjconf, pid)); + } +} + +} // namespace cgroup2 diff --git a/cgroup2.h b/cgroup2.h new file mode 100644 index 0000000..3e0cc71 --- /dev/null +++ b/cgroup2.h @@ -0,0 +1,38 @@ +/* + + nsjail - cgroup2 namespacing + ----------------------------------------- + + Copyright 2014 Google Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ +*/ + +#ifndef NS_CGROUP2_H +#define NS_CGROUP2_H + +#include <stdbool.h> +#include <stddef.h> + +#include "nsjail.h" + +namespace cgroup2 { + +bool initNsFromParent(nsjconf_t* nsjconf, pid_t pid); +bool initNs(void); +void finishFromParent(nsjconf_t* nsjconf, pid_t pid); + +} // namespace cgroup2 + +#endif /* _CGROUP2_H */ @@ -76,13 +76,15 @@ struct custom_option custom_opts[] = { { { "exec_file", required_argument, NULL, 'x' }, "File to exec (default: argv[0])" }, { { "execute_fd", no_argument, NULL, 0x0607 }, "Use execveat() to execute a file-descriptor instead of executing the binary path. In such case argv[0]/exec_file denotes a file path before mount namespacing" }, { { "chroot", required_argument, NULL, 'c' }, "Directory containing / of the jail (default: none)" }, + { { "no_pivotroot", no_argument, NULL, 0x600 }, "When creating a mount namespace, use mount(MS_MOVE) and chroot rather than pivot_root. Usefull when pivot_root is disallowed (e.g. initramfs). Note: escapable is some configuration" }, { { "rw", no_argument, NULL, 0x601 }, "Mount chroot dir (/) R/W (default: R/O)" }, - { { "user", required_argument, NULL, 'u' }, "Username/uid of processess inside the jail (default: your current uid). You can also use inside_ns_uid:outside_ns_uid:count convention here. Can be specified multiple times" }, - { { "group", required_argument, NULL, 'g' }, "Groupname/gid of processess inside the jail (default: your current gid). You can also use inside_ns_gid:global_ns_gid:count convention here. Can be specified multiple times" }, + { { "user", required_argument, NULL, 'u' }, "Username/uid of processes inside the jail (default: your current uid). You can also use inside_ns_uid:outside_ns_uid:count convention here. Can be specified multiple times" }, + { { "group", required_argument, NULL, 'g' }, "Groupname/gid of processes inside the jail (default: your current gid). You can also use inside_ns_gid:global_ns_gid:count convention here. 
Can be specified multiple times" }, { { "hostname", required_argument, NULL, 'H' }, "UTS name (hostname) of the jail (default: 'NSJAIL')" }, { { "cwd", required_argument, NULL, 'D' }, "Directory in the namespace the process will run (default: '/')" }, { { "port", required_argument, NULL, 'p' }, "TCP port to bind to (enables MODE_LISTEN_TCP) (default: 0)" }, { { "bindhost", required_argument, NULL, 0x604 }, "IP address to bind the port to (only in [MODE_LISTEN_TCP]), (default: '::')" }, + { { "max_conns", required_argument, NULL, 0x608 }, "Maximum number of connections across all IPs (only in [MODE_LISTEN_TCP]), (default: 0 (unlimited))" }, { { "max_conns_per_ip", required_argument, NULL, 'i' }, "Maximum number of connections per one IP (only in [MODE_LISTEN_TCP]), (default: 0 (unlimited))" }, { { "log", required_argument, NULL, 'l' }, "Log file (default: use log_fd)" }, { { "log_fd", required_argument, NULL, 'L' }, "Log FD (default: 2)" }, @@ -92,8 +94,8 @@ struct custom_option custom_opts[] = { { { "verbose", no_argument, NULL, 'v' }, "Verbose output" }, { { "quiet", no_argument, NULL, 'q' }, "Log warning and more important messages only" }, { { "really_quiet", no_argument, NULL, 'Q' }, "Log fatal messages only" }, - { { "keep_env", no_argument, NULL, 'e' }, "Pass all environment variables to the child process (default: all envvars are cleared)" }, - { { "env", required_argument, NULL, 'E' }, "Additional environment variable (can be used multiple times). If the envvar doesn't contain '=' (e.g. just the 'DISPLAY' string), the current envvar value will be used" }, + { { "keep_env", no_argument, NULL, 'e' }, "Pass all environment variables to the child process (default: all envars are cleared)" }, + { { "env", required_argument, NULL, 'E' }, "Additional environment variable (can be used multiple times). If the envar doesn't contain '=' (e.g. 
just the 'DISPLAY' string), the current envar value will be used" }, { { "keep_caps", no_argument, NULL, 0x0501 }, "Don't drop any capabilities" }, { { "cap", required_argument, NULL, 0x0509 }, "Retain this capability, e.g. CAP_PTRACE (can be specified multiple times)" }, { { "silent", no_argument, NULL, 0x0502 }, "Redirect child process' fd:0/1/2 to /dev/null" }, @@ -108,6 +110,10 @@ struct custom_option custom_opts[] = { { { "rlimit_nofile", required_argument, NULL, 0x0205 }, "RLIMIT_NOFILE, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 32)" }, { { "rlimit_nproc", required_argument, NULL, 0x0206 }, "RLIMIT_NPROC, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 'soft')" }, { { "rlimit_stack", required_argument, NULL, 0x0207 }, "RLIMIT_STACK in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 'soft')" }, + { { "rlimit_memlock", required_argument, NULL, 0x0209 }, "RLIMIT_MEMLOCK in KB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 'soft')" }, + { { "rlimit_rtprio", required_argument, NULL, 0x0210 }, "RLIMIT_RTPRIO, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 'soft')" }, + { { "rlimit_msgqueue", required_argument, NULL, 0x0211 }, "RLIMIT_MSGQUEUE in bytes, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 'soft')" }, + { { "disable_rlimits", no_argument, NULL, 0x0208 }, "Disable all rlimits, default to limits set by parent" }, { { "persona_addr_compat_layout", no_argument, NULL, 0x0301 }, "personality(ADDR_COMPAT_LAYOUT)" }, { { "persona_mmap_page_zero", no_argument, NULL, 0x0302 }, 
"personality(MMAP_PAGE_ZERO)" }, { { "persona_read_implies_exec", no_argument, NULL, 0x0303 }, "personality(READ_IMPLIES_EXEC)" }, @@ -120,6 +126,7 @@ struct custom_option custom_opts[] = { { { "disable_clone_newipc", no_argument, NULL, 0x0405 }, "Don't use CLONE_NEWIPC" }, { { "disable_clone_newuts", no_argument, NULL, 0x0406 }, "Don't use CLONE_NEWUTS" }, { { "disable_clone_newcgroup", no_argument, NULL, 0x0407 }, "Don't use CLONE_NEWCGROUP. Might be required for kernel versions < 4.6" }, + { { "enable_clone_newtime", no_argument, NULL, 0x0408 }, "Use CLONE_NEWTIME. Supported with kernel versions >= 5.3" }, { { "uid_mapping", required_argument, NULL, 'U' }, "Add a custom uid mapping of the form inside_uid:outside_uid:count. Setting this requires newuidmap (set-uid) to be present" }, { { "gid_mapping", required_argument, NULL, 'G' }, "Add a custom gid mapping of the form inside_gid:outside_gid:count. Setting this requires newgidmap (set-uid) to be present" }, { { "bindmount_ro", required_argument, NULL, 'R' }, "List of mountpoints to be mounted --bind (ro) inside the container. Can be specified multiple times. Supports 'source' syntax, or 'source:dest'" }, @@ -133,6 +140,7 @@ struct custom_option custom_opts[] = { { { "seccomp_policy", required_argument, NULL, 'P' }, "Path to file containing seccomp-bpf policy (see kafel/)" }, { { "seccomp_string", required_argument, NULL, 0x0901 }, "String with kafel seccomp-bpf policy (see kafel/)" }, { { "seccomp_log", no_argument, NULL, 0x0902 }, "Use SECCOMP_FILTER_FLAG_LOG. Log all actions except SECCOMP_RET_ALLOW). Supported since kernel version 4.14" }, + { { "nice_level", required_argument, NULL, 0x0903 }, "Set jailed process niceness (-20 is highest -priority, 19 is lowest). 
By default, set to 19" }, { { "cgroup_mem_max", required_argument, NULL, 0x0801 }, "Maximum number of bytes to use in the group (default: '0' - disabled)" }, { { "cgroup_mem_mount", required_argument, NULL, 0x0802 }, "Location of memory cgroup FS (default: '/sys/fs/cgroup/memory')" }, { { "cgroup_mem_parent", required_argument, NULL, 0x0803 }, "Which pre-existing memory cgroup to use as a parent (default: 'NSJAIL')" }, @@ -143,8 +151,10 @@ struct custom_option custom_opts[] = { { { "cgroup_net_cls_mount", required_argument, NULL, 0x0822 }, "Location of net_cls cgroup FS (default: '/sys/fs/cgroup/net_cls')" }, { { "cgroup_net_cls_parent", required_argument, NULL, 0x0823 }, "Which pre-existing net_cls cgroup to use as a parent (default: 'NSJAIL')" }, { { "cgroup_cpu_ms_per_sec", required_argument, NULL, 0x0831 }, "Number of milliseconds of CPU time per second that the process group can use (default: '0' - no limit)" }, - { { "cgroup_cpu_mount", required_argument, NULL, 0x0822 }, "Location of cpu cgroup FS (default: '/sys/fs/cgroup/net_cls')" }, + { { "cgroup_cpu_mount", required_argument, NULL, 0x0832 }, "Location of cpu cgroup FS (default: '/sys/fs/cgroup/cpu')" }, { { "cgroup_cpu_parent", required_argument, NULL, 0x0833 }, "Which pre-existing cpu cgroup to use as a parent (default: 'NSJAIL')" }, + { { "cgroupv2_mount", required_argument, NULL, 0x0834}, "Location of cgroupv2 directory (default: '/sys/fs/cgroup')"}, + { { "use_cgroupv2", no_argument, NULL, 0x0835}, "Use cgroup v2"}, { { "iface_no_lo", no_argument, NULL, 0x700 }, "Don't bring the 'lo' interface up" }, { { "iface_own", required_argument, NULL, 0x704 }, "Move this existing network interface into the new NET namespace. 
Can be specified multiple times" }, { { "macvlan_iface", required_argument, NULL, 'I' }, "Interface which will be cloned (MACVLAN) and put inside the subprocess' namespace as 'vs'" }, @@ -152,6 +162,7 @@ struct custom_option custom_opts[] = { { { "macvlan_vs_nm", required_argument, NULL, 0x702 }, "Netmask of the 'vs' interface (e.g. \"255.255.255.0\")" }, { { "macvlan_vs_gw", required_argument, NULL, 0x703 }, "Default GW for the 'vs' interface (e.g. \"192.168.0.1\")" }, { { "macvlan_vs_ma", required_argument, NULL, 0x705 }, "MAC-address of the 'vs' interface (e.g. \"ba:ad:ba:be:45:00\")" }, + { { "macvlan_vs_mo", required_argument, NULL, 0x706 }, "Mode of the 'vs' interface. Can be either 'private', 'vepa', 'bridge' or 'passthru' (default: 'private')" }, }; // clang-format on @@ -194,7 +205,7 @@ void addEnv(nsjconf_t* nsjconf, const std::string& env) { } char* e = getenv(env.c_str()); if (!e) { - LOG_W("Requested to use the '%s' envvar, but it's not set. It'll be ignored", + LOG_W("Requested to use the '%s' envar, but it's not set. It'll be ignored", env.c_str()); return; } @@ -222,23 +233,25 @@ void logParams(nsjconf_t* nsjconf) { LOG_I( "Jail parameters: hostname:'%s', chroot:'%s', process:'%s', bind:[%s]:%d, " - "max_conns_per_ip:%u, time_limit:%" PRId64 + "max_conns:%u, max_conns_per_ip:%u, time_limit:%" PRId64 ", personality:%#lx, daemonize:%s, clone_newnet:%s, " "clone_newuser:%s, clone_newns:%s, clone_newpid:%s, clone_newipc:%s, clone_newuts:%s, " - "clone_newcgroup:%s, keep_caps:%s, disable_no_new_privs:%s, max_cpus:%zu", + "clone_newcgroup:%s, clone_newtime:%s, keep_caps:%s, disable_no_new_privs:%s, " + "max_cpus:%zu", nsjconf->hostname.c_str(), nsjconf->chroot.c_str(), nsjconf->exec_file.empty() ? 
nsjconf->argv[0].c_str() : nsjconf->exec_file.c_str(), - nsjconf->bindhost.c_str(), nsjconf->port, nsjconf->max_conns_per_ip, nsjconf->tlimit, - nsjconf->personality, logYesNo(nsjconf->daemonize), logYesNo(nsjconf->clone_newnet), - logYesNo(nsjconf->clone_newuser), logYesNo(nsjconf->clone_newns), - logYesNo(nsjconf->clone_newpid), logYesNo(nsjconf->clone_newipc), - logYesNo(nsjconf->clone_newuts), logYesNo(nsjconf->clone_newcgroup), + nsjconf->bindhost.c_str(), nsjconf->port, nsjconf->max_conns, nsjconf->max_conns_per_ip, + nsjconf->tlimit, nsjconf->personality, logYesNo(nsjconf->daemonize), + logYesNo(nsjconf->clone_newnet), logYesNo(nsjconf->clone_newuser), + logYesNo(nsjconf->clone_newns), logYesNo(nsjconf->clone_newpid), + logYesNo(nsjconf->clone_newipc), logYesNo(nsjconf->clone_newuts), + logYesNo(nsjconf->clone_newcgroup), logYesNo(nsjconf->clone_newtime), logYesNo(nsjconf->keep_caps), logYesNo(nsjconf->disable_no_new_privs), nsjconf->max_cpus); for (const auto& p : nsjconf->mountpts) { - LOG_I("%s: %s", p.is_symlink ? "Symlink" : "Mount point", - mnt::describeMountPt(p).c_str()); + LOG_I( + "%s: %s", p.is_symlink ? 
"Symlink" : "Mount", mnt::describeMountPt(p).c_str()); } for (const auto& uid : nsjconf->uids) { LOG_I("Uid map: inside_uid:%lu outside_uid:%lu count:%zu newuidmap:%s", @@ -298,16 +311,23 @@ static std::string argFromVec(const std::vector<std::string>& vec, size_t pos) { } static bool setupArgv(nsjconf_t* nsjconf, int argc, char** argv, int optind) { - for (int i = optind; i < argc; i++) { - nsjconf->argv.push_back(argv[i]); + /* + * If user provided cmdline via nsjail [opts] -- [cmdline], then override the one from the + * config file + */ + if (optind < argc) { + nsjconf->argv.clear(); + for (int i = optind; i < argc; i++) { + nsjconf->argv.push_back(argv[i]); + } } - if (nsjconf->argv.empty()) { - cmdlineUsage(argv[0]); - LOG_E("No command provided"); - return false; + if (nsjconf->exec_file.empty() && nsjconf->argv.size() > 0) { + nsjconf->exec_file = nsjconf->argv[0]; } if (nsjconf->exec_file.empty()) { - nsjconf->exec_file = nsjconf->argv[0]; + cmdlineUsage(argv[0]); + LOG_E("No command-line provided"); + return false; } if (nsjconf->use_execveat) { @@ -376,6 +396,18 @@ void setupUsers(nsjconf_t* nsjconf) { } } +std::string parseMACVlanMode(const char* optarg) { + if (strcasecmp(optarg, "private") != 0 && strcasecmp(optarg, "vepa") != 0 && + strcasecmp(optarg, "bridge") != 0 && strcasecmp(optarg, "passthru") != 0) { + LOG_F( + "macvlan mode can only be one of the values: " + "'private'/'vepa'/'bridge'/'passthru' ('%s' " + "provided).", + optarg); + } + return std::string(optarg); +} + std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) { std::unique_ptr<nsjconf_t> nsjconf(new nsjconf_t); @@ -391,26 +423,33 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) { nsjconf->keep_env = false; nsjconf->keep_caps = false; nsjconf->disable_no_new_privs = false; - nsjconf->rl_as = 512 * (1024 * 1024); - nsjconf->rl_core = 0; - nsjconf->rl_cpu = 600; - nsjconf->rl_fsize = 1 * (1024 * 1024); - nsjconf->rl_nofile = 32; + nsjconf->rl_as = 4096ULL * 
(1024ULL * 1024ULL); + nsjconf->rl_core = 0ULL; + nsjconf->rl_cpu = 600ULL; + nsjconf->rl_fsize = 1ULL * (1024ULL * 1024ULL); + nsjconf->rl_nofile = 32ULL; nsjconf->rl_nproc = parseRLimit(RLIMIT_NPROC, "soft", 1); nsjconf->rl_stack = parseRLimit(RLIMIT_STACK, "soft", 1); + nsjconf->rl_mlock = parseRLimit(RLIMIT_MEMLOCK, "soft", 1); + nsjconf->rl_rtpr = parseRLimit(RLIMIT_RTPRIO, "soft", 1); + nsjconf->rl_msgq = parseRLimit(RLIMIT_MSGQUEUE, "soft", 1); + nsjconf->disable_rl = false; nsjconf->personality = 0; nsjconf->clone_newnet = true; nsjconf->clone_newuser = true; nsjconf->clone_newns = true; + nsjconf->no_pivotroot = false; nsjconf->clone_newpid = true; nsjconf->clone_newipc = true; nsjconf->clone_newuts = true; nsjconf->clone_newcgroup = true; + nsjconf->clone_newtime = false; nsjconf->mode = MODE_STANDALONE_ONCE; nsjconf->is_root_rw = false; nsjconf->is_silent = false; nsjconf->stderr_to_null = false; nsjconf->skip_setsid = false; + nsjconf->max_conns = 0; nsjconf->max_conns_per_ip = 0; nsjconf->proc_path = "/proc"; nsjconf->is_proc_rw = false; @@ -426,16 +465,21 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) { nsjconf->cgroup_cpu_mount = "/sys/fs/cgroup/cpu"; nsjconf->cgroup_cpu_parent = "NSJAIL"; nsjconf->cgroup_cpu_ms_per_sec = 0U; + nsjconf->cgroupv2_mount = "/sys/fs/cgroup"; + nsjconf->use_cgroupv2 = false; nsjconf->iface_lo = true; nsjconf->iface_vs_ip = "0.0.0.0"; nsjconf->iface_vs_nm = "255.255.255.0"; nsjconf->iface_vs_gw = "0.0.0.0"; nsjconf->iface_vs_ma = ""; + nsjconf->iface_vs_mo = "private"; nsjconf->orig_uid = getuid(); + nsjconf->orig_euid = geteuid(); nsjconf->num_cpus = sysconf(_SC_NPROCESSORS_ONLN); nsjconf->seccomp_fprog.filter = NULL; nsjconf->seccomp_fprog.len = 0; nsjconf->seccomp_log = false; + nsjconf->nice_level = 19; nsjconf->openfds.push_back(STDIN_FILENO); nsjconf->openfds.push_back(STDOUT_FILENO); @@ -477,20 +521,27 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) { nsjconf->chroot = optarg; break; 
case 'p': + if (!util::isANumber(optarg)) { + LOG_E("Couldn't parse TCP port '%s'", optarg); + return nullptr; + } nsjconf->port = strtoumax(optarg, NULL, 0); nsjconf->mode = MODE_LISTEN_TCP; break; case 0x604: nsjconf->bindhost = optarg; break; + case 0x608: + nsjconf->max_conns = strtoul(optarg, NULL, 0); + break; case 'i': nsjconf->max_conns_per_ip = strtoul(optarg, NULL, 0); break; case 'l': - logs::logFile(optarg); + logs::logFile(optarg, STDERR_FILENO); break; case 'L': - logs::logFile(std::string("/dev/fd/") + optarg); + logs::logFile("", std::strtol(optarg, NULL, 0)); break; case 'd': nsjconf->daemonize = true; @@ -535,6 +586,18 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) { case 0x0207: nsjconf->rl_stack = parseRLimit(RLIMIT_STACK, optarg, (1024 * 1024)); break; + case 0x0209: + nsjconf->rl_mlock = parseRLimit(RLIMIT_MEMLOCK, optarg, 1024); + break; + case 0x0210: + nsjconf->rl_rtpr = parseRLimit(RLIMIT_RTPRIO, optarg, 1); + break; + case 0x0211: + nsjconf->rl_msgq = parseRLimit(RLIMIT_MSGQUEUE, optarg, 1); + break; + case 0x0208: + nsjconf->disable_rl = true; + break; case 0x0301: nsjconf->personality |= ADDR_COMPAT_LAYOUT; break; @@ -572,7 +635,7 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) { nsjconf->clone_newcgroup = false; break; case 0x0408: - nsjconf->clone_newcgroup = true; + nsjconf->clone_newtime = true; break; case 0x0501: nsjconf->keep_caps = true; @@ -602,6 +665,9 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) { } nsjconf->caps.push_back(cap); } break; + case 0x0600: + nsjconf->no_pivotroot = true; + break; case 0x0601: nsjconf->is_root_rw = true; break; @@ -777,6 +843,9 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) { case 0x705: nsjconf->iface_vs_ma = optarg; break; + case 0x706: + nsjconf->iface_vs_mo = parseMACVlanMode(optarg); + break; case 0x801: nsjconf->cgroup_mem_max = (size_t)strtoull(optarg, NULL, 0); break; @@ -813,6 +882,12 @@ std::unique_ptr<nsjconf_t> 
parseArgs(int argc, char* argv[]) { case 0x833: nsjconf->cgroup_cpu_parent = optarg; break; + case 0x834: + nsjconf->cgroupv2_mount = optarg; + break; + case 0x835: + nsjconf->use_cgroupv2 = true; + break; case 'P': nsjconf->kafel_file_path = optarg; break; @@ -822,6 +897,9 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) { case 0x902: nsjconf->seccomp_log = true; break; + case 0x903: + nsjconf->nice_level = (int)strtol(optarg, NULL, 0); + break; default: cmdlineUsage(argv[0]); return nullptr; @@ -830,7 +908,7 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) { } if (nsjconf->daemonize && !logs::logSet()) { - logs::logFile(_LOG_DEFAULT_FILE); + logs::logFile(_LOG_DEFAULT_FILE, STDERR_FILENO); } if (!setupMounts(nsjconf.get())) { return nullptr; @@ -19,7 +19,11 @@ */ +#include "config.h" + #include <fcntl.h> +#include <google/protobuf/io/zero_copy_stream_impl.h> +#include <google/protobuf/text_format.h> #include <stdio.h> #include <sys/mount.h> #include <sys/personality.h> @@ -27,15 +31,12 @@ #include <sys/stat.h> #include <sys/types.h> -#include <google/protobuf/io/zero_copy_stream_impl.h> -#include <google/protobuf/text_format.h> #include <fstream> #include <string> #include <vector> #include "caps.h" #include "cmdline.h" -#include "config.h" #include "config.pb.h" #include "logs.h" #include "macros.h" @@ -78,28 +79,26 @@ static bool configParseInternal(nsjconf_t* nsjconf, const nsjail::NsJailConfig& nsjconf->mode = MODE_STANDALONE_EXECVE; break; default: - LOG_E("Uknown running mode: %d", njc.mode()); + LOG_E("Unknown running mode: %d", njc.mode()); return false; } - if (njc.has_chroot_dir()) { - nsjconf->chroot = njc.chroot_dir(); - } - nsjconf->is_root_rw = njc.is_root_rw(); nsjconf->hostname = njc.hostname(); nsjconf->cwd = njc.cwd(); nsjconf->port = njc.port(); nsjconf->bindhost = njc.bindhost(); + nsjconf->max_conns = njc.max_conns(); nsjconf->max_conns_per_ip = njc.max_conns_per_ip(); nsjconf->tlimit = njc.time_limit(); 
nsjconf->max_cpus = njc.max_cpus(); nsjconf->daemonize = njc.daemon(); if (njc.has_log_fd()) { - logs::logFile(std::string("/dev/fd/") + std::to_string(njc.log_fd())); + logs::logFile("", njc.log_fd()); } if (njc.has_log_file()) { - logs::logFile(njc.log_file()); + logs::logFile(njc.log_file(), STDERR_FILENO); } + if (njc.has_log_level()) { switch (njc.log_level()) { case nsjail::LogLevel::DEBUG: @@ -159,6 +158,14 @@ static bool configParseInternal(nsjconf_t* nsjconf, const nsjail::NsJailConfig& nsjconf->rl_nproc = configRLimit(RLIMIT_NPROC, njc.rlimit_nproc_type(), njc.rlimit_nproc()); nsjconf->rl_stack = configRLimit( RLIMIT_STACK, njc.rlimit_stack_type(), njc.rlimit_stack(), 1024UL * 1024UL); + nsjconf->rl_mlock = + configRLimit(RLIMIT_MEMLOCK, njc.rlimit_memlock_type(), njc.rlimit_memlock(), 1024UL); + nsjconf->rl_rtpr = + configRLimit(RLIMIT_RTPRIO, njc.rlimit_rtprio_type(), njc.rlimit_rtprio()); + nsjconf->rl_msgq = + configRLimit(RLIMIT_MSGQUEUE, njc.rlimit_msgqueue_type(), njc.rlimit_msgqueue()); + + nsjconf->disable_rl = njc.disable_rl(); if (njc.persona_addr_compat_layout()) { nsjconf->personality |= ADDR_COMPAT_LAYOUT; @@ -183,6 +190,9 @@ static bool configParseInternal(nsjconf_t* nsjconf, const nsjail::NsJailConfig& nsjconf->clone_newipc = njc.clone_newipc(); nsjconf->clone_newuts = njc.clone_newuts(); nsjconf->clone_newcgroup = njc.clone_newcgroup(); + nsjconf->clone_newtime = njc.clone_newtime(); + + nsjconf->no_pivotroot = njc.no_pivotroot(); for (ssize_t i = 0; i < njc.uidmap_size(); i++) { if (!user::parseId(nsjconf, njc.uidmap(i).inside_id(), njc.uidmap(i).outside_id(), @@ -238,6 +248,7 @@ static bool configParseInternal(nsjconf_t* nsjconf, const nsjail::NsJailConfig& nsjconf->kafel_string += '\n'; } nsjconf->seccomp_log = njc.seccomp_log(); + nsjconf->nice_level = njc.nice_level(); nsjconf->cgroup_mem_max = njc.cgroup_mem_max(); nsjconf->cgroup_mem_mount = njc.cgroup_mem_mount(); @@ -251,6 +262,8 @@ static bool configParseInternal(nsjconf_t* 
nsjconf, const nsjail::NsJailConfig& nsjconf->cgroup_cpu_ms_per_sec = njc.cgroup_cpu_ms_per_sec(); nsjconf->cgroup_cpu_mount = njc.cgroup_cpu_mount(); nsjconf->cgroup_cpu_parent = njc.cgroup_cpu_parent(); + nsjconf->cgroupv2_mount = njc.cgroupv2_mount(); + nsjconf->use_cgroupv2 = njc.use_cgroupv2(); nsjconf->iface_lo = !(njc.iface_no_lo()); for (ssize_t i = 0; i < njc.iface_own().size(); i++) { @@ -263,10 +276,13 @@ static bool configParseInternal(nsjconf_t* nsjconf, const nsjail::NsJailConfig& nsjconf->iface_vs_nm = njc.macvlan_vs_nm(); nsjconf->iface_vs_gw = njc.macvlan_vs_gw(); nsjconf->iface_vs_ma = njc.macvlan_vs_ma(); + nsjconf->iface_vs_mo = njc.macvlan_vs_mo(); if (njc.has_exec_bin()) { - nsjconf->exec_file = njc.exec_bin().path(); - nsjconf->argv.push_back(njc.exec_bin().path()); + if (njc.exec_bin().has_path()) { + nsjconf->exec_file = njc.exec_bin().path(); + nsjconf->argv.push_back(njc.exec_bin().path()); + } for (ssize_t i = 0; i < njc.exec_bin().arg().size(); i++) { nsjconf->argv.push_back(njc.exec_bin().arg(i)); } @@ -287,7 +303,7 @@ static void LogHandler( bool parseFile(nsjconf_t* nsjconf, const char* file) { LOG_D("Parsing configuration from '%s'", file); - int fd = open(file, O_RDONLY | O_CLOEXEC); + int fd = TEMP_FAILURE_RETRY(open(file, O_RDONLY | O_CLOEXEC)); if (fd == -1) { PLOG_W("Couldn't open config file '%s'", file); return false; diff --git a/config.proto b/config.proto index 3988543..96a2b10 100644 --- a/config.proto +++ b/config.proto @@ -4,14 +4,14 @@ package nsjail; enum Mode { LISTEN = 0; /* Listening on a TCP port */ - ONCE = 1; /* Running the command once only */ - RERUN = 2; /* Re-executing the command (forever) */ + ONCE = 1; /* Running the command once only */ + RERUN = 2; /* Re-executing the command (forever) */ EXECVE = 3; /* Executing command w/o the supervisor */ } /* Should be self explanatory */ enum LogLevel { - DEBUG = 0; /* Equivalent to the '-v' cmd-line option */ - INFO = 1; /* Default level */ + DEBUG = 0; /* 
Equivalent to the '-v' cmd-line option */ + INFO = 1; /* Default level */ WARNING = 2; /* Equivalent to the '-q' cmd-line option */ ERROR = 3; FATAL = 4; @@ -28,13 +28,13 @@ message IdMap { message MountPt { /* Can be skipped for filesystems like 'proc' */ optional string src = 1 [default = ""]; - /* Should 'src' path be prefixed with this envvar? */ + /* Should 'src' path be prefixed with this envar? */ optional string prefix_src_env = 2 [default = ""]; /* If specified, contains buffer that will be written to the dst file */ optional bytes src_content = 3 [default = ""]; /* Mount point inside jail */ required string dst = 4 [default = ""]; - /* Should 'dst' path be prefixed with this envvar? */ + /* Should 'dst' path be prefixed with this envar? */ optional string prefix_dst_env = 5 [default = ""]; /* Can be empty for mount --bind mounts */ optional string fstype = 6 [default = ""]; @@ -81,163 +81,185 @@ message NsJailConfig { /* Execution mode: see 'msg Mode' description for more */ optional Mode mode = 3 [default = ONCE]; - /* Equivalent to a bind mount with dst='/'. DEPRECATED: Use bind mounts. */ - optional string chroot_dir = 4 [deprecated = true]; - /* Applies both to the chroot_dir and to /proc mounts. DEPRECATED: Use bind mounts */ - optional bool is_root_rw = 5 [default = false, deprecated = true]; /* Hostname inside jail */ - optional string hostname = 8 [default = "NSJAIL"]; + optional string hostname = 4 [default = "NSJAIL"]; /* Initial current working directory for the binary */ - optional string cwd = 9 [default = "/"]; + optional string cwd = 5 [default = "/"]; + + /* Defines whether to use switch_root or pivot_root */ + optional bool no_pivotroot = 6 [default = false]; /* TCP port to listen to. Valid with mode=LISTEN only */ - optional uint32 port = 10 [default = 0]; + optional uint32 port = 7 [default = 0]; /* Host to bind to for mode=LISTEN. 
Must be in IPv6 format */ - optional string bindhost = 11 [default = "::"]; + optional string bindhost = 8 [default = "::"]; + /* For mode=LISTEN, maximum number of connections across all IPs */ + optional uint32 max_conns = 9 [default = 0]; /* For mode=LISTEN, maximum number of connections from a single IP */ - optional uint32 max_conns_per_ip = 12 [default = 0]; + optional uint32 max_conns_per_ip = 10 [default = 0]; /* Wall-time time limit for commands */ - optional uint32 time_limit = 13 [default = 600]; + optional uint32 time_limit = 11 [default = 600]; /* Should nsjail go into background? */ - optional bool daemon = 14 [default = false]; + optional bool daemon = 12 [default = false]; /* Maximum number of CPUs to use: 0 - no limit */ - optional uint32 max_cpus = 15 [default = 0]; + optional uint32 max_cpus = 13 [default = 0]; /* FD to log to. */ - optional int32 log_fd = 16; - /* File to save lofs to */ - optional string log_file = 17; + optional int32 log_fd = 14; + /* File to save logs to. */ + optional string log_file = 15; /* Minimum log level displayed. See 'msg LogLevel' description for more */ - optional LogLevel log_level = 18; + optional LogLevel log_level = 16; /* Should the current environment variables be kept when executing the binary */ - optional bool keep_env = 19 [default = false]; - /* EnvVars to be set before executing binaries. If the envvar doesn't contain '=' - (e.g. just the 'DISPLAY' string), the current envvar value will be used */ - repeated string envar = 20; + optional bool keep_env = 17 [default = false]; + /* EnvVars to be set before executing binaries. If the envar doesn't contain '=' + (e.g. just the 'DISPLAY' string), the current envar value will be used */ + repeated string envar = 18; /* Should capabilities be preserved or dropped */ - optional bool keep_caps = 21 [default = false]; + optional bool keep_caps = 19 [default = false]; /* Which capabilities should be preserved if keep_caps == false. 
Format: "CAP_SYS_PTRACE" */ - repeated string cap = 22; + repeated string cap = 20; /* Should nsjail close FD=0,1,2 before executing the process */ - optional bool silent = 23 [default = false]; + optional bool silent = 21 [default = false]; /* Should the child process have control over terminal? Can be useful to allow /bin/sh to provide job control / signals. Dangerous, can be used to put characters into the controlling terminal back */ - optional bool skip_setsid = 24 [default = false]; + optional bool skip_setsid = 22 [default = false]; /* Redirect stderr of the process to /dev/null instead of the socket or original TTY */ - optional bool stderr_to_null = 25 [default = false]; + optional bool stderr_to_null = 23 [default = false]; /* Which FDs should be passed to the newly executed process By default only FD=0,1,2 are passed */ - repeated int32 pass_fd = 26; + repeated int32 pass_fd = 24; /* Setting it to true will allow to have set-uid binaries inside the jail */ - optional bool disable_no_new_privs = 27 [default = false]; + optional bool disable_no_new_privs = 25 [default = false]; /* Various rlimits, the rlimit_as/rlimit_core/... are used only if rlimit_as_type/rlimit_core_type/... 
are set to RLimit::VALUE */ - optional uint64 rlimit_as = 28 [default = 512]; /* In MiB */ - optional RLimit rlimit_as_type = 29 [default = VALUE]; - optional uint64 rlimit_core = 30 [default = 0]; /* In MiB */ - optional RLimit rlimit_core_type = 31 [default = VALUE]; - optional uint64 rlimit_cpu = 32 [default = 600]; /* In seconds */ - optional RLimit rlimit_cpu_type = 33 [default = VALUE]; - optional uint64 rlimit_fsize = 34 [default = 1]; /* In MiB */ - optional RLimit rlimit_fsize_type = 35 [default = VALUE]; - optional uint64 rlimit_nofile = 36 [default = 32]; - optional RLimit rlimit_nofile_type = 37 [default = VALUE]; + optional uint64 rlimit_as = 26 [default = 4096]; /* In MiB */ + optional RLimit rlimit_as_type = 27 [default = VALUE]; + optional uint64 rlimit_core = 28 [default = 0]; /* In MiB */ + optional RLimit rlimit_core_type = 29 [default = VALUE]; + optional uint64 rlimit_cpu = 30 [default = 600]; /* In seconds */ + optional RLimit rlimit_cpu_type = 31 [default = VALUE]; + optional uint64 rlimit_fsize = 32 [default = 1]; /* In MiB */ + optional RLimit rlimit_fsize_type = 33 [default = VALUE]; + optional uint64 rlimit_nofile = 34 [default = 32]; + optional RLimit rlimit_nofile_type = 35 [default = VALUE]; /* RLIMIT_NPROC is system-wide - tricky to use; use the soft limit value by * default here */ - optional uint64 rlimit_nproc = 38 [default = 1024]; - optional RLimit rlimit_nproc_type = 39 [default = SOFT]; + optional uint64 rlimit_nproc = 36 [default = 1024]; + optional RLimit rlimit_nproc_type = 37 [default = SOFT]; /* In MiB, use the soft limit value by default */ - optional uint64 rlimit_stack = 40 [default = 1048576]; - optional RLimit rlimit_stack_type = 41 [default = SOFT]; + optional uint64 rlimit_stack = 38 [default = 8]; + optional RLimit rlimit_stack_type = 39 [default = SOFT]; + /* In KB, use the soft limit value by default */ + optional uint64 rlimit_memlock = 40 [default = 64]; + optional RLimit rlimit_memlock_type = 41 [default = 
SOFT]; + optional uint64 rlimit_rtprio = 42 [default = 0]; + optional RLimit rlimit_rtprio_type = 43 [default = SOFT]; + optional uint64 rlimit_msgqueue = 44 [default = 1024]; /* In bytes */ + optional RLimit rlimit_msgqueue_type = 45 [default = SOFT]; + + /* Disable all rlimits, default to limits set by parent */ + optional bool disable_rl = 46 [default = false]; /* See 'man personality' for more */ - optional bool persona_addr_compat_layout = 42 [default = false]; - optional bool persona_mmap_page_zero = 43 [default = false]; - optional bool persona_read_implies_exec = 44 [default = false]; - optional bool persona_addr_limit_3gb = 45 [default = false]; - optional bool persona_addr_no_randomize = 46 [default = false]; + optional bool persona_addr_compat_layout = 47 [default = false]; + optional bool persona_mmap_page_zero = 48 [default = false]; + optional bool persona_read_implies_exec = 49 [default = false]; + optional bool persona_addr_limit_3gb = 50 [default = false]; + optional bool persona_addr_no_randomize = 51 [default = false]; /* Which name-spaces should be used? 
*/ - optional bool clone_newnet = 47 [default = true]; - optional bool clone_newuser = 48 [default = true]; - optional bool clone_newns = 49 [default = true]; - optional bool clone_newpid = 50 [default = true]; - optional bool clone_newipc = 51 [default = true]; - optional bool clone_newuts = 52 [default = true]; + optional bool clone_newnet = 52 [default = true]; + optional bool clone_newuser = 53 [default = true]; + optional bool clone_newns = 54 [default = true]; + optional bool clone_newpid = 55 [default = true]; + optional bool clone_newipc = 56 [default = true]; + optional bool clone_newuts = 57 [default = true]; /* Disable for kernel versions < 4.6 as it's not supported there */ - optional bool clone_newcgroup = 53 [default = true]; + optional bool clone_newcgroup = 58 [default = true]; + /* Supported with kernel versions >= 5.3 */ + optional bool clone_newtime = 59 [default = false]; /* Mappings for UIDs and GIDs. See the description for 'msg IdMap' for more */ - repeated IdMap uidmap = 54; - repeated IdMap gidmap = 55; + repeated IdMap uidmap = 60; + repeated IdMap gidmap = 61; /* Should /proc be mounted (R/O)? This can also be added in the 'mount' section below */ - optional bool mount_proc = 56 [default = false]; + optional bool mount_proc = 62 [default = false]; /* Mount points inside the jail. 
See the description for 'msg MountPt' for more */ - repeated MountPt mount = 57; + repeated MountPt mount = 63; /* Kafel seccomp-bpf policy file or a string: Homepage of the project: https://github.com/google/kafel */ - optional string seccomp_policy_file = 58; - repeated string seccomp_string = 59; + optional string seccomp_policy_file = 64; + repeated string seccomp_string = 65; /* Setting it to true makes audit write seccomp logs to dmesg */ - optional bool seccomp_log = 60 [default = false]; + optional bool seccomp_log = 66 [default = false]; /* If > 0, maximum cumulative size of RAM used inside any jail */ - optional uint64 cgroup_mem_max = 61 [default = 0]; /* In MiB */ + optional uint64 cgroup_mem_max = 67 [default = 0]; /* In bytes */ /* Mount point for cgroups-memory in your system */ - optional string cgroup_mem_mount = 62 [default = "/sys/fs/cgroup/memory"]; + optional string cgroup_mem_mount = 68 [default = "/sys/fs/cgroup/memory"]; /* Writeable directory (for the nsjail user) under cgroup_mem_mount */ - optional string cgroup_mem_parent = 63 [default = "NSJAIL"]; + optional string cgroup_mem_parent = 69 [default = "NSJAIL"]; /* If > 0, maximum number of PIDs (threads/processes) inside jail */ - optional uint64 cgroup_pids_max = 64 [default = 0]; + optional uint64 cgroup_pids_max = 70 [default = 0]; /* Mount point for cgroups-pids in your system */ - optional string cgroup_pids_mount = 65 [default = "/sys/fs/cgroup/pids"]; + optional string cgroup_pids_mount = 71 [default = "/sys/fs/cgroup/pids"]; /* Writeable directory (for the nsjail user) under cgroup_pids_mount */ - optional string cgroup_pids_parent = 66 [default = "NSJAIL"]; + optional string cgroup_pids_parent = 72 [default = "NSJAIL"]; /* If > 0, Class identifier of network packets inside jail */ - optional uint32 cgroup_net_cls_classid = 67 [default = 0]; + optional uint32 cgroup_net_cls_classid = 73 [default = 0]; /* Mount point for cgroups-net-cls in your system */ - optional string 
cgroup_net_cls_mount = 68 [default = "/sys/fs/cgroup/net_cls"]; + optional string cgroup_net_cls_mount = 74 [default = "/sys/fs/cgroup/net_cls"]; /* Writeable directory (for the nsjail user) under cgroup_net_mount */ - optional string cgroup_net_cls_parent = 69 [default = "NSJAIL"]; + optional string cgroup_net_cls_parent = 75 [default = "NSJAIL"]; /* If > 0, number of milliseconds of CPU time per second that jailed processes can use */ - optional uint32 cgroup_cpu_ms_per_sec = 70 [default = 0]; + optional uint32 cgroup_cpu_ms_per_sec = 76 [default = 0]; /* Mount point for cgroups-cpu in your system */ - optional string cgroup_cpu_mount = 71 [default = "/sys/fs/cgroup/cpu"]; + optional string cgroup_cpu_mount = 77 [default = "/sys/fs/cgroup/cpu"]; /* Writeable directory (for the nsjail user) under cgroup_cpu_mount */ - optional string cgroup_cpu_parent = 72 [default = "NSJAIL"]; + optional string cgroup_cpu_parent = 78 [default = "NSJAIL"]; + + /* Mount point for cgroup v2 in your system */ + optional string cgroupv2_mount = 79 [default = "/sys/fs/cgroup"]; + /* Use cgroup v2 */ + optional bool use_cgroupv2 = 80 [default = false]; /* Should the 'lo' interface be brought up (active) inside this jail? 
*/ - optional bool iface_no_lo = 73 [default = false]; + optional bool iface_no_lo = 81 [default = false]; /* Put this interface inside the jail */ - repeated string iface_own = 74; + repeated string iface_own = 82; /* Parameters for the cloned MACVLAN interface inside jail */ - optional string macvlan_iface = 75; /* Interface to be cloned, eg 'eth0' */ - optional string macvlan_vs_ip = 76 [default = "192.168.0.2"]; - optional string macvlan_vs_nm = 77 [default = "255.255.255.0"]; - optional string macvlan_vs_gw = 78 [default = "192.168.0.1"]; - optional string macvlan_vs_ma = 79 [default = ""]; + optional string macvlan_iface = 83; /* Interface to be cloned, eg 'eth0' */ + optional string macvlan_vs_ip = 84 [default = "192.168.0.2"]; + optional string macvlan_vs_nm = 85 [default = "255.255.255.0"]; + optional string macvlan_vs_gw = 86 [default = "192.168.0.1"]; + optional string macvlan_vs_ma = 87 [default = ""]; + optional string macvlan_vs_mo = 88 [default = "private"]; + + /* Niceness level of the jailed process */ + optional int32 nice_level = 89 [default = 19]; /* Binary path (with arguments) to be executed. If not specified here, it can be specified with cmd-line as "-- /path/to/command arg1 arg2" */ - optional Exe exec_bin = 80; + optional Exe exec_bin = 90; } diff --git a/configs/apache.cfg b/configs/apache.cfg index f3ae838..a1f2ff6 100644 --- a/configs/apache.cfg +++ b/configs/apache.cfg @@ -1,4 +1,7 @@ +# Example config for nsjail + name: "apache-with-cloned-net" + description: "Tested under Ubuntu 17.04. 
Other Linux distros might " description: "use different locations for the Apache's HTTPD configuration " description: "files and system libraries" diff --git a/configs/bash-with-fake-geteuid.cfg b/configs/bash-with-fake-geteuid.cfg index c0046ba..99a36af 100644 --- a/configs/bash-with-fake-geteuid.cfg +++ b/configs/bash-with-fake-geteuid.cfg @@ -1,4 +1,7 @@ +# Example config for nsjail + name: "bash-with-fake-geteuid" + description: "An example/demo policy which allows to execute /bin/bash and other commands in " description: "a fairly restricted jail containing only some directories from the main " description: "system, and with blocked __NR_syslog syscall. Also, __NR_geteuid returns -1337 " diff --git a/configs/demo-dont-use-chrome-with-net.cfg b/configs/demo-dont-use-chrome-with-net.cfg index 690657e..c6c6a5f 100644 --- a/configs/demo-dont-use-chrome-with-net.cfg +++ b/configs/demo-dont-use-chrome-with-net.cfg @@ -1,3 +1,5 @@ +# Example config for nsjail + name: "chrome-with-net" description: "Don't use for anything serious - this is just a demo policy. See notes" diff --git a/configs/firefox-with-cloned-net.cfg b/configs/firefox-with-cloned-net.cfg index eb541e3..180ed9a 100644 --- a/configs/firefox-with-cloned-net.cfg +++ b/configs/firefox-with-cloned-net.cfg @@ -1,3 +1,5 @@ +# Example config for nsjail + name: "firefox-with-cloned-net" description: "This policy allows to run firefox inside a jail on a separate eth interface." 
@@ -30,6 +32,8 @@ time_limit: 0 envar: "HOME=/user" envar: "DISPLAY" envar: "TMP=/tmp" +envar: "FONTCONFIG_FILE=/etc/fonts/fonts.conf" +envar: "FC_CONFIG_FILE=/etc/fonts/fonts.conf" rlimit_as: 4096 rlimit_cpu: 1000 diff --git a/configs/firefox-with-net-wayland.cfg b/configs/firefox-with-net-wayland.cfg new file mode 100644 index 0000000..b132018 --- /dev/null +++ b/configs/firefox-with-net-wayland.cfg @@ -0,0 +1,175 @@ +# Example config for nsjail + +name: "firefox-with-net" + +description: "This policy allows to run firefox inside a jail. Access to networking is" +description: "permitted with this setup (clone_newnet: false)." +description: "" +description: "The only permitted home directories are $HOME/.mozilla and $HOME/Documents." +description: "The rest of the files/dirs available on the FS are libs and X-related files/dirs." +description: "" +description: "Run as:" +description: "" +description: "./nsjail --config configs/firefox-with-net-wayland.cfg" +description: "" +description: "You can then go to https://uploadfiles.io/ and try to upload a file in order" +description: "to see what your local directory (also, all system directories) looks like." 
+ +mode: ONCE +hostname: "FIREFOX" +cwd: "/user" + +time_limit: 0 + +clone_newnet: false + +envar: "HOME=/user" +envar: "TMP=/tmp" +envar: "FONTCONFIG_FILE=/etc/fonts/fonts.conf" +envar: "FC_CONFIG_FILE=/etc/fonts/fonts.conf" +envar: "MOZ_ENABLE_WAYLAND=1" +envar: "XDG_RUNTIME_DIR=/user/run/" +envar: "WAYLAND_DISPLAY=wayland-0" + +rlimit_as: 4096 +rlimit_cpu: 1000 +rlimit_fsize: 1024 +rlimit_nofile: 512 + +uidmap { + inside_id: "9999999" +} + +gidmap { + inside_id: "9999999" +} + +mount { + dst: "/proc" + fstype: "proc" + rw: true +} + +mount { + src: "/lib" + dst: "/lib" + is_bind: true +} + +mount { + src: "/usr/lib" + dst: "/usr/lib" + is_bind: true +} + +mount { + src: "/lib64" + dst: "/lib64" + is_bind: true + mandatory: false +} + +mount { + src: "/lib32" + dst: "/lib32" + is_bind: true + mandatory: false +} + +mount { + src: "/usr/lib/firefox" + dst: "/usr/lib/firefox" + is_bind: true +} + +mount { + src: "/usr/bin/firefox" + dst: "/usr/bin/firefox" + is_bind: true +} + +mount { + src: "/usr/share" + dst: "/usr/share" + is_bind: true +} + +mount { + src_content: "<?xml version=\"1.0\"?>\n<!DOCTYPE fontconfig SYSTEM \"fonts.dtd\">\n<fontconfig><dir>/usr/share/fonts</dir><cachedir>/tmp/fontconfig</cachedir></fontconfig>" + dst: "/etc/fonts/fonts.conf" +} + +mount { + src: "/dev/urandom" + dst: "/dev/urandom" + is_bind: true + rw: true +} + +mount { + src: "/dev/null" + dst: "/dev/null" + is_bind: true + rw: true +} + +mount { + src_content: "nameserver 8.8.8.8" + dst: "/etc/resolv.conf" +} + +mount { + dst: "/tmp" + fstype: "tmpfs" + rw: true + is_bind: false +} + +mount { + dst: "/dev/shm" + fstype: "tmpfs" + rw: true + is_bind: false +} + +mount { + dst: "/user" + fstype: "tmpfs" + rw: true +} + +mount { + prefix_src_env: "HOME" + src: "/Documents" + dst: "/user/Documents" + rw: true + is_bind: true + mandatory: false +} + +mount { + prefix_src_env: "HOME" + src: "/.mozilla" + dst: "/user/.mozilla" + is_bind: true + rw: true + mandatory: false +} + +mount { 
+ src: "/tmp/.X11-unix/X0" + dst: "/tmp/.X11-unix/X0" + is_bind: true +} + +mount { + # Change it to your user id + src: "/run/user/1000/wayland-0" + dst: "/user/run/wayland-0" + is_bind: true + rw: true +} + +exec_bin { + path: "/usr/lib/firefox/firefox" +} diff --git a/configs/firefox-with-net.cfg b/configs/firefox-with-net.cfg index 190f7c2..b88f8ea 100644 --- a/configs/firefox-with-net.cfg +++ b/configs/firefox-with-net.cfg @@ -1,3 +1,5 @@ +# Example config for nsjail + name: "firefox-with-net" description: "This policy allows to run firefox inside a jail. Access to networking is" @@ -24,6 +26,8 @@ clone_newnet: false envar: "HOME=/user" envar: "DISPLAY" envar: "TMP=/tmp" +envar: "FONTCONFIG_FILE=/etc/fonts/fonts.conf" +envar: "FC_CONFIG_FILE=/etc/fonts/fonts.conf" rlimit_as: 4096 rlimit_cpu: 1000 diff --git a/configs/home-documents-with-xorg-no-net.cfg b/configs/home-documents-with-xorg-no-net.cfg index cc2514f..83cfb42 100644 --- a/configs/home-documents-with-xorg-no-net.cfg +++ b/configs/home-documents-with-xorg-no-net.cfg @@ -1,3 +1,5 @@ +# Example config for nsjail + name: "documents-with-xorg" description: "This policy allows to run many X-org based tool, which are allowed" @@ -121,6 +123,20 @@ mount { } mount { + src: "/dev/random" + dst: "/dev/random" + is_bind: true + rw: true +} + +mount { + src: "/dev/urandom" + dst: "/dev/urandom" + is_bind: true + rw: true +} + +mount { src: "/etc/passwd" dst: "/etc/passwd" is_bind: true diff --git a/configs/imagemagick-convert.cfg b/configs/imagemagick-convert.cfg index dfe702d..479b293 100644 --- a/configs/imagemagick-convert.cfg +++ b/configs/imagemagick-convert.cfg @@ -1,3 +1,5 @@ +# Example config for nsjail + name: "imagemagick-convert" description: "This policy allows to run ImageMagick's convert inside a jail." 
@@ -5,8 +7,9 @@ description: "Your $HOME's Documents will be mapped as /user/Documents" description: "" description: "Run as:" description: "" -description: "./nsjail --config imagemagick-convert.cfg -- /usr/bin/convert \\" -description: " jpg:/user/Documents/input.jpg png:/user/Documents/output.png" +description: "./nsjail --config imagemagick-convert.cfg -- /usr/bin/convert jpg:/user/Documents/input.jpg png:/user/Documents/output.png " +description: "or " +description: "./nsjail --config imagemagick-convert.cfg -- /usr/bin/convert jpg:- png:- <file.jpg >file.png mode: ONCE hostname: "IM-CONVERT" @@ -78,11 +81,12 @@ seccomp_string: " getpid, execveat, getdents, unlink, fchmod," seccomp_string: " getrlimit, getrusage, sysinfo, times, futex," seccomp_string: " arch_prctl, sched_getaffinity, set_tid_address," seccomp_string: " clock_gettime, set_robust_list, exit_group," -seccomp_string: " clone, getcwd, pread64, readlink, prlimit64" +seccomp_string: " clone, getcwd, pread64, readlink, prlimit64, madvise" seccomp_string: "}" seccomp_string: "DEFAULT KILL" exec_bin { - path: "/usr/bin/convert" + path: "" + arg0: "/usr/bin/convert" exec_fd: true } diff --git a/configs/static-busybox-with-execveat.cfg b/configs/static-busybox-with-execveat.cfg index 0d0a49e..ddfe01c 100644 --- a/configs/static-busybox-with-execveat.cfg +++ b/configs/static-busybox-with-execveat.cfg @@ -1,3 +1,5 @@ +# Example config for nsjail + name: "static-busybox-with-execveat" description: "An example/demo policy which allows to execute /bin/busybox-static in an " description: "empty (only /proc) mount namespace which doesn't even include busybox itself" diff --git a/configs/tomcat8.cfg b/configs/tomcat8.cfg new file mode 100644 index 0000000..30af214 --- /dev/null +++ b/configs/tomcat8.cfg @@ -0,0 +1,135 @@ +# Example config for nsjail + +name: "tomcat8" + +description: "Tested under Ubuntu 16.04 with tomcat8=8.0.32-1ubuntu1.9," +description: "libnl-route-3-200=3.2.27-1ubuntu0.16.04.1," 
+description: "libprotobuf9v5=2.6.1-1.3," +description: "openjdk-8-jre=8u191-b12-2ubuntu0.16.04.1. " +description: "Run as: sudo ./nsjail --config configs/tomcat.cfg" + +mode: ONCE +hostname: "TOMCAT-NSJ" + +envar: "JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre" +envar: "JVM_TMP=/tmp" +envar: "CATALINA_TMPDIR=/tmp" +envar: "CATALINA_HOME=/usr/share/tomcat8" +envar: "CATALINA_BASE=/var/lib/tomcat8" +envar: "CATALINA_OPTS=-server -XX:+UseParallelGC" +envar: "JAVA_OPTS=-Djava.awt.headless=true -Djava.net.preferIPv4Stack=true -Xms256M -Xmx512M -Djava.security.egd=file:/dev/./urandom" + +rlimit_as: 2048 +rlimit_fsize: 1024 +rlimit_cpu_type: INF +rlimit_nofile: 1024 + +time_limit: 0 + +cap: "CAP_NET_BIND_SERVICE" + +uidmap { + inside_id: "tomcat8" + outside_id: "tomcat8" +} + +gidmap { + inside_id: "tomcat8" + outside_id: "tomcat8" +} + +mount_proc: false + +mount { + src: "/etc/tomcat8" + dst: "/etc/tomcat8" + is_bind: true + rw: false +} + +mount { + src: "/var/lib/tomcat8" + dst: "/var/lib/tomcat8" + is_bind: true + rw: true +} + +mount { + src: "/var/log/tomcat8" + dst: "/var/log/tomcat8" + is_bind: true + rw: true +} + +mount { + src: "/var/cache/tomcat8" + dst: "/var/cache/tomcat8" + is_bind: true + rw: true +} + +mount { + src: "/usr/share/tomcat8" + dst: "/usr/share/tomcat8" + is_bind: true + rw: false +} + +mount { + src: "/bin" + dst: "/bin" + is_bind: true + rw: false +} + +mount { + src: "/lib" + dst: "/lib" + is_bind: true + rw: false +} + +mount { + src: "/lib64" + dst: "/lib64" + is_bind: true + rw: false +} + +mount { + src: "/usr/bin" + dst: "/usr/bin" + is_bind: true + rw: false +} + +mount { + src: "/usr/lib" + dst: "/usr/lib" + is_bind: true + rw: false +} + +mount { + src: "/usr/share/java" + dst: "/usr/share/java" + is_bind: true + rw: false +} + +mount { + dst: "/tmp" + fstype: "tmpfs" + rw: true +} + +mount { + dst: "/proc" + fstype: "proc" + rw: false +} + +exec_bin { + path: "/usr/share/tomcat8/bin/catalina.sh" + arg : "run" +} diff --git 
a/configs/xchat-with-net.cfg b/configs/xchat-with-net.cfg index e8d2759..04c361b 100644 --- a/configs/xchat-with-net.cfg +++ b/configs/xchat-with-net.cfg @@ -1,3 +1,5 @@ +# Example config for nsjail + name: "xchat-with-net" description: "This policy allows to run xchat inside a jail. Access to networking is" @@ -7,11 +9,12 @@ description: "The only permitted home directory is $HOME/.xchat2 and $HOME/Docum description: "The rest of available on the FS files/dires are libs and X-related files/dirs." description: "" description: "Run as:" -description: "./nsjail --config configs/xchat-with-net.cfg --daemon -l /tmp/xchat.log" +description: "./nsjail --config configs/xchat-with-net.cfg" mode: ONCE hostname: "XCHAT" cwd: "/user" +daemon: true time_limit: 0 diff --git a/configs/znc-with-net.cfg b/configs/znc-with-net.cfg new file mode 100644 index 0000000..bdcc53e --- /dev/null +++ b/configs/znc-with-net.cfg @@ -0,0 +1,136 @@ +# Example config for nsjail + +name: "znc-with-net" + +description: "This policy allows to run znc a jail. " +description: "Networking is permitted with this setup (clone_newnet: false). " +description: "" +description: "The only permitted home directory is $HOME/.znc." 
+description: "" +description: "Run as: nsjail --config configs/znc-with-net.cfg" + +mode: ONCE +hostname: "ZNC" +cwd: "/home/znc" +daemon: true + +time_limit: 0 + +envar: "HOME=/home/znc" +envar: "TMP=/tmp" + +log_fd: 2 + +rlimit_as: 4096 +rlimit_cpu_type: INF +rlimit_fsize: 4096 +rlimit_nofile: 128 + +clone_newnet: false + +mount { + dst: "/proc" + fstype: "proc" +} + +mount { + src: "/lib" + dst: "/lib" + is_bind: true +} + +mount { + src: "/usr/lib" + dst: "/usr/lib" + is_bind: true +} + +mount { + src: "/lib64" + dst: "/lib64" + is_bind: true + mandatory: false +} + +mount { + src: "/lib32" + dst: "/lib32" + is_bind: true + mandatory: false +} + +mount { + src: "/usr/share" + dst: "/usr/share" + is_bind: true +} + +mount { + src: "/dev/urandom" + dst: "/dev/urandom" + is_bind: true + rw: true +} + +mount { + src: "/dev/null" + dst: "/dev/null" + is_bind: true + rw: true +} + +mount { + src: "/etc/resolv.conf" + dst: "/etc/resolv.conf" + is_bind: true + mandatory: false +} + +mount { + src: "/etc/ssl" + dst: "/etc/ssl" + is_bind: true +} + +mount { + dst: "/tmp" + fstype: "tmpfs" + rw: true + is_bind: false +} + +mount { + dst: "/dev/shm" + fstype: "tmpfs" + rw: true + is_bind: false +} + +mount { + dst: "/home/znc" + fstype: "tmpfs" + rw: true + is_bind: false +} + +mount { + prefix_src_env: "HOME" + src: "/.znc" + dst: "/home/znc/.znc" + rw: true + is_bind: true + mandatory: true +} + +seccomp_string: "KILL {" +seccomp_string: " ptrace," +seccomp_string: " process_vm_readv," +seccomp_string: " process_vm_writev" +seccomp_string: "}" +seccomp_string: "DEFAULT ALLOW" + +exec_bin { + path: "/usr/bin/znc" + arg: "-f" + exec_fd: true +} @@ -100,9 +100,10 @@ static bool containPrepareEnv(nsjconf_t* nsjconf) { PLOG_E("personality(%lx)", nsjconf->personality); return false; } + LOG_D("setpriority(%d)", nsjconf->nice_level); errno = 0; - if (setpriority(PRIO_PROCESS, 0, 19) == -1 && errno != 0) { - PLOG_W("setpriority(19)"); + if (setpriority(PRIO_PROCESS, 0, 
nsjconf->nice_level) == -1 && errno != 0) { + PLOG_W("setpriority(%d)", nsjconf->nice_level); } if (!nsjconf->skip_setsid) { setsid(); @@ -119,6 +120,10 @@ static bool containCPU(nsjconf_t* nsjconf) { } static bool containSetLimits(nsjconf_t* nsjconf) { + if (nsjconf->disable_rl) { + return true; + } + struct rlimit64 rl; rl.rlim_cur = rl.rlim_max = nsjconf->rl_as; if (setrlimit64(RLIMIT_AS, &rl) == -1) { @@ -155,6 +160,21 @@ static bool containSetLimits(nsjconf_t* nsjconf) { PLOG_E("setrlimit64(0, RLIMIT_STACK, %" PRIu64 ")", nsjconf->rl_stack); return false; } + rl.rlim_cur = rl.rlim_max = nsjconf->rl_mlock; + if (setrlimit64(RLIMIT_MEMLOCK, &rl) == -1) { + PLOG_E("setrlimit64(0, RLIMIT_MEMLOCK, %" PRIu64 ")", nsjconf->rl_mlock); + return false; + } + rl.rlim_cur = rl.rlim_max = nsjconf->rl_rtpr; + if (setrlimit64(RLIMIT_RTPRIO, &rl) == -1) { + PLOG_E("setrlimit64(0, RLIMIT_RTPRIO, %" PRIu64 ")", nsjconf->rl_rtpr); + return false; + } + rl.rlim_cur = rl.rlim_max = nsjconf->rl_msgq; + if (setrlimit64(RLIMIT_MSGQUEUE, &rl) == -1) { + PLOG_E("setrlimit64(0, RLIMIT_MSGQUEUE , %" PRIu64 ")", nsjconf->rl_msgq); + return false; + } return true; } @@ -175,14 +195,14 @@ static bool containMakeFdsCOENaive(nsjconf_t* nsjconf) { continue; } if (containPassFd(nsjconf, fd)) { - LOG_D("FD=%d will be passed to the child process", fd); + LOG_D("fd=%d will be passed to the child process", fd); if (TEMP_FAILURE_RETRY(fcntl(fd, F_SETFD, flags & ~(FD_CLOEXEC))) == -1) { - PLOG_E("Could not set FD_CLOEXEC for FD=%d", fd); + PLOG_E("Could not set FD_CLOEXEC for fd=%d", fd); return false; } } else { if (TEMP_FAILURE_RETRY(fcntl(fd, F_SETFD, flags | FD_CLOEXEC)) == -1) { - PLOG_E("Could not set FD_CLOEXEC for FD=%d", fd); + PLOG_E("Could not set FD_CLOEXEC for fd=%d", fd); return false; } } @@ -228,21 +248,21 @@ static bool containMakeFdsCOEProc(nsjconf_t* nsjconf) { } int flags = TEMP_FAILURE_RETRY(fcntl(fd, F_GETFD, 0)); if (flags == -1) { - PLOG_D("fcntl(fd=%xld, F_GETFD, 0)", fd); + 
PLOG_D("fcntl(fd=%d, F_GETFD, 0)", fd); closedir(dir); return false; } if (containPassFd(nsjconf, fd)) { - LOG_D("FD=%d will be passed to the child process", fd); + LOG_D("fd=%d will be passed to the child process", fd); if (TEMP_FAILURE_RETRY(fcntl(fd, F_SETFD, flags & ~(FD_CLOEXEC))) == -1) { - PLOG_E("Could not clear FD_CLOEXEC for FD=%d", fd); + PLOG_E("Could not clear FD_CLOEXEC for fd=%d", fd); closedir(dir); return false; } } else { - LOG_D("FD=%d will be closed before execve()", fd); + LOG_D("fd=%d will be closed before execve()", fd); if (TEMP_FAILURE_RETRY(fcntl(fd, F_SETFD, flags | FD_CLOEXEC)) == -1) { - PLOG_E("Could not set FD_CLOEXEC for FD=%d", fd); + PLOG_E("Could not set FD_CLOEXEC for fd=%d", fd); closedir(dir); return false; } @@ -265,14 +285,14 @@ static bool containMakeFdsCOE(nsjconf_t* nsjconf) { bool setupFD(nsjconf_t* nsjconf, int fd_in, int fd_out, int fd_err) { if (nsjconf->stderr_to_null) { - LOG_D("Redirecting FD=2 (STDERR_FILENO) to /dev/null"); + LOG_D("Redirecting fd=2 (STDERR_FILENO) to /dev/null"); if ((fd_err = TEMP_FAILURE_RETRY(open("/dev/null", O_RDWR))) == -1) { PLOG_E("open('/dev/null', O_RDWR"); return false; } } if (nsjconf->is_silent) { - LOG_D("Redirecting FD=0/1/2 (STDIN/OUT/ERR_FILENO) to /dev/null"); + LOG_D("Redirecting fd=0-2 (STDIN/OUT/ERR_FILENO) to /dev/null"); if (TEMP_FAILURE_RETRY(fd_in = fd_out = fd_err = open("/dev/null", O_RDWR)) == -1) { PLOG_E("open('/dev/null', O_RDWR)"); return false; @@ -38,8 +38,6 @@ #include "macros.h" #include "util.h" -#include <string.h> - namespace logs { static int _log_fd = STDERR_FILENO; @@ -47,40 +45,51 @@ static bool _log_fd_isatty = true; static enum llevel_t _log_level = INFO; static bool _log_set = false; -__attribute__((constructor)) static void log_init(void) { - _log_fd = fcntl(_log_fd, F_DUPFD_CLOEXEC, 0); +static void setDupLogFdOr(int fd, int orfd) { + int saved_errno = errno; + _log_fd = fcntl(fd, F_DUPFD_CLOEXEC, 0); if (_log_fd == -1) { - _log_fd = STDERR_FILENO; 
+ _log_fd = fcntl(orfd, F_DUPFD_CLOEXEC, 0); } - _log_fd_isatty = isatty(_log_fd); -} - -bool logSet() { - return _log_set; + if (_log_fd == -1) { + _log_fd = orfd; + } + _log_fd_isatty = (isatty(_log_fd) == 1); + errno = saved_errno; } /* * Log to stderr by default. Use a dup()d fd, because in the future we'll associate the * connection socket with fd (0, 1, 2). */ +__attribute__((constructor)) static void log_init(void) { + setDupLogFdOr(STDERR_FILENO, STDERR_FILENO); +} + +bool logSet() { + return _log_set; +} void logLevel(enum llevel_t ll) { _log_level = ll; } -void logFile(const std::string& logfile) { +void logFile(const std::string& log_file, int log_fd) { _log_set = true; + int newlogfd = -1; + if (!log_file.empty()) { + newlogfd = TEMP_FAILURE_RETRY( + open(log_file.c_str(), O_CREAT | O_RDWR | O_APPEND | O_CLOEXEC, 0640)); + if (newlogfd == -1) { + PLOG_W("Couldn't open('%s')", log_file.c_str()); + } + } /* Close previous log_fd */ if (_log_fd > STDERR_FILENO) { close(_log_fd); - _log_fd = STDERR_FILENO; - } - if (TEMP_FAILURE_RETRY(_log_fd = open(logfile.c_str(), - O_CREAT | O_RDWR | O_APPEND | O_CLOEXEC, 0640)) == -1) { - _log_fd = STDERR_FILENO; - PLOG_W("Couldn't open logfile open('%s')", logfile.c_str()); } - _log_fd_isatty = (isatty(_log_fd) == 1); + setDupLogFdOr(newlogfd, log_fd); + close(newlogfd); } void logMsg(enum llevel_t ll, const char* fn, int ln, bool perr, const char* fmt, ...) 
{ @@ -112,7 +121,9 @@ void logMsg(enum llevel_t ll, const char* fn, int ln, bool perr, const char* fmt if (_log_fd_isatty) { msg.append(logLevels[ll].prefix); } - msg.append("[").append(logLevels[ll].descr).append("]"); + if (ll != HELP && ll != HELP_BOLD) { + msg.append("[").append(logLevels[ll].descr).append("]"); + } if (logLevels[ll].print_time) { msg.append("[").append(util::timeToStr(time(NULL))).append("]"); } @@ -59,7 +59,7 @@ void logMsg(enum llevel_t ll, const char* fn, int ln, bool perr, const char* fmt __attribute__((format(printf, 5, 6))); void logStop(int sig); void logLevel(enum llevel_t ll); -void logFile(const std::string& logfile); +void logFile(const std::string& log_file, int log_fd); bool logSet(); } // namespace logs @@ -28,8 +28,7 @@ #define TEMP_FAILURE_RETRY(expression) \ (__extension__({ \ long int __result; \ - do \ - __result = (long int)(expression); \ + do __result = (long int)(expression); \ while (__result == -1L && errno == EINTR); \ __result; \ })) @@ -59,7 +58,7 @@ static void __attribute__ ((unused)) __clang_cleanup_func(void (^*dfunc) (void)) #define _DEFER(a, count) \ auto void _STRMERGE(__defer_f_, count)(void* _defer_arg __attribute__((unused))); \ int _STRMERGE(__defer_var_, count) __attribute__((cleanup(_STRMERGE(__defer_f_, count)))) \ - __attribute__((unused)); \ + __attribute__((unused)); \ void _STRMERGE(__defer_f_, count)(void* _defer_arg __attribute__((unused))) #define defer _DEFER(a, __COUNTER__) #endif @@ -37,7 +37,6 @@ #include <sys/syscall.h> #include <sys/types.h> #include <sys/wait.h> -#include <syscall.h> #include <unistd.h> #include <memory> @@ -179,8 +178,8 @@ static bool mountPt(mount_t* mpt, const char* newroot, const char* tmpdir) { return false; } if (!util::writeToFd(fd, mpt->src_content.data(), mpt->src_content.length())) { - LOG_W("Writting %zu bytes to '%s' failed", mpt->src_content.length(), - srcpath); + LOG_W( + "Writing %zu bytes to '%s' failed", mpt->src_content.length(), srcpath); close(fd); 
return false; } @@ -283,6 +282,13 @@ static bool mkdirAndTest(const std::string& dir) { static std::unique_ptr<std::string> getDir(nsjconf_t* nsjconf, const char* name) { std::unique_ptr<std::string> dir(new std::string); + dir->assign("/run/user/").append(std::to_string(nsjconf->orig_uid)).append("/nsjail"); + if (mkdirAndTest(*dir)) { + dir->append("/").append(name); + if (mkdirAndTest(*dir)) { + return dir; + } + } dir->assign("/run/user/") .append("/nsjail.") .append(std::to_string(nsjconf->orig_uid)) @@ -331,29 +337,26 @@ static std::unique_ptr<std::string> getDir(nsjconf_t* nsjconf, const char* name) return nullptr; } -static bool initNsInternal(nsjconf_t* nsjconf) { +static bool initNoCloneNs(nsjconf_t* nsjconf) { /* * If CLONE_NEWNS is not used, we would be changing the global mount namespace, so simply * use --chroot in this case */ - if (!nsjconf->clone_newns) { - if (nsjconf->chroot.empty()) { - PLOG_E( - "--chroot was not specified, and it's required when not using " - "CLONE_NEWNS"); - return false; - } - if (chroot(nsjconf->chroot.c_str()) == -1) { - PLOG_E("chroot('%s')", nsjconf->chroot.c_str()); - return false; - } - if (chdir("/") == -1) { - PLOG_E("chdir('/')"); - return false; - } + if (nsjconf->chroot.empty()) { return true; } + if (chroot(nsjconf->chroot.c_str()) == -1) { + PLOG_E("chroot('%s')", nsjconf->chroot.c_str()); + return false; + } + if (chdir("/") == -1) { + PLOG_E("chdir('/')"); + return false; + } + return true; +} +static bool initCloneNs(nsjconf_t* nsjconf) { if (chdir("/") == -1) { PLOG_E("chdir('/')"); return false; @@ -395,25 +398,63 @@ static bool initNsInternal(nsjconf_t* nsjconf) { PLOG_E("umount2('%s', MNT_DETACH)", tmpdir->c_str()); return false; } - /* - * This requires some explanation: It's actually possible to pivot_root('/', '/'). After - * this operation has been completed, the old root is mounted over the new root, and it's OK - * to simply umount('/') now, and to have new_root as '/'. 
This allows us not care about - * providing any special directory for old_root, which is sometimes not easy, given that - * e.g. /tmp might not always be present inside new_root - */ - if (syscall(__NR_pivot_root, destdir->c_str(), destdir->c_str()) == -1) { - PLOG_E("pivot_root('%s', '%s')", destdir->c_str(), destdir->c_str()); - return false; - } - if (umount2("/", MNT_DETACH) == -1) { - PLOG_E("umount2('/', MNT_DETACH)"); - return false; - } - if (chdir(nsjconf->cwd.c_str()) == -1) { - PLOG_E("chdir('%s')", nsjconf->cwd.c_str()); - return false; + if (!nsjconf->no_pivotroot) { + /* + * This requires some explanation: It's actually possible to pivot_root('/', '/'). + * After this operation has been completed, the old root is mounted over the new + * root, and it's OK to simply umount('/') now, and to have new_root as '/'. This + * allows us not care about providing any special directory for old_root, which is + * sometimes not easy, given that e.g. /tmp might not always be present inside + * new_root + */ + if (util::syscall(__NR_pivot_root, (uintptr_t)destdir->c_str(), + (uintptr_t)destdir->c_str()) == -1) { + PLOG_E("pivot_root('%s', '%s')", destdir->c_str(), destdir->c_str()); + return false; + } + + if (umount2("/", MNT_DETACH) == -1) { + PLOG_E("umount2('/', MNT_DETACH)"); + return false; + } + } else { + /* + * pivot_root would normally un-mount the old root, however in certain cases this + * operation is forbidden. There are systems (mainly embedded) that keep their root + * file system in RAM, when initially loaded by the kernel (e.g. initramfs), + * and there is no other file system that is mounted on top of it.In such systems, + * there is no option to pivot_root! + * For more information, see + * kernel.org/doc/Documentation/filesystems/ramfs-rootfs-initramfs.txt. switch_root + * alternative: Innstead of un-mounting the old rootfs, it is over mounted by moving + * the new root to it. 
+ */ + + /* NOTE: Using mount move and chroot allows escaping back into the old root when + * proper capabilities are kept in the user namespace. It can be acheived by + * unmounting the new root and using setns to re-enter the mount namespace. + */ + LOG_W( + "Using no_pivotroot is escapable when user posseses relevant capabilities, " + "Use it with care!"); + + if (chdir(destdir->c_str()) == -1) { + PLOG_E("chdir('%s')", destdir->c_str()); + return false; + } + + /* mount moving the new root on top of '/'. This operation is atomic and doesn't + involve un-mounting '/' at any stage */ + if (mount(".", "/", NULL, MS_MOVE, NULL) == -1) { + PLOG_E("mount('/', %s, NULL, MS_MOVE, NULL)", destdir->c_str()); + return false; + } + + if (chroot(".") == -1) { + PLOG_E("chroot('%s')", destdir->c_str()); + return false; + } } for (const auto& p : nsjconf->mountpts) { @@ -425,6 +466,24 @@ static bool initNsInternal(nsjconf_t* nsjconf) { return true; } +static bool initNsInternal(nsjconf_t* nsjconf) { + if (nsjconf->clone_newns) { + if (!initCloneNs(nsjconf)) { + return false; + } + } else { + if (!initNoCloneNs(nsjconf)) { + return false; + } + } + + if (chdir(nsjconf->cwd.c_str()) == -1) { + PLOG_E("chdir('%s')", nsjconf->cwd.c_str()); + return false; + } + return true; +} + /* * With mode MODE_STANDALONE_EXECVE it's required to mount /proc inside a new process, * as the current process is still in the original PID namespace (man pid_namespaces) @@ -434,7 +493,7 @@ bool initNs(nsjconf_t* nsjconf) { return initNsInternal(nsjconf); } - pid_t pid = subproc::cloneProc(CLONE_FS | SIGCHLD); + pid_t pid = subproc::cloneProc(CLONE_FS, SIGCHLD); if (pid == -1) { return false; } @@ -459,7 +518,7 @@ static bool addMountPt(mount_t* mnt, const std::string& src, const std::string& if (!src_env.empty()) { const char* e = getenv(src_env.c_str()); if (e == NULL) { - LOG_W("No such envvar:'%s'", src_env.c_str()); + LOG_W("No such envar:'%s'", src_env.c_str()); return false; } mnt->src = e; @@ 
-469,7 +528,7 @@ static bool addMountPt(mount_t* mnt, const std::string& src, const std::string& if (!dst_env.empty()) { const char* e = getenv(dst_env.c_str()); if (e == NULL) { - LOG_W("No such envvar:'%s'", dst_env.c_str()); + LOG_W("No such envar:'%s'", dst_env.c_str()); return false; } mnt->dst = e; @@ -553,9 +612,9 @@ const std::string describeMountPt(const mount_t& mpt) { .append("'"); if (mpt.is_dir) { - descr.append(" is_dir:true"); + descr.append(" dir:true"); } else { - descr.append(" is_dir:false"); + descr.append(" dir:false"); } if (!mpt.is_mandatory) { descr.append(" mandatory:false"); @@ -23,6 +23,7 @@ #include <arpa/inet.h> #include <errno.h> +#include <fcntl.h> #include <net/if.h> #include <net/route.h> #include <netinet/in.h> @@ -50,7 +51,6 @@ namespace net { #define IFACE_NAME "vs" -#if defined(NSJAIL_NL3_WITH_MACVLAN) #include <netlink/route/link.h> #include <netlink/route/link/macvlan.h> @@ -85,6 +85,12 @@ static bool cloneIface( nl_addr_put(nladdr); } + if ((err = rtnl_link_macvlan_set_mode( + rmv, rtnl_link_macvlan_str2mode(nsjconf->iface_vs_mo.c_str()))) < 0) { + LOG_E("rtnl_link_macvlan_set_mode(mode:'%s') failed: %s", + nsjconf->iface_vs_mo.c_str(), nl_geterror(err)); + } + if ((err = rtnl_link_add(sk, rmv, NLM_F_CREATE)) < 0) { LOG_E("rtnl_link_add(name:'%s' link:'%s'): %s", IFACE_NAME, nsjconf->iface_vs.c_str(), nl_geterror(err)); @@ -116,7 +122,7 @@ static bool moveToNs( int err = rtnl_link_change(sk, orig_link, new_link, RTM_SETLINK); if (err < 0) { - LOG_E("rtnl_link_change(): set NS of interface '%s' to PID=%d: %s", iface.c_str(), + LOG_E("rtnl_link_change(): set NS of interface '%s' to pid=%d: %s", iface.c_str(), (int)pid, nl_geterror(err)); rtnl_link_put(new_link); rtnl_link_put(orig_link); @@ -169,52 +175,6 @@ bool initNsFromParent(nsjconf_t* nsjconf, int pid) { nl_socket_free(sk); return true; } -#else // defined(NSJAIL_NL3_WITH_MACVLAN) - -static bool moveToNs(const std::string& iface, pid_t pid) { - const 
std::vector<std::string> argv{ - "/sbin/ip", "link", "set", iface, "netns", std::to_string(pid)}; - if (subproc::systemExe(argv, environ) != 0) { - LOG_E("Couldn't put interface '%s' into NET ns of the PID=%d", iface.c_str(), - (int)pid); - return false; - } - return true; -} - -bool initNsFromParent(nsjconf_t* nsjconf, int pid) { - if (!nsjconf->clone_newnet) { - return true; - } - for (const auto& iface : nsjconf->ifaces) { - if (!moveToNs(iface, pid)) { - return false; - } - } - if (nsjconf->iface_vs.empty()) { - return true; - } - - LOG_D("Putting iface:'%s' into namespace of PID:%d (with /sbin/ip)", - nsjconf->iface_vs.c_str(), pid); - - std::vector<std::string> argv; - - if (nsjconf->iface_vs_ma != "") { - argv = {"/sbin/ip", "link", "add", "link", nsjconf->iface_vs, "name", IFACE_NAME, - "netns", std::to_string(pid), "address", nsjconf->iface_vs_ma, "type", - "macvlan", "mode", "bridge"}; - } else { - argv = {"/sbin/ip", "link", "add", "link", nsjconf->iface_vs, "name", IFACE_NAME, - "netns", std::to_string(pid), "type", "macvlan", "mode", "bridge"}; - } - if (subproc::systemExe(argv, environ) != 0) { - LOG_E("Couldn't create MACVTAP interface for '%s'", nsjconf->iface_vs.c_str()); - return false; - } - return true; -} -#endif // defined(NSJAIL_NL3_WITH_MACVLAN) static bool isSocket(int fd) { int optval; @@ -228,6 +188,12 @@ static bool isSocket(int fd) { bool limitConns(nsjconf_t* nsjconf, int connsock) { /* 0 means 'unlimited' */ + if (nsjconf->max_conns != 0 && nsjconf->pids.size() >= nsjconf->max_conns) { + LOG_W("Rejecting connection, max_conns limit reached: %u", nsjconf->max_conns); + return false; + } + + /* 0 means 'unlimited' */ if (nsjconf->max_conns_per_ip == 0) { return true; } @@ -237,8 +203,8 @@ bool limitConns(nsjconf_t* nsjconf, int connsock) { unsigned cnt = 0; for (const auto& pid : nsjconf->pids) { - if (memcmp(addr.sin6_addr.s6_addr, pid.remote_addr.sin6_addr.s6_addr, - sizeof(pid.remote_addr.sin6_addr.s6_addr)) == 0) { + if 
(memcmp(addr.sin6_addr.s6_addr, pid.second.remote_addr.sin6_addr.s6_addr, + sizeof(pid.second.remote_addr.sin6_addr.s6_addr)) == 0) { cnt++; } } @@ -252,7 +218,7 @@ bool limitConns(nsjconf_t* nsjconf, int connsock) { } int getRecvSocket(const char* bindhost, int port) { - if (port < 1 || port > 65535) { + if (port < 0 || port > 65535) { LOG_F( "TCP port %d out of bounds (0 <= port <= 65535), specify one with --port " "<port>", @@ -279,6 +245,10 @@ int getRecvSocket(const char* bindhost, int port) { PLOG_E("socket(AF_INET6)"); return -1; } + if (fcntl(sockfd, F_SETFL, O_NONBLOCK)) { + PLOG_E("fcntl(%d, F_SETFL, O_NONBLOCK)", sockfd); + return -1; + } int so = 1; if (setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &so, sizeof(so)) == -1) { PLOG_E("setsockopt(%d, SO_REUSEADDR)", sockfd); @@ -311,7 +281,7 @@ int getRecvSocket(const char* bindhost, int port) { int acceptConn(int listenfd) { struct sockaddr_in6 cli_addr; socklen_t socklen = sizeof(cli_addr); - int connfd = accept(listenfd, (struct sockaddr*)&cli_addr, &socklen); + int connfd = accept4(listenfd, (struct sockaddr*)&cli_addr, &socklen, SOCK_NONBLOCK); if (connfd == -1) { if (errno != EINTR) { PLOG_E("accept(%d)", listenfd); @@ -44,10 +44,10 @@ Directory containing / of the jail (default: none) Mount chroot dir (/) R/W (default: R/O) .TP \fB\-\-user\fR|\fB\-u\fR VALUE -Username/uid of processess inside the jail (default: your current uid). You can also use inside_ns_uid:outside_ns_uid:count convention here. Can be specified multiple times +Username/uid of processes inside the jail (default: your current uid). You can also use inside_ns_uid:outside_ns_uid:count convention here. Can be specified multiple times .TP \fB\-\-group\fR|\fB\-g\fR VALUE -Groupname/gid of processess inside the jail (default: your current gid). You can also use inside_ns_gid:global_ns_gid:count convention here. Can be specified multiple times +Groupname/gid of processes inside the jail (default: your current gid). 
You can also use inside_ns_gid:global_ns_gid:count convention here. Can be specified multiple times .TP \fB\-\-hostname\fR|\fB\-H\fR VALUE UTS name (hostname) of the jail (default: 'NSJAIL') @@ -61,6 +61,9 @@ TCP port to bind to (enables MODE_LISTEN_TCP) (default: 0) \fB\-\-bindhost\fR VALUE IP address to bind the port to (only in [MODE_LISTEN_TCP]), (default: '::') .TP +\fB\-\-max_conns\fR VALUE +Maximum number of connections across all IPs (only in [MODE_LISTEN_TCP]), (default: 0 (unlimited)) +.TP \fB\-\-max_conns_per_ip\fR|\fB\-i\fR VALUE Maximum number of connections per one IP (only in [MODE_LISTEN_TCP]), (default: 0 (unlimited)) .TP @@ -89,10 +92,10 @@ Log warning and more important messages only Log fatal messages only .TP \fB\-\-keep_env\fR|\fB\-e\fR -Pass all environment variables be passed process (default: all envvars are cleared) +Pass all environment variables be passed process (default: all envars are cleared) .TP \fB\-\-env\fR|\fB\-E\fR VALUE -Additional environment variable (can be used multiple times). If the envvar doesn't contain '=' (e.g. just the 'DISPLAY' string), the current envvar value will be used +Additional environment variable (can be used multiple times). If the envar doesn't contain '=' (e.g. 
just the 'DISPLAY' string), the current envar value will be used .TP \fB\-\-keep_caps\fR Don't drop any capabilities @@ -136,6 +139,9 @@ RLIMIT_NPROC, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for th \fB\-\-rlimit_stack\fR VALUE RLIMIT_STACK in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM_INFINITY (default: 'soft') .TP +\fB\-\-disable_rlimits\fR +Disable all rlimits, default to limits set by parent +.TP \fB\-\-persona_addr_compat_layout\fR personality(ADDR_COMPAT_LAYOUT) .TP @@ -247,6 +253,12 @@ Location of cpu cgroup FS (default: '/sys/fs/cgroup/net_cls') \fB\-\-cgroup_cpu_parent\fR VALUE Which pre-existing cpu cgroup to use as a parent (default: 'NSJAIL') .TP +\fB\-\-cgroupv2_mount\fR VALUE +Location of cgroup v2 directory (default: '/sys/fs/cgroup') +.TP +\fB\-\-use_cgroupv2\fR +Use cgroup v2 +.TP \fB\-\-iface_no_lo\fR Don't bring the 'lo' interface up .TP @@ -21,6 +21,8 @@ #include "nsjail.h" +#include <fcntl.h> +#include <poll.h> #include <signal.h> #include <stdbool.h> #include <stdio.h> @@ -31,7 +33,10 @@ #include <termios.h> #include <unistd.h> +#include <algorithm> +#include <cerrno> #include <memory> +#include <vector> #include "cmdline.h" #include "logs.h" @@ -47,10 +52,7 @@ static __thread int sigFatal = 0; static __thread bool showProc = false; static void sigHandler(int sig) { - if (sig == SIGALRM) { - return; - } - if (sig == SIGCHLD) { + if (sig == SIGALRM || sig == SIGCHLD || sig == SIGPIPE) { return; } if (sig == SIGUSR1 || sig == SIGQUIT) { @@ -74,7 +76,7 @@ static bool setSigHandler(int sig) { if (sig == SIGTTIN || sig == SIGTTOU) { sa.sa_handler = SIG_IGN; - }; + } if (sigaction(sig, &sa, NULL) == -1) { PLOG_E("sigaction(%d)", sig); return false; @@ -115,6 +117,104 @@ static bool setTimer(nsjconf_t* nsjconf) { return true; } +static bool pipeTraffic(nsjconf_t* nsjconf, int listenfd) { + std::vector<struct pollfd> fds; + fds.reserve(nsjconf->pipes.size() * 3 + 1); + 
for (const auto& p : nsjconf->pipes) { + fds.push_back({ + .fd = p.sock_fd, + .events = POLLIN | POLLOUT, + .revents = 0, + }); + fds.push_back({ + .fd = p.pipe_in, + .events = POLLOUT, + .revents = 0, + }); + fds.push_back({ + .fd = p.pipe_out, + .events = POLLIN, + .revents = 0, + }); + } + fds.push_back({ + .fd = listenfd, + .events = POLLIN, + .revents = 0, + }); + LOG_D("Waiting for fd activity"); + while (poll(fds.data(), fds.size(), -1) > 0) { + if (sigFatal > 0 || showProc) { + return false; + } + if (fds.back().revents != 0) { + LOG_D("New connection ready"); + return true; + } + bool cleanup = false; + for (size_t i = 0; i < fds.size() - 1; ++i) { + if (fds[i].revents & POLLIN) { + fds[i].events &= ~POLLIN; + } + if (fds[i].revents & POLLOUT) { + fds[i].events &= ~POLLOUT; + } + } + for (size_t i = 0; i < fds.size() - 3; i += 3) { + const size_t pipe_no = i / 3; + int in, out; + const char* direction; + bool closed = false; + std::tuple<int, int, const char*> direction_map[] = { + std::make_tuple(i, i + 1, "in"), + std::make_tuple(i + 2, i, "out"), + }; + for (const auto& entry : direction_map) { + std::tie(in, out, direction) = entry; + bool in_ready = (fds[in].events & POLLIN) == 0 || + (fds[in].revents & POLLIN) == POLLIN; + bool out_ready = (fds[out].events & POLLOUT) == 0 || + (fds[out].revents & POLLOUT) == POLLOUT; + if (in_ready && out_ready) { + LOG_D("#%zu piping data %s", pipe_no, direction); + ssize_t rv = splice(fds[in].fd, nullptr, fds[out].fd, + nullptr, 4096, SPLICE_F_NONBLOCK); + if (rv == -1 && errno != EAGAIN) { + PLOG_E("splice fd pair #%zu {%d, %d}\n", pipe_no, + fds[in].fd, fds[out].fd); + } + if (rv == 0) { + closed = true; + } + fds[in].events |= POLLIN; + fds[out].events |= POLLOUT; + } + if ((fds[in].revents & (POLLERR | POLLHUP)) != 0 || + (fds[out].revents & (POLLERR | POLLHUP)) != 0) { + closed = true; + } + } + if (closed) { + LOG_D("#%zu connection closed", pipe_no); + cleanup = true; + 
close(nsjconf->pipes[pipe_no].sock_fd); + close(nsjconf->pipes[pipe_no].pipe_in); + close(nsjconf->pipes[pipe_no].pipe_out); + if (nsjconf->pipes[pipe_no].pid > 0) { + kill(nsjconf->pipes[pipe_no].pid, SIGKILL); + } + nsjconf->pipes[pipe_no] = {}; + } + } + if (cleanup) { + break; + } + } + nsjconf->pipes.erase(std::remove(nsjconf->pipes.begin(), nsjconf->pipes.end(), pipemap_t{}), + nsjconf->pipes.end()); + return false; +} + static int listenMode(nsjconf_t* nsjconf) { int listenfd = net::getRecvSocket(nsjconf->bindhost.c_str(), nsjconf->port); if (listenfd == -1) { @@ -131,10 +231,35 @@ static int listenMode(nsjconf_t* nsjconf) { showProc = false; subproc::displayProc(nsjconf); } - int connfd = net::acceptConn(listenfd); - if (connfd >= 0) { - subproc::runChild(nsjconf, connfd, connfd, connfd); - close(connfd); + if (pipeTraffic(nsjconf, listenfd)) { + int connfd = net::acceptConn(listenfd); + if (connfd >= 0) { + int in[2]; + int out[2]; + if (pipe(in) != 0 || pipe(out) != 0) { + PLOG_E("pipe"); + continue; + } + + pid_t pid = + subproc::runChild(nsjconf, connfd, in[0], out[1], out[1]); + + close(in[0]); + close(out[1]); + + if (pid <= 0) { + close(in[1]); + close(out[0]); + close(connfd); + } else { + nsjconf->pipes.push_back({ + .sock_fd = connfd, + .pipe_in = in[1], + .pipe_out = out[0], + .pid = pid, + }); + } + } } subproc::reapProc(nsjconf); } @@ -142,7 +267,8 @@ static int listenMode(nsjconf_t* nsjconf) { static int standaloneMode(nsjconf_t* nsjconf) { for (;;) { - if (!subproc::runChild(nsjconf, STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO)) { + if (subproc::runChild(nsjconf, /* netfd= */ -1, STDIN_FILENO, STDOUT_FILENO, + STDERR_FILENO) == -1) { LOG_E("Couldn't launch the child process"); return 0xff; } @@ -188,7 +314,10 @@ void setTC(int fd, const struct termios* trm) { PLOG_W("ioctl(fd=%d, TCSETS) failed", fd); return; } - LOG_D("Restored the previous state of the TTY"); + if (tcflush(fd, TCIFLUSH) == -1) { + PLOG_W("tcflush(fd=%d, TCIFLUSH) failed", 
fd); + return; + } } } // namespace nsjail @@ -200,10 +329,7 @@ int main(int argc, char* argv[]) { if (!nsjconf) { LOG_F("Couldn't parse cmdline options"); } - if (!nsjconf->clone_newuser && geteuid() != 0) { - LOG_W("--disable_clone_newuser might require root() privs"); - } - if (nsjconf->daemonize && (daemon(0, 0) == -1)) { + if (nsjconf->daemonize && (daemon(/* nochdir= */ 1, /* noclose= */ 0) == -1)) { PLOG_F("daemon"); } cmdline::logParams(nsjconf.get()); @@ -226,7 +352,9 @@ int main(int argc, char* argv[]) { sandbox::closePolicy(nsjconf.get()); /* Try to restore the underlying console's params in case some program has changed it */ - nsjail::setTC(STDIN_FILENO, trm.get()); + if (!nsjconf->daemonize) { + nsjail::setTC(STDIN_FILENO, trm.get()); + } LOG_D("Returning with %d", ret); return ret; @@ -32,6 +32,7 @@ #include <time.h> #include <unistd.h> +#include <map> #include <string> #include <vector> @@ -44,10 +45,10 @@ static const int nssigs[] = { SIGTERM, SIGTTIN, SIGTTOU, + SIGPIPE, }; struct pids_t { - pid_t pid; time_t start; std::string remote_txt; struct sockaddr_in6 remote_addr; @@ -81,6 +82,16 @@ enum ns_mode_t { MODE_STANDALONE_RERUN }; +struct pipemap_t { + int sock_fd; + int pipe_in; + int pipe_out; + pid_t pid; + bool operator==(const pipemap_t& o) { + return sock_fd == o.sock_fd && pipe_in == o.pipe_in && pipe_out == o.pipe_out; + } +}; + struct nsjconf_t { std::string exec_file; bool use_execveat; @@ -104,19 +115,26 @@ struct nsjconf_t { uint64_t rl_nofile; uint64_t rl_nproc; uint64_t rl_stack; + uint64_t rl_mlock; + uint64_t rl_rtpr; + uint64_t rl_msgq; + bool disable_rl; unsigned long personality; bool clone_newnet; bool clone_newuser; bool clone_newns; + bool no_pivotroot; bool clone_newpid; bool clone_newipc; bool clone_newuts; bool clone_newcgroup; + bool clone_newtime; enum ns_mode_t mode; bool is_root_rw; bool is_silent; bool stderr_to_null; bool skip_setsid; + unsigned int max_conns; unsigned int max_conns_per_ip; std::string proc_path; 
bool is_proc_rw; @@ -126,6 +144,7 @@ struct nsjconf_t { std::string iface_vs_nm; std::string iface_vs_gw; std::string iface_vs_ma; + std::string iface_vs_mo; std::string cgroup_mem_mount; std::string cgroup_mem_parent; size_t cgroup_mem_max; @@ -138,20 +157,25 @@ struct nsjconf_t { std::string cgroup_cpu_mount; std::string cgroup_cpu_parent; unsigned int cgroup_cpu_ms_per_sec; + std::string cgroupv2_mount; + bool use_cgroupv2; std::string kafel_file_path; std::string kafel_string; struct sock_fprog seccomp_fprog; bool seccomp_log; + int nice_level; long num_cpus; uid_t orig_uid; + uid_t orig_euid; std::vector<mount_t> mountpts; - std::vector<pids_t> pids; + std::map<pid_t, pids_t> pids; std::vector<idmap_t> uids; std::vector<idmap_t> gids; std::vector<std::string> envs; std::vector<int> openfds; std::vector<int> caps; std::vector<std::string> ifaces; + std::vector<pipemap_t> pipes; }; #endif /* _NSJAIL_H */ @@ -48,7 +48,7 @@ bool initNs(nsjconf_t* nsjconf) { * first clone/fork will work, and the rest will fail with ENOMEM (see 'man pid_namespaces' * for details on this behavior) */ - pid_t pid = subproc::cloneProc(CLONE_FS); + pid_t pid = subproc::cloneProc(CLONE_FS, 0); if (pid == -1) { PLOG_E("Couldn't create a dummy init process"); return false; @@ -33,6 +33,7 @@ extern "C" { #include "kafel.h" } #include "logs.h" +#include "util.h" namespace sandbox { @@ -65,9 +66,9 @@ static bool prepareAndCommit(nsjconf_t* nsjconf) { "too old?)"); return false; #else - if (syscall(__NR_seccomp, (uintptr_t)SECCOMP_SET_MODE_FILTER, + if (util::syscall(__NR_seccomp, (uintptr_t)SECCOMP_SET_MODE_FILTER, (uintptr_t)(SECCOMP_FILTER_FLAG_TSYNC | SECCOMP_FILTER_FLAG_LOG), - &nsjconf->seccomp_fprog) == -1) { + (uintptr_t)&nsjconf->seccomp_fprog) == -1) { PLOG_E( "seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC | " "SECCOMP_FILTER_FLAG_LOG) failed"); @@ -45,6 +45,7 @@ #include <vector> #include "cgroup.h" +#include "cgroup2.h" #include "contain.h" #include "logs.h" #include 
"macros.h" @@ -58,6 +59,9 @@ namespace subproc { #if !defined(CLONE_NEWCGROUP) #define CLONE_NEWCGROUP 0x02000000 #endif /* !defined(CLONE_NEWCGROUP) */ +#if !defined(CLONE_NEWTIME) +#define CLONE_NEWTIME 0x00000080 +#endif /* !defined(CLONE_NEWTIME) */ static const std::string cloneFlagsToStr(uintptr_t flags) { std::string res; @@ -66,43 +70,50 @@ static const std::string cloneFlagsToStr(uintptr_t flags) { const uintptr_t flag; const char* const name; } static const cloneFlags[] = { - NS_VALSTR_STRUCT(CLONE_VM), - NS_VALSTR_STRUCT(CLONE_FS), - NS_VALSTR_STRUCT(CLONE_FILES), - NS_VALSTR_STRUCT(CLONE_SIGHAND), - NS_VALSTR_STRUCT(CLONE_PTRACE), - NS_VALSTR_STRUCT(CLONE_VFORK), - NS_VALSTR_STRUCT(CLONE_PARENT), - NS_VALSTR_STRUCT(CLONE_THREAD), - NS_VALSTR_STRUCT(CLONE_NEWNS), - NS_VALSTR_STRUCT(CLONE_SYSVSEM), - NS_VALSTR_STRUCT(CLONE_SETTLS), - NS_VALSTR_STRUCT(CLONE_PARENT_SETTID), - NS_VALSTR_STRUCT(CLONE_CHILD_CLEARTID), - NS_VALSTR_STRUCT(CLONE_DETACHED), - NS_VALSTR_STRUCT(CLONE_UNTRACED), - NS_VALSTR_STRUCT(CLONE_CHILD_SETTID), - NS_VALSTR_STRUCT(CLONE_NEWCGROUP), - NS_VALSTR_STRUCT(CLONE_NEWUTS), - NS_VALSTR_STRUCT(CLONE_NEWIPC), - NS_VALSTR_STRUCT(CLONE_NEWUSER), - NS_VALSTR_STRUCT(CLONE_NEWPID), - NS_VALSTR_STRUCT(CLONE_NEWNET), - NS_VALSTR_STRUCT(CLONE_IO), + NS_VALSTR_STRUCT(CLONE_NEWTIME), + NS_VALSTR_STRUCT(CLONE_VM), + NS_VALSTR_STRUCT(CLONE_FS), + NS_VALSTR_STRUCT(CLONE_FILES), + NS_VALSTR_STRUCT(CLONE_SIGHAND), +#if !defined(CLONE_PIDFD) +#define CLONE_PIDFD 0x00001000 +#endif + NS_VALSTR_STRUCT(CLONE_PIDFD), + NS_VALSTR_STRUCT(CLONE_PTRACE), + NS_VALSTR_STRUCT(CLONE_VFORK), + NS_VALSTR_STRUCT(CLONE_PARENT), + NS_VALSTR_STRUCT(CLONE_THREAD), + NS_VALSTR_STRUCT(CLONE_NEWNS), + NS_VALSTR_STRUCT(CLONE_SYSVSEM), + NS_VALSTR_STRUCT(CLONE_SETTLS), + NS_VALSTR_STRUCT(CLONE_PARENT_SETTID), + NS_VALSTR_STRUCT(CLONE_CHILD_CLEARTID), + NS_VALSTR_STRUCT(CLONE_DETACHED), + NS_VALSTR_STRUCT(CLONE_UNTRACED), + NS_VALSTR_STRUCT(CLONE_CHILD_SETTID), + 
NS_VALSTR_STRUCT(CLONE_NEWCGROUP), + NS_VALSTR_STRUCT(CLONE_NEWUTS), + NS_VALSTR_STRUCT(CLONE_NEWIPC), + NS_VALSTR_STRUCT(CLONE_NEWUSER), + NS_VALSTR_STRUCT(CLONE_NEWPID), + NS_VALSTR_STRUCT(CLONE_NEWNET), + NS_VALSTR_STRUCT(CLONE_IO), }; - uintptr_t knownFlagMask = CSIGNAL; + uintptr_t knownFlagMask = 0; for (const auto& i : cloneFlags) { if (flags & i.flag) { - res.append(i.name).append("|"); + if (!res.empty()) { + res.append("|"); + } + res.append(i.name); } knownFlagMask |= i.flag; } if (flags & ~(knownFlagMask)) { - util::StrAppend(&res, "%#tx|", flags & ~(knownFlagMask)); + util::StrAppend(&res, "|%#tx", flags & ~(knownFlagMask)); } - res.append(util::sigName(flags & CSIGNAL).c_str()); return res; } @@ -128,7 +139,8 @@ static bool resetEnv(void) { static const char kSubprocDoneChar = 'D'; static const char kSubprocErrorChar = 'E'; -static void subprocNewProc(nsjconf_t* nsjconf, int fd_in, int fd_out, int fd_err, int pipefd) { +static void subprocNewProc( + nsjconf_t* nsjconf, int netfd, int fd_in, int fd_out, int fd_err, int pipefd) { if (!contain::setupFD(nsjconf, fd_in, fd_out, fd_err)) { return; } @@ -141,7 +153,12 @@ static void subprocNewProc(nsjconf_t* nsjconf, int fd_in, int fd_out, int fd_err LOG_E("Couldn't initialize net user namespace"); return; } - if (!cgroup::initNsFromParent(nsjconf, getpid())) { + if (nsjconf->use_cgroupv2) { + if (!cgroup2::initNsFromParent(nsjconf, getpid())) { + LOG_E("Couldn't initialize net user namespace"); + return; + } + } else if (!cgroup::initNsFromParent(nsjconf, getpid())) { LOG_E("Couldn't initialize net user namespace"); return; } @@ -164,7 +181,7 @@ static void subprocNewProc(nsjconf_t* nsjconf, int fd_in, int fd_out, int fd_err putenv(const_cast<char*>(env.c_str())); } - auto connstr = net::connToText(fd_in, /* remote= */ true, NULL); + auto connstr = net::connToText(netfd, /* remote= */ true, NULL); LOG_I("Executing '%s' for '%s'", nsjconf->exec_file.c_str(), connstr.c_str()); std::vector<const char*> argv; 
@@ -181,8 +198,8 @@ static void subprocNewProc(nsjconf_t* nsjconf, int fd_in, int fd_out, int fd_err if (nsjconf->use_execveat) { #if defined(__NR_execveat) - syscall(__NR_execveat, (uintptr_t)nsjconf->exec_fd, "", (char* const*)argv.data(), - environ, (uintptr_t)AT_EMPTY_PATH); + util::syscall(__NR_execveat, nsjconf->exec_fd, (uintptr_t) "", + (uintptr_t)argv.data(), (uintptr_t)environ, AT_EMPTY_PATH); #else /* defined(__NR_execveat) */ LOG_E("Your system doesn't support execveat() syscall"); return; @@ -197,7 +214,6 @@ static void subprocNewProc(nsjconf_t* nsjconf, int fd_in, int fd_out, int fd_err static void addProc(nsjconf_t* nsjconf, pid_t pid, int sock) { pids_t p; - p.pid = pid; p.start = time(NULL); p.remote_txt = net::connToText(sock, /* remote= */ true, &p.remote_addr); @@ -205,24 +221,27 @@ static void addProc(nsjconf_t* nsjconf, pid_t pid, int sock) { snprintf(fname, sizeof(fname), "/proc/%d/syscall", (int)pid); p.pid_syscall_fd = TEMP_FAILURE_RETRY(open(fname, O_RDONLY | O_CLOEXEC)); - nsjconf->pids.push_back(p); + if (nsjconf->pids.find(pid) != nsjconf->pids.end()) { + LOG_F("pid=%d already exists", pid); + } + nsjconf->pids.insert(std::make_pair(pid, p)); - LOG_D("Added pid '%d' with start time '%u' to the queue for IP: '%s'", p.pid, + LOG_D("Added pid=%d with start time '%u' to the queue for IP: '%s'", pid, (unsigned int)p.start, p.remote_txt.c_str()); } static void removeProc(nsjconf_t* nsjconf, pid_t pid) { - for (auto p = nsjconf->pids.begin(); p != nsjconf->pids.end(); ++p) { - if (p->pid == pid) { - LOG_D("Removing pid '%d' from the queue (IP:'%s', start time:'%s')", p->pid, - p->remote_txt.c_str(), util::timeToStr(p->start).c_str()); - close(p->pid_syscall_fd); - nsjconf->pids.erase(p); - - return; - } + if (nsjconf->pids.find(pid) == nsjconf->pids.end()) { + LOG_W("pid=%d doesn't exist ?", pid); + return; } - LOG_W("PID: %d not found (?)", pid); + + const auto& p = nsjconf->pids[pid]; + LOG_D("Removed pid=%d from the queue (IP:'%s', start 
time:'%s')", pid, p.remote_txt.c_str(), + util::timeToStr(p.start).c_str()); + + close(p.pid_syscall_fd); + nsjconf->pids.erase(pid); } int countProc(nsjconf_t* nsjconf) { @@ -233,37 +252,35 @@ void displayProc(nsjconf_t* nsjconf) { LOG_I("Total number of spawned namespaces: %d", countProc(nsjconf)); time_t now = time(NULL); for (const auto& pid : nsjconf->pids) { - time_t diff = now - pid.start; + time_t diff = now - pid.second.start; uint64_t left = nsjconf->tlimit ? nsjconf->tlimit - (uint64_t)diff : 0; - LOG_I("PID: %d, Remote host: %s, Run time: %ld sec. (time left: %" PRId64 " sec.)", - pid.pid, pid.remote_txt.c_str(), (long)diff, left); + LOG_I("pid=%d, Remote host: %s, Run time: %ld sec. (time left: %s s.)", pid.first, + pid.second.remote_txt.c_str(), (long)diff, + nsjconf->tlimit ? std::to_string(left).c_str() : "unlimited"); } } -static const pids_t* getPidElem(nsjconf_t* nsjconf, pid_t pid) { - for (const auto& p : nsjconf->pids) { - if (p.pid == pid) { - return &p; - } - } - return NULL; -} - static void seccompViolation(nsjconf_t* nsjconf, siginfo_t* si) { - LOG_W("PID: %d commited a syscall/seccomp violation and exited with SIGSYS", si->si_pid); + LOG_W("pid=%d committed a syscall/seccomp violation and exited with SIGSYS", si->si_pid); - const pids_t* p = getPidElem(nsjconf, si->si_pid); - if (p == NULL) { - LOG_W("PID:%d SiSyscall: %d, SiCode: %d, SiErrno: %d, SiSigno: %d", (int)si->si_pid, - si->si_syscall, si->si_code, si->si_errno, si->si_signo); - LOG_E("Couldn't find pid element in the subproc list for PID: %d", (int)si->si_pid); + const auto& p = nsjconf->pids.find(si->si_pid); + if (p == nsjconf->pids.end()) { + LOG_W( + "pid=%d SiSyscall: %d, SiCode: %d, SiErrno: %d, SiSigno: %d. (If " + "SiSyscall==31, then it's most likely the SIGSYS value. 
See 'dmesg' or " + "'journalctl -ek' for possible auditd report with more data)", + (int)si->si_pid, si->si_syscall, si->si_code, si->si_errno, si->si_signo); + LOG_E("Couldn't find pid element in the subproc list for pid=%d", (int)si->si_pid); return; } char buf[4096]; - ssize_t rdsize = util::readFromFd(p->pid_syscall_fd, buf, sizeof(buf) - 1); + ssize_t rdsize = util::readFromFd(p->second.pid_syscall_fd, buf, sizeof(buf) - 1); if (rdsize < 1) { - LOG_W("PID: %d, SiSyscall: %d, SiCode: %d, SiErrno: %d, SiSigno: %d", + LOG_W( + "pid=%d, SiSyscall: %d, SiCode: %d, SiErrno: %d, SiSigno: %d. (If " + "SiSyscall==31, then it's most likely the SIGSYS value. See 'dmesg' or " + "'journalctl -ek' for possible auditd report with more data)", (int)si->si_pid, si->si_syscall, si->si_code, si->si_errno, si->si_signo); return; } @@ -275,18 +292,22 @@ static void seccompViolation(nsjconf_t* nsjconf, siginfo_t* si) { &arg4, &arg5, &arg6, &sp, &pc); if (ret == 9) { LOG_W( - "PID: %d, Syscall number: %td, Arguments: %#tx, %#tx, %#tx, %#tx, %#tx, %#tx, " + "pid=%d, Syscall number: %td, Arguments: %#tx, %#tx, %#tx, %#tx, %#tx, %#tx, " "SP: %#tx, PC: %#tx, si_syscall: %d, si_errno: %#x", (int)si->si_pid, sc, arg1, arg2, arg3, arg4, arg5, arg6, sp, pc, si->si_syscall, si->si_errno); } else if (ret == 3) { LOG_W( - "PID: %d, SiSyscall: %d, SiCode: %d, SiErrno: %d, SiSigno: %d, SP: %#tx, PC: " - "%#tx", + "pid=%d, SiSyscall: %d, SiCode: %d, SiErrno: %d, SiSigno: %d, SP: %#tx, PC: " + "%#tx (If SiSyscall==31, then it's most likely the SIGSYS value. See 'dmesg' " + "or 'journalctl -ek' for possible auditd report with more data)", (int)si->si_pid, si->si_syscall, si->si_code, si->si_errno, si->si_signo, arg1, arg2); } else { - LOG_W("PID: %d, SiSyscall: %d, SiCode: %d, SiErrno: %d, Syscall string '%s'", + LOG_W( + "pid=%d, SiSyscall: %d, SiCode: %d, SiErrno: %d, Syscall string '%s'. (If " + "SiSyscall==31, then it's most likely the SIGSYS value. 
See 'dmesg' or " + "'journalctl -ek' for possible auditd report with more data)", (int)si->si_pid, si->si_syscall, si->si_code, si->si_errno, buf); } } @@ -295,22 +316,26 @@ static int reapProc(nsjconf_t* nsjconf, pid_t pid, bool should_wait = false) { int status; if (wait4(pid, &status, should_wait ? 0 : WNOHANG, NULL) == pid) { - cgroup::finishFromParent(nsjconf, pid); + if (nsjconf->use_cgroupv2) { + cgroup2::finishFromParent(nsjconf, pid); + } else { + cgroup::finishFromParent(nsjconf, pid); + } std::string remote_txt = "[UNKNOWN]"; - const pids_t* elem = getPidElem(nsjconf, pid); - if (elem) { - remote_txt = elem->remote_txt; + const auto& p = nsjconf->pids.find(pid); + if (p != nsjconf->pids.end()) { + remote_txt = p->second.remote_txt; } if (WIFEXITED(status)) { - LOG_I("PID: %d (%s) exited with status: %d, (PIDs left: %d)", pid, + LOG_I("pid=%d (%s) exited with status: %d, (PIDs left: %d)", pid, remote_txt.c_str(), WEXITSTATUS(status), countProc(nsjconf) - 1); removeProc(nsjconf, pid); return WEXITSTATUS(status); } if (WIFSIGNALED(status)) { - LOG_I("PID: %d (%s) terminated with signal: %s (%d), (PIDs left: %d)", pid, + LOG_I("pid=%d (%s) terminated with signal: %s (%d), (PIDs left: %d)", pid, remote_txt.c_str(), util::sigName(WTERMSIG(status)).c_str(), WTERMSIG(status), countProc(nsjconf) - 1); removeProc(nsjconf, pid); @@ -343,20 +368,19 @@ int reapProc(nsjconf_t* nsjconf) { if (nsjconf->tlimit == 0) { continue; } - pid_t pid = p.pid; - time_t diff = now - p.start; + pid_t pid = p.first; + time_t diff = now - p.second.start; if ((uint64_t)diff >= nsjconf->tlimit) { - LOG_I("PID: %d run time >= time limit (%ld >= %" PRIu64 - ") (%s). Killing it", - pid, (long)diff, nsjconf->tlimit, p.remote_txt.c_str()); + LOG_I("pid=%d run time >= time limit (%ld >= %" PRIu64 ") (%s). 
Killing it", + pid, (long)diff, nsjconf->tlimit, p.second.remote_txt.c_str()); /* * Probably a kernel bug - some processes cannot be killed with KILL if * they're namespaced, and in a stopped state */ kill(pid, SIGCONT); - LOG_D("Sent SIGCONT to PID: %d", pid); + LOG_D("Sent SIGCONT to pid=%d", pid); kill(pid, SIGKILL); - LOG_D("Sent SIGKILL to PID: %d", pid); + LOG_D("Sent SIGKILL to pid=%d", pid); } } return rv; @@ -364,7 +388,7 @@ int reapProc(nsjconf_t* nsjconf) { void killAndReapAll(nsjconf_t* nsjconf) { while (!nsjconf->pids.empty()) { - pid_t pid = nsjconf->pids.front().pid; + pid_t pid = nsjconf->pids.begin()->first; if (kill(pid, SIGKILL) == 0) { reapProc(nsjconf, pid, true); } else { @@ -375,15 +399,22 @@ void killAndReapAll(nsjconf_t* nsjconf) { static bool initParent(nsjconf_t* nsjconf, pid_t pid, int pipefd) { if (!net::initNsFromParent(nsjconf, pid)) { - LOG_E("Couldn't initialize net namespace for pid '%d'", pid); + LOG_E("Couldn't initialize net namespace for pid=%d", pid); return false; } - if (!cgroup::initNsFromParent(nsjconf, pid)) { - LOG_E("Couldn't initialize cgroup user namespace for pid '%d'", pid); + + if (nsjconf->use_cgroupv2) { + if (!cgroup2::initNsFromParent(nsjconf, pid)) { + LOG_E("Couldn't initialize cgroup 2 user namespace for pid=%d", pid); + exit(0xff); + } + } else if (!cgroup::initNsFromParent(nsjconf, pid)) { + LOG_E("Couldn't initialize cgroup user namespace for pid=%d", pid); exit(0xff); } + if (!user::initNsFromParent(nsjconf, pid)) { - LOG_E("Couldn't initialize user namespace for pid %d", pid); + LOG_E("Couldn't initialize user namespace for pid=%d", pid); return false; } if (!util::writeToFd(pipefd, &kSubprocDoneChar, sizeof(kSubprocDoneChar))) { @@ -393,9 +424,9 @@ static bool initParent(nsjconf_t* nsjconf, pid_t pid, int pipefd) { return true; } -bool runChild(nsjconf_t* nsjconf, int fd_in, int fd_out, int fd_err) { - if (!net::limitConns(nsjconf, fd_in)) { - return true; +pid_t runChild(nsjconf_t* nsjconf, int netfd, 
int fd_in, int fd_out, int fd_err) { + if (!net::limitConns(nsjconf, netfd)) { + return 0; } unsigned long flags = 0UL; flags |= (nsjconf->clone_newnet ? CLONE_NEWNET : 0); @@ -405,54 +436,48 @@ bool runChild(nsjconf_t* nsjconf, int fd_in, int fd_out, int fd_err) { flags |= (nsjconf->clone_newipc ? CLONE_NEWIPC : 0); flags |= (nsjconf->clone_newuts ? CLONE_NEWUTS : 0); flags |= (nsjconf->clone_newcgroup ? CLONE_NEWCGROUP : 0); + flags |= (nsjconf->clone_newtime ? CLONE_NEWTIME : 0); if (nsjconf->mode == MODE_STANDALONE_EXECVE) { + LOG_D("unshare(flags: %s)", cloneFlagsToStr(flags).c_str()); if (unshare(flags) == -1) { PLOG_F("unshare(%s)", cloneFlagsToStr(flags).c_str()); } - subprocNewProc(nsjconf, fd_in, fd_out, fd_err, -1); + subprocNewProc(nsjconf, netfd, fd_in, fd_out, fd_err, -1); LOG_F("Launching new process failed"); } - flags |= SIGCHLD; - LOG_D("Creating new process with clone flags:%s", cloneFlagsToStr(flags).c_str()); + LOG_D("Creating new process with clone flags:%s and exit_signal:SIGCHLD", + cloneFlagsToStr(flags).c_str()); int sv[2]; if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, sv) == -1) { PLOG_E("socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC) failed"); - return false; + return -1; } int child_fd = sv[0]; int parent_fd = sv[1]; - pid_t pid = cloneProc(flags); + pid_t pid = cloneProc(flags, SIGCHLD); if (pid == 0) { close(parent_fd); - subprocNewProc(nsjconf, fd_in, fd_out, fd_err, child_fd); + subprocNewProc(nsjconf, netfd, fd_in, fd_out, fd_err, child_fd); util::writeToFd(child_fd, &kSubprocErrorChar, sizeof(kSubprocErrorChar)); LOG_F("Launching child process failed"); } close(child_fd); if (pid == -1) { - if (flags & CLONE_NEWCGROUP) { - PLOG_E( - "nsjail tried to use the CLONE_NEWCGROUP clone flag, which is " - "supported under kernel versions >= 4.6 only. Try disabling this flag"); - } - PLOG_E( - "clone(flags=%s) failed. You probably need root privileges if your system " - "doesn't support CLONE_NEWUSER. 
Alternatively, you might want to recompile " - "your kernel with support for namespaces or check the current value of the " - "kernel.unprivileged_userns_clone sysctl", - cloneFlagsToStr(flags).c_str()); + auto saved_errno = errno; + PLOG_W("clone(flags=%s) failed", cloneFlagsToStr(flags).c_str()); close(parent_fd); - return false; + errno = saved_errno; + return pid; } - addProc(nsjconf, pid, fd_in); + addProc(nsjconf, pid, netfd); if (!initParent(nsjconf, pid, parent_fd)) { close(parent_fd); - return false; + return -1; } char rcvChar; @@ -460,11 +485,11 @@ bool runChild(nsjconf_t* nsjconf, int fd_in, int fd_out, int fd_err) { rcvChar == kSubprocErrorChar) { LOG_W("Received error message from the child process before it has been executed"); close(parent_fd); - return false; + return -1; } close(parent_fd); - return true; + return pid; } /* @@ -485,9 +510,45 @@ static int cloneFunc(void* arg __attribute__((unused))) { * update the internal PID/TID caches, what can lead to invalid values being returned by getpid() * or incorrect PID/TIDs used in raise()/abort() functions */ -pid_t cloneProc(uintptr_t flags) { +pid_t cloneProc(uintptr_t flags, int exit_signal) { + exit_signal &= CSIGNAL; + if (flags & CLONE_VM) { LOG_E("Cannot use clone(flags & CLONE_VM)"); + errno = 0; + return -1; + } + + if (flags & CLONE_NEWTIME) { + LOG_W( + "CLONE_NEWTIME reuqested, but it's only supported with the unshare() mode " + "(-Me)"); + } + +#if defined(__NR_clone3) + struct clone_args ca = { + .flags = (uint64_t)flags, + .pidfd = 0, + .child_tid = 0, + .parent_tid = 0, + .exit_signal = (uint64_t)exit_signal, + .stack = 0, + .stack_size = 0, + .tls = 0, + .set_tid = 0, + .set_tid_size = 0, + .cgroup = 0, + }; + + pid_t ret = util::syscall(__NR_clone3, (uintptr_t)&ca, sizeof(ca)); + if (ret != -1 || errno != ENOSYS) { + return ret; + } +#endif /* defined(__NR_clone3) */ + + if (flags & CLONE_NEWTIME) { + LOG_E("CLONE_NEWTIME was requested but clone3() is not supported"); + errno = 0; 
return -1; } @@ -500,7 +561,7 @@ pid_t cloneProc(uintptr_t flags) { */ void* stack = &cloneStack[sizeof(cloneStack) / 2]; /* Parent */ - return clone(cloneFunc, stack, flags, NULL, NULL, NULL); + return clone(cloneFunc, stack, flags | exit_signal, NULL, NULL, NULL); } /* Child */ return 0; @@ -557,7 +618,7 @@ int systemExe(const std::vector<std::string>& args, char** env) { } if (WIFEXITED(status)) { int exit_code = WEXITSTATUS(status); - LOG_D("PID %d exited with exit code: %d", pid, exit_code); + LOG_D("pid=%d exited with exit code: %d", pid, exit_code); if (exec_failed) { return -1; } else if (exit_code == 0) { @@ -568,7 +629,7 @@ int systemExe(const std::vector<std::string>& args, char** env) { } if (WIFSIGNALED(status)) { int exit_signal = WTERMSIG(status); - LOG_W("PID %d killed by signal: %d (%s)", pid, exit_signal, + LOG_W("pid=%d killed by signal: %d (%s)", pid, exit_signal, util::sigName(exit_signal).c_str()); return 2; } @@ -33,14 +33,15 @@ namespace subproc { -bool runChild(nsjconf_t* nsjconf, int fd_in, int fd_out, int fd_err); +/* 0 - network connection limit reached, -1 - error */ +pid_t runChild(nsjconf_t* nsjconf, int listen_fd, int fd_in, int fd_out, int fd_err); int countProc(nsjconf_t* nsjconf); void displayProc(nsjconf_t* nsjconf); void killAndReapAll(nsjconf_t* nsjconf); /* Returns the exit code of the first failing subprocess, or 0 if none fail */ int reapProc(nsjconf_t* nsjconf); int systemExe(const std::vector<std::string>& args, char** env); -pid_t cloneProc(uintptr_t flags); +pid_t cloneProc(uintptr_t flags, int exit_signal); } // namespace subproc @@ -43,17 +43,33 @@ #include "subproc.h" #include "util.h" +#define STR_(x) #x +#define STR(x) STR_(x) + +constexpr char kNewUidPath[] = +#ifdef NEWUIDMAP_PATH + STR(NEWUIDMAP_PATH); +#else + "/usr/bin/newuidmap"; +#endif +constexpr char kNewGidPath[] = +#ifdef NEWGIDMAP_PATH + STR(NEWGIDMAP_PATH); +#else + "/usr/bin/newgidmap"; +#endif + namespace user { static bool setResGid(gid_t gid) { 
LOG_D("setresgid(%d)", gid); #if defined(__NR_setresgid32) - if (syscall(__NR_setresgid32, (uintptr_t)gid, (uintptr_t)gid, (uintptr_t)gid) == -1) { + if (util::syscall(__NR_setresgid32, (uintptr_t)gid, (uintptr_t)gid, (uintptr_t)gid) == -1) { PLOG_W("setresgid32(%d)", (int)gid); return false; } #else /* defined(__NR_setresgid32) */ - if (syscall(__NR_setresgid, (uintptr_t)gid, (uintptr_t)gid, (uintptr_t)gid) == -1) { + if (util::syscall(__NR_setresgid, (uintptr_t)gid, (uintptr_t)gid, (uintptr_t)gid) == -1) { PLOG_W("setresgid(%d)", gid); return false; } @@ -64,12 +80,12 @@ static bool setResGid(gid_t gid) { static bool setResUid(uid_t uid) { LOG_D("setresuid(%d)", uid); #if defined(__NR_setresuid32) - if (syscall(__NR_setresuid32, (uintptr_t)uid, (uintptr_t)uid, (uintptr_t)uid) == -1) { + if (util::syscall(__NR_setresuid32, (uintptr_t)uid, (uintptr_t)uid, (uintptr_t)uid) == -1) { PLOG_W("setresuid32(%d)", (int)uid); return false; } #else /* defined(__NR_setresuid32) */ - if (syscall(__NR_setresuid, (uintptr_t)uid, (uintptr_t)uid, (uintptr_t)uid) == -1) { + if (util::syscall(__NR_setresuid, (uintptr_t)uid, (uintptr_t)uid, (uintptr_t)uid) == -1) { PLOG_W("setresuid(%d)", uid); return false; } @@ -77,18 +93,27 @@ static bool setResUid(uid_t uid) { return true; } -static bool setGroups(pid_t pid) { +static bool hasGidMapSelf(nsjconf_t* nsjconf) { + for (const auto& gid : nsjconf->gids) { + if (!gid.is_newidmap) { + return true; + } + } + return false; +} + +static bool setGroupsDeny(nsjconf_t* nsjconf, pid_t pid) { /* * No need to write 'deny' to /proc/pid/setgroups if our euid==0, as writing to * uid_map/gid_map will succeed anyway */ - if (geteuid() == 0) { + if (!nsjconf->clone_newuser || nsjconf->orig_euid == 0 || !hasGidMapSelf(nsjconf)) { return true; } char fname[PATH_MAX]; snprintf(fname, sizeof(fname), "/proc/%d/setgroups", pid); - const char* denystr = "deny"; + const char* const denystr = "deny"; if (!util::writeBufToFile(fname, denystr, strlen(denystr), 
O_WRONLY | O_CLOEXEC)) { LOG_E("util::writeBufToFile('%s', '%s') failed", fname, denystr); return false; @@ -152,11 +177,11 @@ static bool gidMapSelf(nsjconf_t* nsjconf, pid_t pid) { return true; } -/* Use /usr/bin/newgidmap for writing the gid map */ -static bool gidMapExternal(nsjconf_t* nsjconf, pid_t pid UNUSED) { +/* Use newgidmap for writing the gid map */ +static bool gidMapExternal(nsjconf_t* nsjconf, pid_t pid) { bool use = false; - std::vector<std::string> argv = {"/usr/bin/newgidmap", std::to_string(pid)}; + std::vector<std::string> argv = {kNewGidPath, std::to_string(pid)}; for (const auto& gid : nsjconf->gids) { if (!gid.is_newidmap) { continue; @@ -171,18 +196,18 @@ static bool gidMapExternal(nsjconf_t* nsjconf, pid_t pid UNUSED) { return true; } if (subproc::systemExe(argv, environ) != 0) { - LOG_E("'/usr/bin/newgidmap' failed"); + LOG_E("'%s' failed", kNewGidPath); return false; } return true; } -/* Use /usr/bin/newuidmap for writing the uid map */ -static bool uidMapExternal(nsjconf_t* nsjconf, pid_t pid UNUSED) { +/* Use newuidmap for writing the uid map */ +static bool uidMapExternal(nsjconf_t* nsjconf, pid_t pid) { bool use = false; - std::vector<std::string> argv = {"/usr/bin/newuidmap", std::to_string(pid)}; + std::vector<std::string> argv = {kNewUidPath, std::to_string(pid)}; for (const auto& uid : nsjconf->uids) { if (!uid.is_newidmap) { continue; @@ -197,7 +222,7 @@ static bool uidMapExternal(nsjconf_t* nsjconf, pid_t pid UNUSED) { return true; } if (subproc::systemExe(argv, environ) != 0) { - LOG_E("'/usr/bin/newuidmap' failed"); + LOG_E("'%s' failed", kNewUidPath); return false; } @@ -214,7 +239,7 @@ static bool uidGidMap(nsjconf_t* nsjconf, pid_t pid) { } bool initNsFromParent(nsjconf_t* nsjconf, pid_t pid) { - if (!setGroups(pid)) { + if (!setGroupsDeny(nsjconf, pid)) { return false; } if (!nsjconf->clone_newuser) { @@ -227,13 +252,8 @@ bool initNsFromParent(nsjconf_t* nsjconf, pid_t pid) { } bool initNsFromChild(nsjconf_t* nsjconf) { - 
/* - * Best effort because of /proc/self/setgroups - */ - LOG_D("setgroups(0, NULL)"); - const gid_t* group_list = NULL; - if (setgroups(0, group_list) == -1) { - PLOG_D("setgroups(NULL) failed"); + if (!nsjconf->clone_newuser && nsjconf->orig_euid != 0) { + return true; } /* @@ -246,12 +266,48 @@ bool initNsFromChild(nsjconf_t* nsjconf) { return false; } + /* + * Best effort because of /proc/self/setgroups. We deny + * setgroups(2) calls only if user namespaces are in use. + */ + std::vector<gid_t> groups; + std::string groupsString = "["; + if (!nsjconf->clone_newuser && nsjconf->gids.size() > 1) { + for (auto it = nsjconf->gids.begin() + 1; it != nsjconf->gids.end(); it++) { + groups.push_back(it->inside_id); + groupsString += std::to_string(it->inside_id); + if (it < nsjconf->gids.end() - 1) groupsString += ", "; + } + } + groupsString += "]"; + if (!setResGid(nsjconf->gids[0].inside_id)) { - PLOG_E("setresgid(%u)", nsjconf->gids[0].inside_id); + PLOG_E("setresgid(%lu)", (unsigned long)nsjconf->gids[0].inside_id); return false; } + + LOG_D("setgroups(%zu, %s)", groups.size(), groupsString.c_str()); + if (setgroups(groups.size(), groups.data()) == -1) { + /* Indicate error if specific groups were requested */ + if (groups.size() > 0) { + PLOG_E("setgroups(%zu, %s) failed", groups.size(), groupsString.c_str()); + return false; + } + PLOG_D("setgroups(%zu, %s) failed", groups.size(), groupsString.c_str()); + } + if (!setResUid(nsjconf->uids[0].inside_id)) { - PLOG_E("setresuid(%u)", nsjconf->uids[0].inside_id); + PLOG_E("setresuid(%lu)", (unsigned long)nsjconf->uids[0].inside_id); + return false; + } + + /* + * Disable securebits again to avoid spawned programs + * unexpectedly retaining capabilities after a UID/GID + * change. 
+ */ + if (prctl(PR_SET_SECUREBITS, 0UL, 0UL, 0UL, 0UL) == -1) { + PLOG_E("prctl(PR_SET_SECUREBITS, 0)"); return false; } @@ -64,8 +64,7 @@ ssize_t readFromFd(int fd, void* buf, size_t len) { } ssize_t readFromFile(const char* fname, void* buf, size_t len) { - int fd; - TEMP_FAILURE_RETRY(fd = open(fname, O_RDONLY | O_CLOEXEC)); + int fd = TEMP_FAILURE_RETRY(open(fname, O_RDONLY | O_CLOEXEC)); if (fd == -1) { LOG_E("open('%s', O_RDONLY|O_CLOEXEC)", fname); return -1; @@ -212,11 +211,11 @@ static const uint64_t c = 1442695040888963407ULL; static void rndInitThread(void) { #if defined(__NR_getrandom) - if (syscall(__NR_getrandom, &rndX, sizeof(rndX), 0) == sizeof(rndX)) { + if (util::syscall(__NR_getrandom, (uintptr_t)&rndX, sizeof(rndX), 0) == sizeof(rndX)) { return; } #endif /* defined(__NR_getrandom) */ - int fd = open("/dev/urandom", O_RDONLY | O_CLOEXEC); + int fd = TEMP_FAILURE_RETRY(open("/dev/urandom", O_RDONLY | O_CLOEXEC)); if (fd == -1) { PLOG_D( "Couldn't open /dev/urandom for reading. Using gettimeofday " @@ -317,4 +316,9 @@ std::vector<std::string> strSplit(const std::string str, char delim) { return vec; } +long syscall(long sysno, uintptr_t a0, uintptr_t a1, uintptr_t a2, uintptr_t a3, uintptr_t a4, + uintptr_t a5) { + return ::syscall(sysno, a0, a1, a2, a3, a4, a5); +} + } // namespace util @@ -22,6 +22,7 @@ #ifndef NS_UTIL_H #define NS_UTIL_H +#include <inttypes.h> #include <stdbool.h> #include <stdint.h> #include <stdlib.h> @@ -53,6 +54,8 @@ uint64_t rnd64(void); const std::string sigName(int signo); const std::string timeToStr(time_t t); std::vector<std::string> strSplit(const std::string str, char delim); +long syscall(long sysno, uintptr_t a0 = 0, uintptr_t a1 = 0, uintptr_t a2 = 0, uintptr_t a3 = 0, + uintptr_t a4 = 0, uintptr_t a5 = 0); } // namespace util |