aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndroid Build Coastguard Worker <android-build-coastguard-worker@google.com>2021-08-31 18:48:50 +0000
committerAndroid Build Coastguard Worker <android-build-coastguard-worker@google.com>2021-08-31 18:48:50 +0000
commit9c38c3715e4dfacda98a65a0f5606d9059eedf43 (patch)
tree29c40001dd153d04d0e21c84faa7f093a332536d
parenta25cd900710305660a653c6b63d76286d8d6cdfe (diff)
parentb31ec2c09ec3143af016fd4319502292551f2b90 (diff)
downloadnsjail-9c38c3715e4dfacda98a65a0f5606d9059eedf43.tar.gz
Snap for 7691048 from b31ec2c09ec3143af016fd4319502292551f2b90 to build-tools-release
Change-Id: Ib509a1f522a1d1d5ea7ef29e7aade71e9a8c0370
-rw-r--r--.github/workflows/dockerpush.yml66
-rw-r--r--Android.bp2
-rw-r--r--Dockerfile3
-rw-r--r--Makefile34
-rw-r--r--README.md8
-rw-r--r--caps.cc21
-rw-r--r--cgroup.cc12
-rw-r--r--cgroup2.cc137
-rw-r--r--cgroup2.h38
-rw-r--r--cmdline.cc140
-rw-r--r--config.cc42
-rw-r--r--config.proto194
-rw-r--r--configs/apache.cfg3
-rw-r--r--configs/bash-with-fake-geteuid.cfg3
-rw-r--r--configs/demo-dont-use-chrome-with-net.cfg2
-rw-r--r--configs/firefox-with-cloned-net.cfg4
-rw-r--r--configs/firefox-with-net-wayland.cfg175
-rw-r--r--configs/firefox-with-net.cfg4
-rw-r--r--configs/home-documents-with-xorg-no-net.cfg16
-rw-r--r--configs/imagemagick-convert.cfg12
-rw-r--r--configs/static-busybox-with-execveat.cfg2
-rw-r--r--configs/tomcat8.cfg135
-rw-r--r--configs/xchat-with-net.cfg5
-rw-r--r--configs/znc-with-net.cfg136
-rw-r--r--contain.cc44
-rw-r--r--logs.cc49
-rw-r--r--logs.h2
-rw-r--r--macros.h5
-rw-r--r--mnt.cc143
-rw-r--r--net.cc74
-rw-r--r--nsjail.120
-rw-r--r--nsjail.cc160
-rw-r--r--nsjail.h28
-rw-r--r--pid.cc2
-rw-r--r--sandbox.cc5
-rw-r--r--subproc.cc293
-rw-r--r--subproc.h5
-rw-r--r--user.cc106
-rw-r--r--util.cc12
-rw-r--r--util.h3
40 files changed, 1679 insertions, 466 deletions
diff --git a/.github/workflows/dockerpush.yml b/.github/workflows/dockerpush.yml
new file mode 100644
index 0000000..5898f5b
--- /dev/null
+++ b/.github/workflows/dockerpush.yml
@@ -0,0 +1,66 @@
+# CI workflow: builds the Docker image for every push/PR and, on pushes only,
+# publishes it to the GitHub package registry.
+name: Docker
+
+on:
+  push:
+    # Publish `master` as Docker `latest` image.
+    branches:
+      - master
+
+    # Publish `v1.2.3` tags as releases.
+    tags:
+      - v*
+
+  # Run tests for any PRs.
+  pull_request:
+
+env:
+  IMAGE_NAME: nsjail
+
+jobs:
+  # Run tests.
+  # See also https://docs.docker.com/docker-hub/builds/automated-testing/
+  test:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v2
+
+      # Building the image is the test: the Dockerfile compiles the project.
+      - name: Run tests
+        run: docker build . --file Dockerfile
+
+  # Push image to GitHub Package Registry.
+  # See also https://docs.docker.com/docker-hub/builds/
+  push:
+    # Ensure test job passes before pushing image.
+    needs: test
+
+    runs-on: ubuntu-latest
+    if: github.event_name == 'push'
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Build image
+        run: docker build . --file Dockerfile --tag image
+
+      - name: Log into registry
+        run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login docker.pkg.github.com -u ${{ github.actor }} --password-stdin
+
+      - name: Push image
+        run: |
+          IMAGE_ID=docker.pkg.github.com/${{ github.repository }}/$IMAGE_NAME
+
+          # Docker image references must be lowercase, but the repository
+          # owner/name keeps its original case.
+          IMAGE_ID=$(echo "$IMAGE_ID" | tr '[:upper:]' '[:lower:]')
+
+          # Strip git ref prefix from version
+          VERSION=$(echo "${{ github.ref }}" | sed -e 's,.*/\(.*\),\1,')
+
+          # Strip "v" prefix from tag name
+          [[ "${{ github.ref }}" == "refs/tags/"* ]] && VERSION=$(echo $VERSION | sed -e 's/^v//')
+
+          # Use Docker `latest` tag convention
+          [ "$VERSION" == "master" ] && VERSION=latest
+
+          echo IMAGE_ID=$IMAGE_ID
+          echo VERSION=$VERSION
+
+          docker tag image $IMAGE_ID:$VERSION
+          docker push $IMAGE_ID:$VERSION
diff --git a/Android.bp b/Android.bp
index 806d250..cef4436 100644
--- a/Android.bp
+++ b/Android.bp
@@ -12,9 +12,11 @@ cc_binary_host {
"-Wno-unused-parameter",
],
cppflags: ["-fno-exceptions"],
+ shared_libs: ["libnl"],
srcs: [
"caps.cc",
"cgroup.cc",
+ "cgroup2.cc",
"cmdline.cc",
"config.cc",
"contain.cc",
diff --git a/Dockerfile b/Dockerfile
index 5bd472a..ce5c64f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM ubuntu:16.04
+FROM ubuntu:18.04
RUN apt-get -y update && apt-get install -y \
autoconf \
@@ -8,6 +8,7 @@ RUN apt-get -y update && apt-get install -y \
g++ \
git \
libprotobuf-dev \
+ libnl-route-3-dev \
libtool \
make \
pkg-config \
diff --git a/Makefile b/Makefile
index e318820..9494732 100644
--- a/Makefile
+++ b/Makefile
@@ -16,7 +16,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-PKG_CONFIG=$(shell which pkg-config)
+PKG_CONFIG=$(shell command -v pkg-config 2> /dev/null)
ifeq ($(PKG_CONFIG),)
$(error "Install pkg-config to make it work")
endif
@@ -31,13 +31,13 @@ COMMON_FLAGS += -O2 -c \
-Wall -Wextra -Werror \
-Ikafel/include
-CXXFLAGS += $(COMMON_FLAGS) $(shell pkg-config --cflags protobuf) \
+CXXFLAGS += $(USER_DEFINES) $(COMMON_FLAGS) $(shell pkg-config --cflags protobuf) \
-std=c++11 -fno-exceptions -Wno-unused -Wno-unused-parameter
LDFLAGS += -pie -Wl,-z,noexecstack -lpthread $(shell pkg-config --libs protobuf)
BIN = nsjail
LIBS = kafel/libkafel.a
-SRCS_CXX = caps.cc cgroup.cc cmdline.cc config.cc contain.cc cpu.cc logs.cc mnt.cc net.cc nsjail.cc pid.cc sandbox.cc subproc.cc uts.cc user.cc util.cc
+SRCS_CXX = caps.cc cgroup.cc cgroup2.cc cmdline.cc config.cc contain.cc cpu.cc logs.cc mnt.cc net.cc nsjail.cc pid.cc sandbox.cc subproc.cc uts.cc user.cc util.cc
SRCS_PROTO = config.proto
SRCS_PB_CXX = $(SRCS_PROTO:.proto=.pb.cc)
SRCS_PB_H = $(SRCS_PROTO:.proto=.pb.h)
@@ -48,14 +48,11 @@ ifdef DEBUG
CXXFLAGS += -g -ggdb -gdwarf-4
endif
-USE_NL3 ?= yes
-ifeq ($(USE_NL3), yes)
NL3_EXISTS := $(shell pkg-config --exists libnl-route-3.0 && echo yes)
ifeq ($(NL3_EXISTS), yes)
- CXXFLAGS += -DNSJAIL_NL3_WITH_MACVLAN $(shell pkg-config --cflags libnl-route-3.0)
+ CXXFLAGS += $(shell pkg-config --cflags libnl-route-3.0)
LDFLAGS += $(shell pkg-config --libs libnl-route-3.0)
endif
-endif
.PHONY: all clean depend indent
@@ -66,17 +63,21 @@ all: $(BIN)
$(BIN): $(LIBS) $(OBJS)
ifneq ($(NL3_EXISTS), yes)
- $(warning "==========================================================")
- $(warning "No support for libnl3/libnl-route-3; /sbin/ip will be used")
- $(warning "==========================================================")
+ $(warning "============================================================")
+ $(warning "You probably miss libnl3(-dev)/libnl-route-3(-dev) libraries")
+ $(warning "============================================================")
endif
$(CXX) -o $(BIN) $(OBJS) $(LIBS) $(LDFLAGS)
-kafel/libkafel.a:
+.PHONY: kafel_init
+kafel_init:
ifeq ("$(wildcard kafel/Makefile)","")
git submodule update --init
endif
- $(MAKE) -C kafel
+
+kafel/include/kafel.h: kafel_init
+kafel/libkafel.a: kafel_init
+ CFLAGS=-fPIE $(MAKE) -C kafel
# Sequence of proto deps, which doesn't fit automatic make rules
config.o: $(SRCS_PB_O) $(SRCS_PB_H)
@@ -104,9 +105,10 @@ indent:
caps.o: caps.h nsjail.h logs.h macros.h util.h
cgroup.o: cgroup.h nsjail.h logs.h util.h
+cgroup2.o: cgroup2.h nsjail.h logs.h util.h
cmdline.o: cmdline.h nsjail.h caps.h config.h logs.h macros.h mnt.h user.h
cmdline.o: util.h
-config.o: caps.h nsjail.h cmdline.h config.h config.pb.h logs.h macros.h
+config.o: config.h nsjail.h caps.h cmdline.h config.pb.h logs.h macros.h
config.o: mnt.h user.h util.h
contain.o: contain.h nsjail.h caps.h cgroup.h cpu.h logs.h macros.h mnt.h
contain.o: net.h pid.h user.h util.h uts.h
@@ -116,9 +118,9 @@ mnt.o: mnt.h nsjail.h logs.h macros.h subproc.h util.h
net.o: net.h nsjail.h logs.h subproc.h
nsjail.o: nsjail.h cmdline.h logs.h macros.h net.h sandbox.h subproc.h util.h
pid.o: pid.h nsjail.h logs.h subproc.h
-sandbox.o: sandbox.h nsjail.h kafel/include/kafel.h logs.h
-subproc.o: subproc.h nsjail.h cgroup.h contain.h logs.h macros.h net.h
-subproc.o: sandbox.h user.h util.h
+sandbox.o: sandbox.h nsjail.h kafel/include/kafel.h logs.h util.h
+subproc.o: subproc.h nsjail.h cgroup.h cgroup2.h contain.h logs.h macros.h
+subproc.o: net.h sandbox.h user.h util.h
uts.o: uts.h nsjail.h logs.h
user.o: user.h nsjail.h logs.h macros.h subproc.h util.h
util.o: util.h nsjail.h logs.h macros.h
diff --git a/README.md b/README.md
index 1b83daf..f3f2ccf 100644
--- a/README.md
+++ b/README.md
@@ -357,9 +357,9 @@ Options:
--rw
Mount chroot dir (/) R/W (default: R/O)
--user|-u VALUE
- Username/uid of processess inside the jail (default: your current uid). You can also use inside_ns_uid:outside_ns_uid:count convention here. Can be specified multiple times
+ Username/uid of processes inside the jail (default: your current uid). You can also use inside_ns_uid:outside_ns_uid:count convention here. Can be specified multiple times
--group|-g VALUE
- Groupname/gid of processess inside the jail (default: your current gid). You can also use inside_ns_gid:global_ns_gid:count convention here. Can be specified multiple times
+ Groupname/gid of processes inside the jail (default: your current gid). You can also use inside_ns_gid:global_ns_gid:count convention here. Can be specified multiple times
--hostname|-H VALUE
UTS name (hostname) of the jail (default: 'NSJAIL')
--cwd|-D VALUE
@@ -368,6 +368,8 @@ Options:
TCP port to bind to (enables MODE_LISTEN_TCP) (default: 0)
--bindhost VALUE
IP address to bind the port to (only in [MODE_LISTEN_TCP]), (default: '::')
+ --max_conns VALUE
+ Maximum number of connections across all IPs (only in [MODE_LISTEN_TCP]), (default: 0 (unlimited))
--max_conns_per_ip|-i VALUE
Maximum number of connections per one IP (only in [MODE_LISTEN_TCP]), (default: 0 (unlimited))
--log|-l VALUE
@@ -489,7 +491,7 @@ Options:
--cgroup_cpu_ms_per_sec VALUE
Number of milliseconds of CPU time per second that the process group can use (default: '0' - no limit)
--cgroup_cpu_mount VALUE
- Location of cpu cgroup FS (default: '/sys/fs/cgroup/net_cls')
+ Location of cpu cgroup FS (default: '/sys/fs/cgroup/cpu')
--cgroup_cpu_parent VALUE
Which pre-existing cpu cgroup to use as a parent (default: 'NSJAIL')
--iface_no_lo
diff --git a/caps.cc b/caps.cc
index 07785da..2c1c23a 100644
--- a/caps.cc
+++ b/caps.cc
@@ -21,6 +21,7 @@
#include "caps.h"
+#include <errno.h>
#include <linux/capability.h>
#include <string.h>
#include <sys/prctl.h>
@@ -80,6 +81,15 @@ struct {
#if defined(CAP_AUDIT_READ)
NS_VALSTR_STRUCT(CAP_AUDIT_READ),
#endif /* defined(CAP_AUDIT_READ) */
+#if defined(CAP_BPF)
+ NS_VALSTR_STRUCT(CAP_BPF),
+#endif /* defined(CAP_BPF) */
+#if defined(CAP_PERFMON)
+ NS_VALSTR_STRUCT(CAP_PERFMON),
+#endif /* defined(CAP_PERFMON) */
+#if defined(CAP_CHECKPOINT_RESTORE)
+ NS_VALSTR_STRUCT(CAP_CHECKPOINT_RESTORE),
+#endif /* defined(CAP_CHECKPOINT_RESTORE) */
};
int nameToVal(const char* name) {
@@ -88,7 +98,7 @@ int nameToVal(const char* name) {
return cap.val;
}
}
- LOG_W("Uknown capability: '%s'", name);
+ LOG_W("Unknown capability: '%s'", name);
return -1;
}
@@ -112,7 +122,7 @@ static cap_user_data_t getCaps() {
.version = _LINUX_CAPABILITY_VERSION_3,
.pid = 0,
};
- if (syscall(__NR_capget, &cap_hdr, &cap_data) == -1) {
+ if (util::syscall(__NR_capget, (uintptr_t)&cap_hdr, (uintptr_t)&cap_data) == -1) {
PLOG_W("capget() failed");
return NULL;
}
@@ -124,7 +134,7 @@ static bool setCaps(const cap_user_data_t cap_data) {
.version = _LINUX_CAPABILITY_VERSION_3,
.pid = 0,
};
- if (syscall(__NR_capset, &cap_hdr, cap_data) == -1) {
+ if (util::syscall(__NR_capset, (uintptr_t)&cap_hdr, (uintptr_t)cap_data) == -1) {
PLOG_W("capset() failed");
return false;
}
@@ -247,6 +257,11 @@ bool initNs(nsjconf_t* nsjconf) {
if (getInheritable(cap_data, i.val)) {
continue;
}
+ if (prctl(PR_CAPBSET_READ, (unsigned long)i.val, 0UL, 0UL, 0UL) == -1 &&
+ errno == EINVAL) {
+ LOG_D("Skipping unsupported capability: %s", i.name);
+ continue;
+ }
dbgmsg.append(" ").append(i.name);
if (prctl(PR_CAPBSET_DROP, (unsigned long)i.val, 0UL, 0UL, 0UL) == -1) {
PLOG_W("prctl(PR_CAPBSET_DROP, %s)", i.name);
diff --git a/cgroup.cc b/cgroup.cc
index 91a09ce..15c7649 100644
--- a/cgroup.cc
+++ b/cgroup.cc
@@ -38,12 +38,11 @@
namespace cgroup {
static bool createCgroup(const std::string& cgroup_path, pid_t pid) {
- LOG_D("Create '%s' for PID=%d", cgroup_path.c_str(), (int)pid);
+ LOG_D("Create '%s' for pid=%d", cgroup_path.c_str(), (int)pid);
if (mkdir(cgroup_path.c_str(), 0700) == -1 && errno != EEXIST) {
PLOG_W("mkdir('%s', 0700) failed", cgroup_path.c_str());
return false;
}
-
return true;
}
@@ -55,14 +54,13 @@ static bool writeToCgroup(
LOG_W("Could not update %s", what.c_str());
return false;
}
-
return true;
}
static bool addPidToTaskList(const std::string& cgroup_path, pid_t pid) {
std::string pid_str = std::to_string(pid);
std::string tasks_path = cgroup_path + "/tasks";
- LOG_D("Adding PID='%s' to '%s'", pid_str.c_str(), tasks_path.c_str());
+ LOG_D("Adding pid='%s' to '%s'", pid_str.c_str(), tasks_path.c_str());
return writeToCgroup(tasks_path, pid_str, "'" + tasks_path + "' task list");
}
@@ -136,12 +134,12 @@ static bool initNsFromParentCpu(nsjconf_t* nsjconf, pid_t pid) {
"/NSJAIL." + std::to_string(pid);
RETURN_ON_FAILURE(createCgroup(cpu_cgroup_path, pid));
- std::string cpu_ms_per_sec_str = std::to_string(nsjconf->cgroup_cpu_ms_per_sec * 1000U);
RETURN_ON_FAILURE(
- writeToCgroup(cpu_cgroup_path + "/cpu.cfs_quota_us", cpu_ms_per_sec_str, "cpu quota"));
+ writeToCgroup(cpu_cgroup_path + "/cpu.cfs_period_us", "1000000", "cpu period"));
+ std::string cpu_ms_per_sec_str = std::to_string(nsjconf->cgroup_cpu_ms_per_sec * 1000U);
RETURN_ON_FAILURE(
- writeToCgroup(cpu_cgroup_path + "/cpu.cfs_period_us", "1000000", "cpu period"));
+ writeToCgroup(cpu_cgroup_path + "/cpu.cfs_quota_us", cpu_ms_per_sec_str, "cpu quota"));
return addPidToTaskList(cpu_cgroup_path, pid);
}
diff --git a/cgroup2.cc b/cgroup2.cc
new file mode 100644
index 0000000..6b0dc09
--- /dev/null
+++ b/cgroup2.cc
@@ -0,0 +1,137 @@
+/*
+
+ nsjail - cgroup2 namespacing
+ -----------------------------------------
+
+ Copyright 2014 Google Inc. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+*/
+
+#include "cgroup2.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <fstream>
+#include <iostream>
+#include <sstream>
+
+#include "logs.h"
+#include "util.h"
+
+namespace cgroup2 {
+
+/*
+ * Returns the directory used for this jail's cgroup:
+ * "<nsjconf->cgroupv2_mount>/NSJAIL.<pid>".
+ */
+static std::string getCgroupPath(nsjconf_t *nsjconf, pid_t pid) {
+	const std::string pid_str = std::to_string(pid);
+	return nsjconf->cgroupv2_mount + "/NSJAIL." + pid_str;
+}
+
+/*
+ * Creates the directory `cgroup_path` with mode 0700.
+ * An already existing directory (EEXIST) counts as success; any other mkdir()
+ * error is logged and reported as false. `pid` is used for the debug log only.
+ */
+static bool createCgroup(const std::string &cgroup_path, pid_t pid) {
+	const char *dir = cgroup_path.c_str();
+	LOG_D("Create '%s' for pid=%d", dir, (int)pid);
+
+	if (mkdir(dir, 0700) != -1) {
+		return true;
+	}
+	if (errno == EEXIST) {
+		return true;
+	}
+	PLOG_W("mkdir('%s', 0700) failed", dir);
+	return false;
+}
+
+/*
+ * Writes `value` to the control file "<cgroup_path>/<resource>" through
+ * util::writeBufToFile() (O_WRONLY). Logs the assignment beforehand and a
+ * warning when the write is reported as failed.
+ * Returns true on success, false otherwise.
+ */
+static bool writeToCgroup(
+	const std::string &cgroup_path, const std::string &resource, const std::string &value) {
+	LOG_I("Setting '%s' to '%s'", resource.c_str(), value.c_str());
+
+	const std::string control_file = cgroup_path + "/" + resource;
+	if (util::writeBufToFile(control_file.c_str(), value.c_str(), value.length(), O_WRONLY)) {
+		return true;
+	}
+	LOG_W("Could not update %s", resource.c_str());
+	return false;
+}
+
+/*
+ * Writes the decimal form of `pid` into "<cgroup_path>/cgroup.procs".
+ * Returns true when util::writeBufToFile() reports success; otherwise logs a
+ * warning and returns false.
+ */
+static bool addPidToProcList(const std::string &cgroup_path, pid_t pid) {
+	const std::string procs_file = cgroup_path + "/cgroup.procs";
+	const std::string pid_text = std::to_string(pid);
+
+	LOG_D("Adding pid='%s' to cgroup.procs", pid_text.c_str());
+	const bool written = util::writeBufToFile(
+		procs_file.c_str(), pid_text.c_str(), pid_text.length(), O_WRONLY);
+	if (!written) {
+		LOG_W("Could not update cgroup.procs");
+		return false;
+	}
+	return true;
+}
+
+/*
+ * Removes the directory `cgroup_path` with rmdir(). A failure is only logged;
+ * nothing is reported back to the caller.
+ */
+static void removeCgroup(const std::string &cgroup_path) {
+	const char *dir = cgroup_path.c_str();
+	LOG_D("Remove '%s'", dir);
+
+	if (rmdir(dir) != -1) {
+		return;
+	}
+	PLOG_W("rmdir('%s') failed", dir);
+}
+
+/*
+ * Applies the memory limit to the jail's cgroup.
+ * Does nothing (and succeeds) when nsjconf->cgroup_mem_max is 0. Otherwise it
+ * creates the cgroup, adds `pid` to it and writes the limit to "memory.max".
+ */
+static bool initNsFromParentMem(nsjconf_t *nsjconf, pid_t pid) {
+	const bool limit_requested = (nsjconf->cgroup_mem_max != (size_t)0);
+	if (!limit_requested) {
+		return true;
+	}
+
+	const std::string jail_cgroup = getCgroupPath(nsjconf, pid);
+	RETURN_ON_FAILURE(createCgroup(jail_cgroup, pid));
+	RETURN_ON_FAILURE(addPidToProcList(jail_cgroup, pid));
+
+	const std::string limit = std::to_string(nsjconf->cgroup_mem_max);
+	return writeToCgroup(jail_cgroup, "memory.max", limit);
+}
+
+/*
+ * Applies the process-count limit to the jail's cgroup.
+ * Does nothing (and succeeds) when nsjconf->cgroup_pids_max is 0. Otherwise it
+ * creates the cgroup, adds `pid` to it and writes the limit to "pids.max".
+ */
+static bool initNsFromParentPids(nsjconf_t *nsjconf, pid_t pid) {
+	const bool limit_requested = (nsjconf->cgroup_pids_max != 0U);
+	if (!limit_requested) {
+		return true;
+	}
+
+	const std::string jail_cgroup = getCgroupPath(nsjconf, pid);
+	RETURN_ON_FAILURE(createCgroup(jail_cgroup, pid));
+	RETURN_ON_FAILURE(addPidToProcList(jail_cgroup, pid));
+
+	const std::string limit = std::to_string(nsjconf->cgroup_pids_max);
+	return writeToCgroup(jail_cgroup, "pids.max", limit);
+}
+
+/*
+ * Applies the CPU bandwidth limit to the jail's cgroup.
+ * Does nothing (and succeeds) when nsjconf->cgroup_cpu_ms_per_sec is 0.
+ * Otherwise it creates the cgroup, adds `pid` to it and writes "cpu.max".
+ */
+static bool initNsFromParentCpu(nsjconf_t *nsjconf, pid_t pid) {
+	const bool limit_requested = (nsjconf->cgroup_cpu_ms_per_sec != 0U);
+	if (!limit_requested) {
+		return true;
+	}
+
+	const std::string jail_cgroup = getCgroupPath(nsjconf, pid);
+	RETURN_ON_FAILURE(createCgroup(jail_cgroup, pid));
+	RETURN_ON_FAILURE(addPidToProcList(jail_cgroup, pid));
+
+	// cpu.max is written as "$MAX $PERIOD": the group may consume up to $MAX in
+	// each $PERIOD. $PERIOD is fixed at 1000000 here and $MAX is the configured
+	// ms-per-second value multiplied by 1000.
+	const std::string max_and_period =
+		std::to_string(nsjconf->cgroup_cpu_ms_per_sec * 1000U) + " 1000000";
+	return writeToCgroup(jail_cgroup, "cpu.max", max_and_period);
+}
+
+/*
+ * Entry point that applies the configured cgroup v2 limits to `pid` by calling
+ * the memory, pids and cpu helpers in that order. The return value of the cpu
+ * helper is returned when it is reached.
+ * NOTE(review): RETURN_ON_FAILURE is defined outside this file; presumably it
+ * returns false from this function as soon as a helper fails - confirm.
+ */
+bool initNsFromParent(nsjconf_t *nsjconf, pid_t pid) {
+ RETURN_ON_FAILURE(initNsFromParentMem(nsjconf, pid));
+ RETURN_ON_FAILURE(initNsFromParentPids(nsjconf, pid));
+ return initNsFromParentCpu(nsjconf, pid);
+}
+
+/*
+ * Removes the jail's cgroup directory for `pid`, but only when at least one of
+ * the memory, pids or cpu limits was configured (non-zero); with no limit set
+ * nothing is removed.
+ */
+void finishFromParent(nsjconf_t *nsjconf, pid_t pid) {
+	const bool mem_limited = (nsjconf->cgroup_mem_max != (size_t)0);
+	const bool pids_limited = (nsjconf->cgroup_pids_max != 0U);
+	const bool cpu_limited = (nsjconf->cgroup_cpu_ms_per_sec != 0U);
+
+	if (!mem_limited && !pids_limited && !cpu_limited) {
+		return;
+	}
+	removeCgroup(getCgroupPath(nsjconf, pid));
+}
+
+} // namespace cgroup2
diff --git a/cgroup2.h b/cgroup2.h
new file mode 100644
index 0000000..3e0cc71
--- /dev/null
+++ b/cgroup2.h
@@ -0,0 +1,38 @@
+/*
+
+ nsjail - cgroup2 namespacing
+ -----------------------------------------
+
+ Copyright 2014 Google Inc. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+*/
+
+#ifndef NS_CGROUP2_H
+#define NS_CGROUP2_H
+
+#include <stdbool.h>
+#include <stddef.h>
+
+#include "nsjail.h"
+
+namespace cgroup2 {
+
+/*
+ * Applies the configured cgroup v2 memory, pids and cpu limits to `pid`.
+ * Returns false when a step fails.
+ */
+bool initNsFromParent(nsjconf_t* nsjconf, pid_t pid);
+/*
+ * NOTE(review): cgroup2.cc, as added together with this header, contains no
+ * definition of initNs() - confirm whether this declaration is still needed.
+ */
+bool initNs(void);
+/*
+ * Removes the cgroup directory created for `pid` if any of the memory, pids
+ * or cpu limits is configured.
+ */
+void finishFromParent(nsjconf_t* nsjconf, pid_t pid);
+
+} // namespace cgroup2
+
+#endif /* NS_CGROUP2_H */
diff --git a/cmdline.cc b/cmdline.cc
index 4347e9a..a2b825b 100644
--- a/cmdline.cc
+++ b/cmdline.cc
@@ -76,13 +76,15 @@ struct custom_option custom_opts[] = {
{ { "exec_file", required_argument, NULL, 'x' }, "File to exec (default: argv[0])" },
{ { "execute_fd", no_argument, NULL, 0x0607 }, "Use execveat() to execute a file-descriptor instead of executing the binary path. In such case argv[0]/exec_file denotes a file path before mount namespacing" },
{ { "chroot", required_argument, NULL, 'c' }, "Directory containing / of the jail (default: none)" },
+ { { "no_pivotroot", no_argument, NULL, 0x600 }, "When creating a mount namespace, use mount(MS_MOVE) and chroot rather than pivot_root. Usefull when pivot_root is disallowed (e.g. initramfs). Note: escapable is some configuration" },
{ { "rw", no_argument, NULL, 0x601 }, "Mount chroot dir (/) R/W (default: R/O)" },
- { { "user", required_argument, NULL, 'u' }, "Username/uid of processess inside the jail (default: your current uid). You can also use inside_ns_uid:outside_ns_uid:count convention here. Can be specified multiple times" },
- { { "group", required_argument, NULL, 'g' }, "Groupname/gid of processess inside the jail (default: your current gid). You can also use inside_ns_gid:global_ns_gid:count convention here. Can be specified multiple times" },
+ { { "user", required_argument, NULL, 'u' }, "Username/uid of processes inside the jail (default: your current uid). You can also use inside_ns_uid:outside_ns_uid:count convention here. Can be specified multiple times" },
+ { { "group", required_argument, NULL, 'g' }, "Groupname/gid of processes inside the jail (default: your current gid). You can also use inside_ns_gid:global_ns_gid:count convention here. Can be specified multiple times" },
{ { "hostname", required_argument, NULL, 'H' }, "UTS name (hostname) of the jail (default: 'NSJAIL')" },
{ { "cwd", required_argument, NULL, 'D' }, "Directory in the namespace the process will run (default: '/')" },
{ { "port", required_argument, NULL, 'p' }, "TCP port to bind to (enables MODE_LISTEN_TCP) (default: 0)" },
{ { "bindhost", required_argument, NULL, 0x604 }, "IP address to bind the port to (only in [MODE_LISTEN_TCP]), (default: '::')" },
+ { { "max_conns", required_argument, NULL, 0x608 }, "Maximum number of connections across all IPs (only in [MODE_LISTEN_TCP]), (default: 0 (unlimited))" },
{ { "max_conns_per_ip", required_argument, NULL, 'i' }, "Maximum number of connections per one IP (only in [MODE_LISTEN_TCP]), (default: 0 (unlimited))" },
{ { "log", required_argument, NULL, 'l' }, "Log file (default: use log_fd)" },
{ { "log_fd", required_argument, NULL, 'L' }, "Log FD (default: 2)" },
@@ -92,8 +94,8 @@ struct custom_option custom_opts[] = {
{ { "verbose", no_argument, NULL, 'v' }, "Verbose output" },
{ { "quiet", no_argument, NULL, 'q' }, "Log warning and more important messages only" },
{ { "really_quiet", no_argument, NULL, 'Q' }, "Log fatal messages only" },
- { { "keep_env", no_argument, NULL, 'e' }, "Pass all environment variables to the child process (default: all envvars are cleared)" },
- { { "env", required_argument, NULL, 'E' }, "Additional environment variable (can be used multiple times). If the envvar doesn't contain '=' (e.g. just the 'DISPLAY' string), the current envvar value will be used" },
+ { { "keep_env", no_argument, NULL, 'e' }, "Pass all environment variables to the child process (default: all envars are cleared)" },
+ { { "env", required_argument, NULL, 'E' }, "Additional environment variable (can be used multiple times). If the envar doesn't contain '=' (e.g. just the 'DISPLAY' string), the current envar value will be used" },
{ { "keep_caps", no_argument, NULL, 0x0501 }, "Don't drop any capabilities" },
{ { "cap", required_argument, NULL, 0x0509 }, "Retain this capability, e.g. CAP_PTRACE (can be specified multiple times)" },
{ { "silent", no_argument, NULL, 0x0502 }, "Redirect child process' fd:0/1/2 to /dev/null" },
@@ -108,6 +110,10 @@ struct custom_option custom_opts[] = {
{ { "rlimit_nofile", required_argument, NULL, 0x0205 }, "RLIMIT_NOFILE, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 32)" },
{ { "rlimit_nproc", required_argument, NULL, 0x0206 }, "RLIMIT_NPROC, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 'soft')" },
{ { "rlimit_stack", required_argument, NULL, 0x0207 }, "RLIMIT_STACK in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 'soft')" },
+ { { "rlimit_memlock", required_argument, NULL, 0x0209 }, "RLIMIT_MEMLOCK in KB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 'soft')" },
+ { { "rlimit_rtprio", required_argument, NULL, 0x0210 }, "RLIMIT_RTPRIO, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 'soft')" },
+ { { "rlimit_msgqueue", required_argument, NULL, 0x0211 }, "RLIMIT_MSGQUEUE in bytes, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 'soft')" },
+ { { "disable_rlimits", no_argument, NULL, 0x0208 }, "Disable all rlimits, default to limits set by parent" },
{ { "persona_addr_compat_layout", no_argument, NULL, 0x0301 }, "personality(ADDR_COMPAT_LAYOUT)" },
{ { "persona_mmap_page_zero", no_argument, NULL, 0x0302 }, "personality(MMAP_PAGE_ZERO)" },
{ { "persona_read_implies_exec", no_argument, NULL, 0x0303 }, "personality(READ_IMPLIES_EXEC)" },
@@ -120,6 +126,7 @@ struct custom_option custom_opts[] = {
{ { "disable_clone_newipc", no_argument, NULL, 0x0405 }, "Don't use CLONE_NEWIPC" },
{ { "disable_clone_newuts", no_argument, NULL, 0x0406 }, "Don't use CLONE_NEWUTS" },
{ { "disable_clone_newcgroup", no_argument, NULL, 0x0407 }, "Don't use CLONE_NEWCGROUP. Might be required for kernel versions < 4.6" },
+ { { "enable_clone_newtime", no_argument, NULL, 0x0408 }, "Use CLONE_NEWTIME. Supported with kernel versions >= 5.3" },
{ { "uid_mapping", required_argument, NULL, 'U' }, "Add a custom uid mapping of the form inside_uid:outside_uid:count. Setting this requires newuidmap (set-uid) to be present" },
{ { "gid_mapping", required_argument, NULL, 'G' }, "Add a custom gid mapping of the form inside_gid:outside_gid:count. Setting this requires newgidmap (set-uid) to be present" },
{ { "bindmount_ro", required_argument, NULL, 'R' }, "List of mountpoints to be mounted --bind (ro) inside the container. Can be specified multiple times. Supports 'source' syntax, or 'source:dest'" },
@@ -133,6 +140,7 @@ struct custom_option custom_opts[] = {
{ { "seccomp_policy", required_argument, NULL, 'P' }, "Path to file containing seccomp-bpf policy (see kafel/)" },
{ { "seccomp_string", required_argument, NULL, 0x0901 }, "String with kafel seccomp-bpf policy (see kafel/)" },
{ { "seccomp_log", no_argument, NULL, 0x0902 }, "Use SECCOMP_FILTER_FLAG_LOG. Log all actions except SECCOMP_RET_ALLOW). Supported since kernel version 4.14" },
+ { { "nice_level", required_argument, NULL, 0x0903 }, "Set jailed process niceness (-20 is highest -priority, 19 is lowest). By default, set to 19" },
{ { "cgroup_mem_max", required_argument, NULL, 0x0801 }, "Maximum number of bytes to use in the group (default: '0' - disabled)" },
{ { "cgroup_mem_mount", required_argument, NULL, 0x0802 }, "Location of memory cgroup FS (default: '/sys/fs/cgroup/memory')" },
{ { "cgroup_mem_parent", required_argument, NULL, 0x0803 }, "Which pre-existing memory cgroup to use as a parent (default: 'NSJAIL')" },
@@ -143,8 +151,10 @@ struct custom_option custom_opts[] = {
{ { "cgroup_net_cls_mount", required_argument, NULL, 0x0822 }, "Location of net_cls cgroup FS (default: '/sys/fs/cgroup/net_cls')" },
{ { "cgroup_net_cls_parent", required_argument, NULL, 0x0823 }, "Which pre-existing net_cls cgroup to use as a parent (default: 'NSJAIL')" },
{ { "cgroup_cpu_ms_per_sec", required_argument, NULL, 0x0831 }, "Number of milliseconds of CPU time per second that the process group can use (default: '0' - no limit)" },
- { { "cgroup_cpu_mount", required_argument, NULL, 0x0822 }, "Location of cpu cgroup FS (default: '/sys/fs/cgroup/net_cls')" },
+ { { "cgroup_cpu_mount", required_argument, NULL, 0x0832 }, "Location of cpu cgroup FS (default: '/sys/fs/cgroup/cpu')" },
{ { "cgroup_cpu_parent", required_argument, NULL, 0x0833 }, "Which pre-existing cpu cgroup to use as a parent (default: 'NSJAIL')" },
+ { { "cgroupv2_mount", required_argument, NULL, 0x0834}, "Location of cgroupv2 directory (default: '/sys/fs/cgroup')"},
+ { { "use_cgroupv2", no_argument, NULL, 0x0835}, "Use cgroup v2"},
{ { "iface_no_lo", no_argument, NULL, 0x700 }, "Don't bring the 'lo' interface up" },
{ { "iface_own", required_argument, NULL, 0x704 }, "Move this existing network interface into the new NET namespace. Can be specified multiple times" },
{ { "macvlan_iface", required_argument, NULL, 'I' }, "Interface which will be cloned (MACVLAN) and put inside the subprocess' namespace as 'vs'" },
@@ -152,6 +162,7 @@ struct custom_option custom_opts[] = {
{ { "macvlan_vs_nm", required_argument, NULL, 0x702 }, "Netmask of the 'vs' interface (e.g. \"255.255.255.0\")" },
{ { "macvlan_vs_gw", required_argument, NULL, 0x703 }, "Default GW for the 'vs' interface (e.g. \"192.168.0.1\")" },
{ { "macvlan_vs_ma", required_argument, NULL, 0x705 }, "MAC-address of the 'vs' interface (e.g. \"ba:ad:ba:be:45:00\")" },
+ { { "macvlan_vs_mo", required_argument, NULL, 0x706 }, "Mode of the 'vs' interface. Can be either 'private', 'vepa', 'bridge' or 'passthru' (default: 'private')" },
};
// clang-format on
@@ -194,7 +205,7 @@ void addEnv(nsjconf_t* nsjconf, const std::string& env) {
}
char* e = getenv(env.c_str());
if (!e) {
- LOG_W("Requested to use the '%s' envvar, but it's not set. It'll be ignored",
+ LOG_W("Requested to use the '%s' envar, but it's not set. It'll be ignored",
env.c_str());
return;
}
@@ -222,23 +233,25 @@ void logParams(nsjconf_t* nsjconf) {
LOG_I(
"Jail parameters: hostname:'%s', chroot:'%s', process:'%s', bind:[%s]:%d, "
- "max_conns_per_ip:%u, time_limit:%" PRId64
+ "max_conns:%u, max_conns_per_ip:%u, time_limit:%" PRId64
", personality:%#lx, daemonize:%s, clone_newnet:%s, "
"clone_newuser:%s, clone_newns:%s, clone_newpid:%s, clone_newipc:%s, clone_newuts:%s, "
- "clone_newcgroup:%s, keep_caps:%s, disable_no_new_privs:%s, max_cpus:%zu",
+ "clone_newcgroup:%s, clone_newtime:%s, keep_caps:%s, disable_no_new_privs:%s, "
+ "max_cpus:%zu",
nsjconf->hostname.c_str(), nsjconf->chroot.c_str(),
nsjconf->exec_file.empty() ? nsjconf->argv[0].c_str() : nsjconf->exec_file.c_str(),
- nsjconf->bindhost.c_str(), nsjconf->port, nsjconf->max_conns_per_ip, nsjconf->tlimit,
- nsjconf->personality, logYesNo(nsjconf->daemonize), logYesNo(nsjconf->clone_newnet),
- logYesNo(nsjconf->clone_newuser), logYesNo(nsjconf->clone_newns),
- logYesNo(nsjconf->clone_newpid), logYesNo(nsjconf->clone_newipc),
- logYesNo(nsjconf->clone_newuts), logYesNo(nsjconf->clone_newcgroup),
+ nsjconf->bindhost.c_str(), nsjconf->port, nsjconf->max_conns, nsjconf->max_conns_per_ip,
+ nsjconf->tlimit, nsjconf->personality, logYesNo(nsjconf->daemonize),
+ logYesNo(nsjconf->clone_newnet), logYesNo(nsjconf->clone_newuser),
+ logYesNo(nsjconf->clone_newns), logYesNo(nsjconf->clone_newpid),
+ logYesNo(nsjconf->clone_newipc), logYesNo(nsjconf->clone_newuts),
+ logYesNo(nsjconf->clone_newcgroup), logYesNo(nsjconf->clone_newtime),
logYesNo(nsjconf->keep_caps), logYesNo(nsjconf->disable_no_new_privs),
nsjconf->max_cpus);
for (const auto& p : nsjconf->mountpts) {
- LOG_I("%s: %s", p.is_symlink ? "Symlink" : "Mount point",
- mnt::describeMountPt(p).c_str());
+ LOG_I(
+ "%s: %s", p.is_symlink ? "Symlink" : "Mount", mnt::describeMountPt(p).c_str());
}
for (const auto& uid : nsjconf->uids) {
LOG_I("Uid map: inside_uid:%lu outside_uid:%lu count:%zu newuidmap:%s",
@@ -298,16 +311,23 @@ static std::string argFromVec(const std::vector<std::string>& vec, size_t pos) {
}
static bool setupArgv(nsjconf_t* nsjconf, int argc, char** argv, int optind) {
- for (int i = optind; i < argc; i++) {
- nsjconf->argv.push_back(argv[i]);
+ /*
+ * If user provided cmdline via nsjail [opts] -- [cmdline], then override the one from the
+ * config file
+ */
+ if (optind < argc) {
+ nsjconf->argv.clear();
+ for (int i = optind; i < argc; i++) {
+ nsjconf->argv.push_back(argv[i]);
+ }
}
- if (nsjconf->argv.empty()) {
- cmdlineUsage(argv[0]);
- LOG_E("No command provided");
- return false;
+ if (nsjconf->exec_file.empty() && nsjconf->argv.size() > 0) {
+ nsjconf->exec_file = nsjconf->argv[0];
}
if (nsjconf->exec_file.empty()) {
- nsjconf->exec_file = nsjconf->argv[0];
+ cmdlineUsage(argv[0]);
+ LOG_E("No command-line provided");
+ return false;
}
if (nsjconf->use_execveat) {
@@ -376,6 +396,18 @@ void setupUsers(nsjconf_t* nsjconf) {
}
}
+std::string parseMACVlanMode(const char* optarg) {
+ if (strcasecmp(optarg, "private") != 0 && strcasecmp(optarg, "vepa") != 0 &&
+ strcasecmp(optarg, "bridge") != 0 && strcasecmp(optarg, "passthru") != 0) {
+ LOG_F(
+ "macvlan mode can only be one of the values: "
+ "'private'/'vepa'/'bridge'/'passthru' ('%s' "
+ "provided).",
+ optarg);
+ }
+ return std::string(optarg);
+}
+
std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) {
std::unique_ptr<nsjconf_t> nsjconf(new nsjconf_t);
@@ -391,26 +423,33 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) {
nsjconf->keep_env = false;
nsjconf->keep_caps = false;
nsjconf->disable_no_new_privs = false;
- nsjconf->rl_as = 512 * (1024 * 1024);
- nsjconf->rl_core = 0;
- nsjconf->rl_cpu = 600;
- nsjconf->rl_fsize = 1 * (1024 * 1024);
- nsjconf->rl_nofile = 32;
+ nsjconf->rl_as = 4096ULL * (1024ULL * 1024ULL);
+ nsjconf->rl_core = 0ULL;
+ nsjconf->rl_cpu = 600ULL;
+ nsjconf->rl_fsize = 1ULL * (1024ULL * 1024ULL);
+ nsjconf->rl_nofile = 32ULL;
nsjconf->rl_nproc = parseRLimit(RLIMIT_NPROC, "soft", 1);
nsjconf->rl_stack = parseRLimit(RLIMIT_STACK, "soft", 1);
+ nsjconf->rl_mlock = parseRLimit(RLIMIT_MEMLOCK, "soft", 1);
+ nsjconf->rl_rtpr = parseRLimit(RLIMIT_RTPRIO, "soft", 1);
+ nsjconf->rl_msgq = parseRLimit(RLIMIT_MSGQUEUE, "soft", 1);
+ nsjconf->disable_rl = false;
nsjconf->personality = 0;
nsjconf->clone_newnet = true;
nsjconf->clone_newuser = true;
nsjconf->clone_newns = true;
+ nsjconf->no_pivotroot = false;
nsjconf->clone_newpid = true;
nsjconf->clone_newipc = true;
nsjconf->clone_newuts = true;
nsjconf->clone_newcgroup = true;
+ nsjconf->clone_newtime = false;
nsjconf->mode = MODE_STANDALONE_ONCE;
nsjconf->is_root_rw = false;
nsjconf->is_silent = false;
nsjconf->stderr_to_null = false;
nsjconf->skip_setsid = false;
+ nsjconf->max_conns = 0;
nsjconf->max_conns_per_ip = 0;
nsjconf->proc_path = "/proc";
nsjconf->is_proc_rw = false;
@@ -426,16 +465,21 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) {
nsjconf->cgroup_cpu_mount = "/sys/fs/cgroup/cpu";
nsjconf->cgroup_cpu_parent = "NSJAIL";
nsjconf->cgroup_cpu_ms_per_sec = 0U;
+ nsjconf->cgroupv2_mount = "/sys/fs/cgroup";
+ nsjconf->use_cgroupv2 = false;
nsjconf->iface_lo = true;
nsjconf->iface_vs_ip = "0.0.0.0";
nsjconf->iface_vs_nm = "255.255.255.0";
nsjconf->iface_vs_gw = "0.0.0.0";
nsjconf->iface_vs_ma = "";
+ nsjconf->iface_vs_mo = "private";
nsjconf->orig_uid = getuid();
+ nsjconf->orig_euid = geteuid();
nsjconf->num_cpus = sysconf(_SC_NPROCESSORS_ONLN);
nsjconf->seccomp_fprog.filter = NULL;
nsjconf->seccomp_fprog.len = 0;
nsjconf->seccomp_log = false;
+ nsjconf->nice_level = 19;
nsjconf->openfds.push_back(STDIN_FILENO);
nsjconf->openfds.push_back(STDOUT_FILENO);
@@ -477,20 +521,27 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) {
nsjconf->chroot = optarg;
break;
case 'p':
+ if (!util::isANumber(optarg)) {
+ LOG_E("Couldn't parse TCP port '%s'", optarg);
+ return nullptr;
+ }
nsjconf->port = strtoumax(optarg, NULL, 0);
nsjconf->mode = MODE_LISTEN_TCP;
break;
case 0x604:
nsjconf->bindhost = optarg;
break;
+ case 0x608:
+ nsjconf->max_conns = strtoul(optarg, NULL, 0);
+ break;
case 'i':
nsjconf->max_conns_per_ip = strtoul(optarg, NULL, 0);
break;
case 'l':
- logs::logFile(optarg);
+ logs::logFile(optarg, STDERR_FILENO);
break;
case 'L':
- logs::logFile(std::string("/dev/fd/") + optarg);
+ logs::logFile("", std::strtol(optarg, NULL, 0));
break;
case 'd':
nsjconf->daemonize = true;
@@ -535,6 +586,18 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) {
case 0x0207:
nsjconf->rl_stack = parseRLimit(RLIMIT_STACK, optarg, (1024 * 1024));
break;
+ case 0x0209:
+ nsjconf->rl_mlock = parseRLimit(RLIMIT_MEMLOCK, optarg, 1024);
+ break;
+ case 0x0210:
+ nsjconf->rl_rtpr = parseRLimit(RLIMIT_RTPRIO, optarg, 1);
+ break;
+ case 0x0211:
+ nsjconf->rl_msgq = parseRLimit(RLIMIT_MSGQUEUE, optarg, 1);
+ break;
+ case 0x0208:
+ nsjconf->disable_rl = true;
+ break;
case 0x0301:
nsjconf->personality |= ADDR_COMPAT_LAYOUT;
break;
@@ -572,7 +635,7 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) {
nsjconf->clone_newcgroup = false;
break;
case 0x0408:
- nsjconf->clone_newcgroup = true;
+ nsjconf->clone_newtime = true;
break;
case 0x0501:
nsjconf->keep_caps = true;
@@ -602,6 +665,9 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) {
}
nsjconf->caps.push_back(cap);
} break;
+ case 0x0600:
+ nsjconf->no_pivotroot = true;
+ break;
case 0x0601:
nsjconf->is_root_rw = true;
break;
@@ -777,6 +843,9 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) {
case 0x705:
nsjconf->iface_vs_ma = optarg;
break;
+ case 0x706:
+ nsjconf->iface_vs_mo = parseMACVlanMode(optarg);
+ break;
case 0x801:
nsjconf->cgroup_mem_max = (size_t)strtoull(optarg, NULL, 0);
break;
@@ -813,6 +882,12 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) {
case 0x833:
nsjconf->cgroup_cpu_parent = optarg;
break;
+ case 0x834:
+ nsjconf->cgroupv2_mount = optarg;
+ break;
+ case 0x835:
+ nsjconf->use_cgroupv2 = true;
+ break;
case 'P':
nsjconf->kafel_file_path = optarg;
break;
@@ -822,6 +897,9 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) {
case 0x902:
nsjconf->seccomp_log = true;
break;
+ case 0x903:
+ nsjconf->nice_level = (int)strtol(optarg, NULL, 0);
+ break;
default:
cmdlineUsage(argv[0]);
return nullptr;
@@ -830,7 +908,7 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) {
}
if (nsjconf->daemonize && !logs::logSet()) {
- logs::logFile(_LOG_DEFAULT_FILE);
+ logs::logFile(_LOG_DEFAULT_FILE, STDERR_FILENO);
}
if (!setupMounts(nsjconf.get())) {
return nullptr;
diff --git a/config.cc b/config.cc
index adabf0e..551e59d 100644
--- a/config.cc
+++ b/config.cc
@@ -19,7 +19,11 @@
*/
+#include "config.h"
+
#include <fcntl.h>
+#include <google/protobuf/io/zero_copy_stream_impl.h>
+#include <google/protobuf/text_format.h>
#include <stdio.h>
#include <sys/mount.h>
#include <sys/personality.h>
@@ -27,15 +31,12 @@
#include <sys/stat.h>
#include <sys/types.h>
-#include <google/protobuf/io/zero_copy_stream_impl.h>
-#include <google/protobuf/text_format.h>
#include <fstream>
#include <string>
#include <vector>
#include "caps.h"
#include "cmdline.h"
-#include "config.h"
#include "config.pb.h"
#include "logs.h"
#include "macros.h"
@@ -78,28 +79,26 @@ static bool configParseInternal(nsjconf_t* nsjconf, const nsjail::NsJailConfig&
nsjconf->mode = MODE_STANDALONE_EXECVE;
break;
default:
- LOG_E("Uknown running mode: %d", njc.mode());
+ LOG_E("Unknown running mode: %d", njc.mode());
return false;
}
- if (njc.has_chroot_dir()) {
- nsjconf->chroot = njc.chroot_dir();
- }
- nsjconf->is_root_rw = njc.is_root_rw();
nsjconf->hostname = njc.hostname();
nsjconf->cwd = njc.cwd();
nsjconf->port = njc.port();
nsjconf->bindhost = njc.bindhost();
+ nsjconf->max_conns = njc.max_conns();
nsjconf->max_conns_per_ip = njc.max_conns_per_ip();
nsjconf->tlimit = njc.time_limit();
nsjconf->max_cpus = njc.max_cpus();
nsjconf->daemonize = njc.daemon();
if (njc.has_log_fd()) {
- logs::logFile(std::string("/dev/fd/") + std::to_string(njc.log_fd()));
+ logs::logFile("", njc.log_fd());
}
if (njc.has_log_file()) {
- logs::logFile(njc.log_file());
+ logs::logFile(njc.log_file(), STDERR_FILENO);
}
+
if (njc.has_log_level()) {
switch (njc.log_level()) {
case nsjail::LogLevel::DEBUG:
@@ -159,6 +158,14 @@ static bool configParseInternal(nsjconf_t* nsjconf, const nsjail::NsJailConfig&
nsjconf->rl_nproc = configRLimit(RLIMIT_NPROC, njc.rlimit_nproc_type(), njc.rlimit_nproc());
nsjconf->rl_stack = configRLimit(
RLIMIT_STACK, njc.rlimit_stack_type(), njc.rlimit_stack(), 1024UL * 1024UL);
+ nsjconf->rl_mlock =
+ configRLimit(RLIMIT_MEMLOCK, njc.rlimit_memlock_type(), njc.rlimit_memlock(), 1024UL);
+ nsjconf->rl_rtpr =
+ configRLimit(RLIMIT_RTPRIO, njc.rlimit_rtprio_type(), njc.rlimit_rtprio());
+ nsjconf->rl_msgq =
+ configRLimit(RLIMIT_MSGQUEUE, njc.rlimit_msgqueue_type(), njc.rlimit_msgqueue());
+
+ nsjconf->disable_rl = njc.disable_rl();
if (njc.persona_addr_compat_layout()) {
nsjconf->personality |= ADDR_COMPAT_LAYOUT;
@@ -183,6 +190,9 @@ static bool configParseInternal(nsjconf_t* nsjconf, const nsjail::NsJailConfig&
nsjconf->clone_newipc = njc.clone_newipc();
nsjconf->clone_newuts = njc.clone_newuts();
nsjconf->clone_newcgroup = njc.clone_newcgroup();
+ nsjconf->clone_newtime = njc.clone_newtime();
+
+ nsjconf->no_pivotroot = njc.no_pivotroot();
for (ssize_t i = 0; i < njc.uidmap_size(); i++) {
if (!user::parseId(nsjconf, njc.uidmap(i).inside_id(), njc.uidmap(i).outside_id(),
@@ -238,6 +248,7 @@ static bool configParseInternal(nsjconf_t* nsjconf, const nsjail::NsJailConfig&
nsjconf->kafel_string += '\n';
}
nsjconf->seccomp_log = njc.seccomp_log();
+ nsjconf->nice_level = njc.nice_level();
nsjconf->cgroup_mem_max = njc.cgroup_mem_max();
nsjconf->cgroup_mem_mount = njc.cgroup_mem_mount();
@@ -251,6 +262,8 @@ static bool configParseInternal(nsjconf_t* nsjconf, const nsjail::NsJailConfig&
nsjconf->cgroup_cpu_ms_per_sec = njc.cgroup_cpu_ms_per_sec();
nsjconf->cgroup_cpu_mount = njc.cgroup_cpu_mount();
nsjconf->cgroup_cpu_parent = njc.cgroup_cpu_parent();
+ nsjconf->cgroupv2_mount = njc.cgroupv2_mount();
+ nsjconf->use_cgroupv2 = njc.use_cgroupv2();
nsjconf->iface_lo = !(njc.iface_no_lo());
for (ssize_t i = 0; i < njc.iface_own().size(); i++) {
@@ -263,10 +276,13 @@ static bool configParseInternal(nsjconf_t* nsjconf, const nsjail::NsJailConfig&
nsjconf->iface_vs_nm = njc.macvlan_vs_nm();
nsjconf->iface_vs_gw = njc.macvlan_vs_gw();
nsjconf->iface_vs_ma = njc.macvlan_vs_ma();
+ nsjconf->iface_vs_mo = njc.macvlan_vs_mo();
if (njc.has_exec_bin()) {
- nsjconf->exec_file = njc.exec_bin().path();
- nsjconf->argv.push_back(njc.exec_bin().path());
+ if (njc.exec_bin().has_path()) {
+ nsjconf->exec_file = njc.exec_bin().path();
+ nsjconf->argv.push_back(njc.exec_bin().path());
+ }
for (ssize_t i = 0; i < njc.exec_bin().arg().size(); i++) {
nsjconf->argv.push_back(njc.exec_bin().arg(i));
}
@@ -287,7 +303,7 @@ static void LogHandler(
bool parseFile(nsjconf_t* nsjconf, const char* file) {
LOG_D("Parsing configuration from '%s'", file);
- int fd = open(file, O_RDONLY | O_CLOEXEC);
+ int fd = TEMP_FAILURE_RETRY(open(file, O_RDONLY | O_CLOEXEC));
if (fd == -1) {
PLOG_W("Couldn't open config file '%s'", file);
return false;
diff --git a/config.proto b/config.proto
index 3988543..96a2b10 100644
--- a/config.proto
+++ b/config.proto
@@ -4,14 +4,14 @@ package nsjail;
enum Mode {
LISTEN = 0; /* Listening on a TCP port */
- ONCE = 1; /* Running the command once only */
- RERUN = 2; /* Re-executing the command (forever) */
+ ONCE = 1; /* Running the command once only */
+ RERUN = 2; /* Re-executing the command (forever) */
EXECVE = 3; /* Executing command w/o the supervisor */
}
/* Should be self explanatory */
enum LogLevel {
- DEBUG = 0; /* Equivalent to the '-v' cmd-line option */
- INFO = 1; /* Default level */
+ DEBUG = 0; /* Equivalent to the '-v' cmd-line option */
+ INFO = 1; /* Default level */
WARNING = 2; /* Equivalent to the '-q' cmd-line option */
ERROR = 3;
FATAL = 4;
@@ -28,13 +28,13 @@ message IdMap {
message MountPt {
/* Can be skipped for filesystems like 'proc' */
optional string src = 1 [default = ""];
- /* Should 'src' path be prefixed with this envvar? */
+ /* Should 'src' path be prefixed with this envar? */
optional string prefix_src_env = 2 [default = ""];
/* If specified, contains buffer that will be written to the dst file */
optional bytes src_content = 3 [default = ""];
/* Mount point inside jail */
required string dst = 4 [default = ""];
- /* Should 'dst' path be prefixed with this envvar? */
+ /* Should 'dst' path be prefixed with this envar? */
optional string prefix_dst_env = 5 [default = ""];
/* Can be empty for mount --bind mounts */
optional string fstype = 6 [default = ""];
@@ -81,163 +81,185 @@ message NsJailConfig {
/* Execution mode: see 'msg Mode' description for more */
optional Mode mode = 3 [default = ONCE];
- /* Equivalent to a bind mount with dst='/'. DEPRECATED: Use bind mounts. */
- optional string chroot_dir = 4 [deprecated = true];
- /* Applies both to the chroot_dir and to /proc mounts. DEPRECATED: Use bind mounts */
- optional bool is_root_rw = 5 [default = false, deprecated = true];
/* Hostname inside jail */
- optional string hostname = 8 [default = "NSJAIL"];
+ optional string hostname = 4 [default = "NSJAIL"];
/* Initial current working directory for the binary */
- optional string cwd = 9 [default = "/"];
+ optional string cwd = 5 [default = "/"];
+
+ /* Defines whether to use switch_root or pivot_root */
+ optional bool no_pivotroot = 6 [default = false];
/* TCP port to listen to. Valid with mode=LISTEN only */
- optional uint32 port = 10 [default = 0];
+ optional uint32 port = 7 [default = 0];
/* Host to bind to for mode=LISTEN. Must be in IPv6 format */
- optional string bindhost = 11 [default = "::"];
+ optional string bindhost = 8 [default = "::"];
+ /* For mode=LISTEN, maximum number of connections across all IPs */
+ optional uint32 max_conns = 9 [default = 0];
/* For mode=LISTEN, maximum number of connections from a single IP */
- optional uint32 max_conns_per_ip = 12 [default = 0];
+ optional uint32 max_conns_per_ip = 10 [default = 0];
/* Wall-time time limit for commands */
- optional uint32 time_limit = 13 [default = 600];
+ optional uint32 time_limit = 11 [default = 600];
/* Should nsjail go into background? */
- optional bool daemon = 14 [default = false];
+ optional bool daemon = 12 [default = false];
/* Maximum number of CPUs to use: 0 - no limit */
- optional uint32 max_cpus = 15 [default = 0];
+ optional uint32 max_cpus = 13 [default = 0];
/* FD to log to. */
- optional int32 log_fd = 16;
- /* File to save lofs to */
- optional string log_file = 17;
+ optional int32 log_fd = 14;
+ /* File to save logs to. */
+ optional string log_file = 15;
/* Minimum log level displayed.
See 'msg LogLevel' description for more */
- optional LogLevel log_level = 18;
+ optional LogLevel log_level = 16;
/* Should the current environment variables be kept
when executing the binary */
- optional bool keep_env = 19 [default = false];
- /* EnvVars to be set before executing binaries. If the envvar doesn't contain '='
- (e.g. just the 'DISPLAY' string), the current envvar value will be used */
- repeated string envar = 20;
+ optional bool keep_env = 17 [default = false];
+ /* EnvVars to be set before executing binaries. If the envar doesn't contain '='
+ (e.g. just the 'DISPLAY' string), the current envar value will be used */
+ repeated string envar = 18;
/* Should capabilities be preserved or dropped */
- optional bool keep_caps = 21 [default = false];
+ optional bool keep_caps = 19 [default = false];
/* Which capabilities should be preserved if keep_caps == false.
Format: "CAP_SYS_PTRACE" */
- repeated string cap = 22;
+ repeated string cap = 20;
/* Should nsjail close FD=0,1,2 before executing the process */
- optional bool silent = 23 [default = false];
+ optional bool silent = 21 [default = false];
/* Should the child process have control over terminal?
Can be useful to allow /bin/sh to provide
job control / signals. Dangerous, can be used to put
characters into the controlling terminal back */
- optional bool skip_setsid = 24 [default = false];
+ optional bool skip_setsid = 22 [default = false];
/* Redirect stderr of the process to /dev/null instead of the socket or original TTY */
- optional bool stderr_to_null = 25 [default = false];
+ optional bool stderr_to_null = 23 [default = false];
/* Which FDs should be passed to the newly executed process
By default only FD=0,1,2 are passed */
- repeated int32 pass_fd = 26;
+ repeated int32 pass_fd = 24;
/* Setting it to true will allow to have set-uid binaries
inside the jail */
- optional bool disable_no_new_privs = 27 [default = false];
+ optional bool disable_no_new_privs = 25 [default = false];
/* Various rlimits, the rlimit_as/rlimit_core/... are used only if
rlimit_as_type/rlimit_core_type/... are set to RLimit::VALUE */
- optional uint64 rlimit_as = 28 [default = 512]; /* In MiB */
- optional RLimit rlimit_as_type = 29 [default = VALUE];
- optional uint64 rlimit_core = 30 [default = 0]; /* In MiB */
- optional RLimit rlimit_core_type = 31 [default = VALUE];
- optional uint64 rlimit_cpu = 32 [default = 600]; /* In seconds */
- optional RLimit rlimit_cpu_type = 33 [default = VALUE];
- optional uint64 rlimit_fsize = 34 [default = 1]; /* In MiB */
- optional RLimit rlimit_fsize_type = 35 [default = VALUE];
- optional uint64 rlimit_nofile = 36 [default = 32];
- optional RLimit rlimit_nofile_type = 37 [default = VALUE];
+ optional uint64 rlimit_as = 26 [default = 4096]; /* In MiB */
+ optional RLimit rlimit_as_type = 27 [default = VALUE];
+ optional uint64 rlimit_core = 28 [default = 0]; /* In MiB */
+ optional RLimit rlimit_core_type = 29 [default = VALUE];
+ optional uint64 rlimit_cpu = 30 [default = 600]; /* In seconds */
+ optional RLimit rlimit_cpu_type = 31 [default = VALUE];
+ optional uint64 rlimit_fsize = 32 [default = 1]; /* In MiB */
+ optional RLimit rlimit_fsize_type = 33 [default = VALUE];
+ optional uint64 rlimit_nofile = 34 [default = 32];
+ optional RLimit rlimit_nofile_type = 35 [default = VALUE];
/* RLIMIT_NPROC is system-wide - tricky to use; use the soft limit value by
* default here */
- optional uint64 rlimit_nproc = 38 [default = 1024];
- optional RLimit rlimit_nproc_type = 39 [default = SOFT];
+ optional uint64 rlimit_nproc = 36 [default = 1024];
+ optional RLimit rlimit_nproc_type = 37 [default = SOFT];
/* In MiB, use the soft limit value by default */
- optional uint64 rlimit_stack = 40 [default = 1048576];
- optional RLimit rlimit_stack_type = 41 [default = SOFT];
+ optional uint64 rlimit_stack = 38 [default = 8];
+ optional RLimit rlimit_stack_type = 39 [default = SOFT];
+ /* In KB, use the soft limit value by default */
+ optional uint64 rlimit_memlock = 40 [default = 64];
+ optional RLimit rlimit_memlock_type = 41 [default = SOFT];
+ optional uint64 rlimit_rtprio = 42 [default = 0];
+ optional RLimit rlimit_rtprio_type = 43 [default = SOFT];
+ optional uint64 rlimit_msgqueue = 44 [default = 1024]; /* In bytes */
+ optional RLimit rlimit_msgqueue_type = 45 [default = SOFT];
+
+ /* Disable all rlimits, default to limits set by parent */
+ optional bool disable_rl = 46 [default = false];
/* See 'man personality' for more */
- optional bool persona_addr_compat_layout = 42 [default = false];
- optional bool persona_mmap_page_zero = 43 [default = false];
- optional bool persona_read_implies_exec = 44 [default = false];
- optional bool persona_addr_limit_3gb = 45 [default = false];
- optional bool persona_addr_no_randomize = 46 [default = false];
+ optional bool persona_addr_compat_layout = 47 [default = false];
+ optional bool persona_mmap_page_zero = 48 [default = false];
+ optional bool persona_read_implies_exec = 49 [default = false];
+ optional bool persona_addr_limit_3gb = 50 [default = false];
+ optional bool persona_addr_no_randomize = 51 [default = false];
/* Which name-spaces should be used? */
- optional bool clone_newnet = 47 [default = true];
- optional bool clone_newuser = 48 [default = true];
- optional bool clone_newns = 49 [default = true];
- optional bool clone_newpid = 50 [default = true];
- optional bool clone_newipc = 51 [default = true];
- optional bool clone_newuts = 52 [default = true];
+ optional bool clone_newnet = 52 [default = true];
+ optional bool clone_newuser = 53 [default = true];
+ optional bool clone_newns = 54 [default = true];
+ optional bool clone_newpid = 55 [default = true];
+ optional bool clone_newipc = 56 [default = true];
+ optional bool clone_newuts = 57 [default = true];
/* Disable for kernel versions < 4.6 as it's not supported there */
- optional bool clone_newcgroup = 53 [default = true];
+ optional bool clone_newcgroup = 58 [default = true];
+ /* Supported with kernel versions >= 5.6 */
+ optional bool clone_newtime = 59 [default = false];
/* Mappings for UIDs and GIDs. See the description for 'msg IdMap'
for more */
- repeated IdMap uidmap = 54;
- repeated IdMap gidmap = 55;
+ repeated IdMap uidmap = 60;
+ repeated IdMap gidmap = 61;
/* Should /proc be mounted (R/O)? This can also be added in the 'mount'
section below */
- optional bool mount_proc = 56 [default = false];
+ optional bool mount_proc = 62 [default = false];
/* Mount points inside the jail. See the description for 'msg MountPt'
for more */
- repeated MountPt mount = 57;
+ repeated MountPt mount = 63;
/* Kafel seccomp-bpf policy file or a string:
Homepage of the project: https://github.com/google/kafel */
- optional string seccomp_policy_file = 58;
- repeated string seccomp_string = 59;
+ optional string seccomp_policy_file = 64;
+ repeated string seccomp_string = 65;
/* Setting it to true makes audit write seccomp logs to dmesg */
- optional bool seccomp_log = 60 [default = false];
+ optional bool seccomp_log = 66 [default = false];
/* If > 0, maximum cumulative size of RAM used inside any jail */
- optional uint64 cgroup_mem_max = 61 [default = 0]; /* In MiB */
+ optional uint64 cgroup_mem_max = 67 [default = 0]; /* In bytes */
/* Mount point for cgroups-memory in your system */
- optional string cgroup_mem_mount = 62 [default = "/sys/fs/cgroup/memory"];
+ optional string cgroup_mem_mount = 68 [default = "/sys/fs/cgroup/memory"];
/* Writeable directory (for the nsjail user) under cgroup_mem_mount */
- optional string cgroup_mem_parent = 63 [default = "NSJAIL"];
+ optional string cgroup_mem_parent = 69 [default = "NSJAIL"];
/* If > 0, maximum number of PIDs (threads/processes) inside jail */
- optional uint64 cgroup_pids_max = 64 [default = 0];
+ optional uint64 cgroup_pids_max = 70 [default = 0];
/* Mount point for cgroups-pids in your system */
- optional string cgroup_pids_mount = 65 [default = "/sys/fs/cgroup/pids"];
+ optional string cgroup_pids_mount = 71 [default = "/sys/fs/cgroup/pids"];
/* Writeable directory (for the nsjail user) under cgroup_pids_mount */
- optional string cgroup_pids_parent = 66 [default = "NSJAIL"];
+ optional string cgroup_pids_parent = 72 [default = "NSJAIL"];
/* If > 0, Class identifier of network packets inside jail */
- optional uint32 cgroup_net_cls_classid = 67 [default = 0];
+ optional uint32 cgroup_net_cls_classid = 73 [default = 0];
/* Mount point for cgroups-net-cls in your system */
- optional string cgroup_net_cls_mount = 68 [default = "/sys/fs/cgroup/net_cls"];
+ optional string cgroup_net_cls_mount = 74 [default = "/sys/fs/cgroup/net_cls"];
/* Writeable directory (for the nsjail user) under cgroup_net_cls_mount */
- optional string cgroup_net_cls_parent = 69 [default = "NSJAIL"];
+ optional string cgroup_net_cls_parent = 75 [default = "NSJAIL"];
/* If > 0, number of milliseconds of CPU time per second that jailed processes can use */
- optional uint32 cgroup_cpu_ms_per_sec = 70 [default = 0];
+ optional uint32 cgroup_cpu_ms_per_sec = 76 [default = 0];
/* Mount point for cgroups-cpu in your system */
- optional string cgroup_cpu_mount = 71 [default = "/sys/fs/cgroup/cpu"];
+ optional string cgroup_cpu_mount = 77 [default = "/sys/fs/cgroup/cpu"];
/* Writeable directory (for the nsjail user) under cgroup_cpu_mount */
- optional string cgroup_cpu_parent = 72 [default = "NSJAIL"];
+ optional string cgroup_cpu_parent = 78 [default = "NSJAIL"];
+
+ /* Mount point for cgroup v2 in your system */
+ optional string cgroupv2_mount = 79 [default = "/sys/fs/cgroup"];
+ /* Use cgroup v2 */
+ optional bool use_cgroupv2 = 80 [default = false];
/* If true, the 'lo' interface will NOT be brought up (active) inside this jail */
- optional bool iface_no_lo = 73 [default = false];
+ optional bool iface_no_lo = 81 [default = false];
/* Put this interface inside the jail */
- repeated string iface_own = 74;
+ repeated string iface_own = 82;
/* Parameters for the cloned MACVLAN interface inside jail */
- optional string macvlan_iface = 75; /* Interface to be cloned, eg 'eth0' */
- optional string macvlan_vs_ip = 76 [default = "192.168.0.2"];
- optional string macvlan_vs_nm = 77 [default = "255.255.255.0"];
- optional string macvlan_vs_gw = 78 [default = "192.168.0.1"];
- optional string macvlan_vs_ma = 79 [default = ""];
+ optional string macvlan_iface = 83; /* Interface to be cloned, eg 'eth0' */
+ optional string macvlan_vs_ip = 84 [default = "192.168.0.2"];
+ optional string macvlan_vs_nm = 85 [default = "255.255.255.0"];
+ optional string macvlan_vs_gw = 86 [default = "192.168.0.1"];
+ optional string macvlan_vs_ma = 87 [default = ""];
+ optional string macvlan_vs_mo = 88 [default = "private"];
+
+ /* Niceness level of the jailed process */
+ optional int32 nice_level = 89 [default = 19];
/* Binary path (with arguments) to be executed. If not specified here, it
can be specified with cmd-line as "-- /path/to/command arg1 arg2" */
- optional Exe exec_bin = 80;
+ optional Exe exec_bin = 90;
}
diff --git a/configs/apache.cfg b/configs/apache.cfg
index f3ae838..a1f2ff6 100644
--- a/configs/apache.cfg
+++ b/configs/apache.cfg
@@ -1,4 +1,7 @@
+# Example config for nsjail
+
name: "apache-with-cloned-net"
+
description: "Tested under Ubuntu 17.04. Other Linux distros might "
description: "use different locations for the Apache's HTTPD configuration "
description: "files and system libraries"
diff --git a/configs/bash-with-fake-geteuid.cfg b/configs/bash-with-fake-geteuid.cfg
index c0046ba..99a36af 100644
--- a/configs/bash-with-fake-geteuid.cfg
+++ b/configs/bash-with-fake-geteuid.cfg
@@ -1,4 +1,7 @@
+# Example config for nsjail
+
name: "bash-with-fake-geteuid"
+
description: "An example/demo policy which allows to execute /bin/bash and other commands in "
description: "a fairly restricted jail containing only some directories from the main "
description: "system, and with blocked __NR_syslog syscall. Also, __NR_geteuid returns -1337 "
diff --git a/configs/demo-dont-use-chrome-with-net.cfg b/configs/demo-dont-use-chrome-with-net.cfg
index 690657e..c6c6a5f 100644
--- a/configs/demo-dont-use-chrome-with-net.cfg
+++ b/configs/demo-dont-use-chrome-with-net.cfg
@@ -1,3 +1,5 @@
+# Example config for nsjail
+
name: "chrome-with-net"
description: "Don't use for anything serious - this is just a demo policy. See notes"
diff --git a/configs/firefox-with-cloned-net.cfg b/configs/firefox-with-cloned-net.cfg
index eb541e3..180ed9a 100644
--- a/configs/firefox-with-cloned-net.cfg
+++ b/configs/firefox-with-cloned-net.cfg
@@ -1,3 +1,5 @@
+# Example config for nsjail
+
name: "firefox-with-cloned-net"
description: "This policy allows to run firefox inside a jail on a separate eth interface."
@@ -30,6 +32,8 @@ time_limit: 0
envar: "HOME=/user"
envar: "DISPLAY"
envar: "TMP=/tmp"
+envar: "FONTCONFIG_FILE=/etc/fonts/fonts.conf"
+envar: "FC_CONFIG_FILE=/etc/fonts/fonts.conf"
rlimit_as: 4096
rlimit_cpu: 1000
diff --git a/configs/firefox-with-net-wayland.cfg b/configs/firefox-with-net-wayland.cfg
new file mode 100644
index 0000000..b132018
--- /dev/null
+++ b/configs/firefox-with-net-wayland.cfg
@@ -0,0 +1,175 @@
+# Example config for nsjail
+
+name: "firefox-with-net"
+
+description: "This policy allows to run firefox inside a jail. Access to networking is"
+description: "permitted with this setup (clone_newnet: false)."
+description: ""
+description: "The only permitted home directory is $HOME/.mozilla and $HOME/Documents."
+description: "The rest of the files/dirs available on the FS are libs and X-related files/dirs."
+description: ""
+description: "Run as:"
+description: ""
+description: "./nsjail --config configs/firefox-with-net-wayland.cfg"
+description: ""
+description: "You can then go to https://uploadfiles.io/ and try to upload a file in order"
+description: "to see how your local directory (also, all system directories) look like."
+
+mode: ONCE
+hostname: "FIREFOX"
+cwd: "/user"
+
+time_limit: 0
+
+clone_newnet: false
+
+envar: "HOME=/user"
+envar: "TMP=/tmp"
+envar: "FONTCONFIG_FILE=/etc/fonts/fonts.conf"
+envar: "FC_CONFIG_FILE=/etc/fonts/fonts.conf"
+envar: "MOZ_ENABLE_WAYLAND=1"
+envar: "XDG_RUNTIME_DIR=/user/run/"
+envar: "WAYLAND_DISPLAY=wayland-0"
+
+rlimit_as: 4096
+rlimit_cpu: 1000
+rlimit_fsize: 1024
+rlimit_nofile: 512
+
+uidmap {
+ inside_id: "9999999"
+}
+
+gidmap {
+ inside_id: "9999999"
+}
+
+mount {
+ dst: "/proc"
+ fstype: "proc"
+ rw: true
+}
+
+mount {
+ src: "/lib"
+ dst: "/lib"
+ is_bind: true
+}
+
+mount {
+ src: "/usr/lib"
+ dst: "/usr/lib"
+ is_bind: true
+}
+
+mount {
+ src: "/lib64"
+ dst: "/lib64"
+ is_bind: true
+ mandatory: false
+}
+
+mount {
+ src: "/lib32"
+ dst: "/lib32"
+ is_bind: true
+ mandatory: false
+}
+
+mount {
+ src: "/usr/lib/firefox"
+ dst: "/usr/lib/firefox"
+ is_bind: true
+}
+
+mount {
+ src: "/usr/bin/firefox"
+ dst: "/usr/bin/firefox"
+ is_bind: true
+}
+
+mount {
+ src: "/usr/share"
+ dst: "/usr/share"
+ is_bind: true
+}
+
+mount {
+ src_content: "<?xml version=\"1.0\"?>\n<!DOCTYPE fontconfig SYSTEM \"fonts.dtd\">\n<fontconfig><dir>/usr/share/fonts</dir><cachedir>/tmp/fontconfig</cachedir></fontconfig>"
+ dst: "/etc/fonts/fonts.conf"
+}
+
+mount {
+ src: "/dev/urandom"
+ dst: "/dev/urandom"
+ is_bind: true
+ rw: true
+}
+
+mount {
+ src: "/dev/null"
+ dst: "/dev/null"
+ is_bind: true
+ rw: true
+}
+
+mount {
+ src_content: "nameserver 8.8.8.8"
+ dst: "/etc/resolv.conf"
+}
+
+mount {
+ dst: "/tmp"
+ fstype: "tmpfs"
+ rw: true
+ is_bind: false
+}
+
+mount {
+ dst: "/dev/shm"
+ fstype: "tmpfs"
+ rw: true
+ is_bind: false
+}
+
+mount {
+ dst: "/user"
+ fstype: "tmpfs"
+ rw: true
+}
+
+mount {
+ prefix_src_env: "HOME"
+ src: "/Documents"
+ dst: "/user/Documents"
+ rw: true
+ is_bind: true
+ mandatory: false
+}
+
+mount {
+ prefix_src_env: "HOME"
+ src: "/.mozilla"
+ dst: "/user/.mozilla"
+ is_bind: true
+ rw: true
+ mandatory: false
+}
+
+mount {
+ src: "/tmp/.X11-unix/X0"
+ dst: "/tmp/.X11-unix/X0"
+ is_bind: true
+}
+
+mount {
+ # Change it to your user id
+ src: "/run/user/1000/wayland-0"
+ dst: "/user/run/wayland-0"
+ is_bind: true
+ rw: true
+}
+
+exec_bin {
+ path: "/usr/lib/firefox/firefox"
+}
diff --git a/configs/firefox-with-net.cfg b/configs/firefox-with-net.cfg
index 190f7c2..b88f8ea 100644
--- a/configs/firefox-with-net.cfg
+++ b/configs/firefox-with-net.cfg
@@ -1,3 +1,5 @@
+# Example config for nsjail
+
name: "firefox-with-net"
description: "This policy allows to run firefox inside a jail. Access to networking is"
@@ -24,6 +26,8 @@ clone_newnet: false
envar: "HOME=/user"
envar: "DISPLAY"
envar: "TMP=/tmp"
+envar: "FONTCONFIG_FILE=/etc/fonts/fonts.conf"
+envar: "FC_CONFIG_FILE=/etc/fonts/fonts.conf"
rlimit_as: 4096
rlimit_cpu: 1000
diff --git a/configs/home-documents-with-xorg-no-net.cfg b/configs/home-documents-with-xorg-no-net.cfg
index cc2514f..83cfb42 100644
--- a/configs/home-documents-with-xorg-no-net.cfg
+++ b/configs/home-documents-with-xorg-no-net.cfg
@@ -1,3 +1,5 @@
+# Example config for nsjail
+
name: "documents-with-xorg"
description: "This policy allows to run many X-org based tool, which are allowed"
@@ -121,6 +123,20 @@ mount {
}
mount {
+ src: "/dev/random"
+ dst: "/dev/random"
+ is_bind: true
+ rw: true
+}
+
+mount {
+ src: "/dev/urandom"
+ dst: "/dev/urandom"
+ is_bind: true
+ rw: true
+}
+
+mount {
src: "/etc/passwd"
dst: "/etc/passwd"
is_bind: true
diff --git a/configs/imagemagick-convert.cfg b/configs/imagemagick-convert.cfg
index dfe702d..479b293 100644
--- a/configs/imagemagick-convert.cfg
+++ b/configs/imagemagick-convert.cfg
@@ -1,3 +1,5 @@
+# Example config for nsjail
+
name: "imagemagick-convert"
description: "This policy allows to run ImageMagick's convert inside a jail."
@@ -5,8 +7,9 @@ description: "Your $HOME's Documents will be mapped as /user/Documents"
description: ""
description: "Run as:"
description: ""
-description: "./nsjail --config imagemagick-convert.cfg -- /usr/bin/convert \\"
-description: " jpg:/user/Documents/input.jpg png:/user/Documents/output.png"
+description: "./nsjail --config imagemagick-convert.cfg -- /usr/bin/convert jpg:/user/Documents/input.jpg png:/user/Documents/output.png "
+description: "or "
+description: "./nsjail --config imagemagick-convert.cfg -- /usr/bin/convert jpg:- png:- <file.jpg >file.png"
mode: ONCE
hostname: "IM-CONVERT"
@@ -78,11 +81,12 @@ seccomp_string: " getpid, execveat, getdents, unlink, fchmod,"
seccomp_string: " getrlimit, getrusage, sysinfo, times, futex,"
seccomp_string: " arch_prctl, sched_getaffinity, set_tid_address,"
seccomp_string: " clock_gettime, set_robust_list, exit_group,"
-seccomp_string: " clone, getcwd, pread64, readlink, prlimit64"
+seccomp_string: " clone, getcwd, pread64, readlink, prlimit64, madvise"
seccomp_string: "}"
seccomp_string: "DEFAULT KILL"
exec_bin {
- path: "/usr/bin/convert"
+ path: ""
+ arg0: "/usr/bin/convert"
exec_fd: true
}
diff --git a/configs/static-busybox-with-execveat.cfg b/configs/static-busybox-with-execveat.cfg
index 0d0a49e..ddfe01c 100644
--- a/configs/static-busybox-with-execveat.cfg
+++ b/configs/static-busybox-with-execveat.cfg
@@ -1,3 +1,5 @@
+# Example config for nsjail
+
name: "static-busybox-with-execveat"
description: "An example/demo policy which allows to execute /bin/busybox-static in an "
description: "empty (only /proc) mount namespace which doesn't even include busybox itself"
diff --git a/configs/tomcat8.cfg b/configs/tomcat8.cfg
new file mode 100644
index 0000000..30af214
--- /dev/null
+++ b/configs/tomcat8.cfg
@@ -0,0 +1,135 @@
+# Example config for nsjail
+
+name: "tomcat8"
+
+description: "Tested under Ubuntu 16.04 with tomcat8=8.0.32-1ubuntu1.9,"
+description: "libnl-route-3-200=3.2.27-1ubuntu0.16.04.1,"
+description: "libprotobuf9v5=2.6.1-1.3,"
+description: "openjdk-8-jre=8u191-b12-2ubuntu0.16.04.1. "
+description: "Run as: sudo ./nsjail --config configs/tomcat8.cfg"
+
+mode: ONCE
+hostname: "TOMCAT-NSJ"
+
+envar: "JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre"
+envar: "JVM_TMP=/tmp"
+envar: "CATALINA_TMPDIR=/tmp"
+envar: "CATALINA_HOME=/usr/share/tomcat8"
+envar: "CATALINA_BASE=/var/lib/tomcat8"
+envar: "CATALINA_OPTS=-server -XX:+UseParallelGC"
+envar: "JAVA_OPTS=-Djava.awt.headless=true -Djava.net.preferIPv4Stack=true -Xms256M -Xmx512M -Djava.security.egd=file:/dev/./urandom"
+
+rlimit_as: 2048
+rlimit_fsize: 1024
+rlimit_cpu_type: INF
+rlimit_nofile: 1024
+
+time_limit: 0
+
+cap: "CAP_NET_BIND_SERVICE"
+
+uidmap {
+ inside_id: "tomcat8"
+ outside_id: "tomcat8"
+}
+
+gidmap {
+ inside_id: "tomcat8"
+ outside_id: "tomcat8"
+}
+
+mount_proc: false
+
+mount {
+ src: "/etc/tomcat8"
+ dst: "/etc/tomcat8"
+ is_bind: true
+ rw: false
+}
+
+mount {
+ src: "/var/lib/tomcat8"
+ dst: "/var/lib/tomcat8"
+ is_bind: true
+ rw: true
+}
+
+mount {
+ src: "/var/log/tomcat8"
+ dst: "/var/log/tomcat8"
+ is_bind: true
+ rw: true
+}
+
+mount {
+ src: "/var/cache/tomcat8"
+ dst: "/var/cache/tomcat8"
+ is_bind: true
+ rw: true
+}
+
+mount {
+ src: "/usr/share/tomcat8"
+ dst: "/usr/share/tomcat8"
+ is_bind: true
+ rw: false
+}
+
+mount {
+ src: "/bin"
+ dst: "/bin"
+ is_bind: true
+ rw: false
+}
+
+mount {
+ src: "/lib"
+ dst: "/lib"
+ is_bind: true
+ rw: false
+}
+
+mount {
+ src: "/lib64"
+ dst: "/lib64"
+ is_bind: true
+ rw: false
+}
+
+mount {
+ src: "/usr/bin"
+ dst: "/usr/bin"
+ is_bind: true
+ rw: false
+}
+
+mount {
+ src: "/usr/lib"
+ dst: "/usr/lib"
+ is_bind: true
+ rw: false
+}
+
+mount {
+ src: "/usr/share/java"
+ dst: "/usr/share/java"
+ is_bind: true
+ rw: false
+}
+
+mount {
+ dst: "/tmp"
+ fstype: "tmpfs"
+ rw: true
+}
+
+mount {
+ dst: "/proc"
+ fstype: "proc"
+ rw: false
+}
+
+exec_bin {
+ path: "/usr/share/tomcat8/bin/catalina.sh"
+ arg : "run"
+}
diff --git a/configs/xchat-with-net.cfg b/configs/xchat-with-net.cfg
index e8d2759..04c361b 100644
--- a/configs/xchat-with-net.cfg
+++ b/configs/xchat-with-net.cfg
@@ -1,3 +1,5 @@
+# Example config for nsjail
+
name: "xchat-with-net"
description: "This policy allows to run xchat inside a jail. Access to networking is"
@@ -7,11 +9,12 @@ description: "The only permitted home directory is $HOME/.xchat2 and $HOME/Docum
description: "The rest of available on the FS files/dires are libs and X-related files/dirs."
description: ""
description: "Run as:"
-description: "./nsjail --config configs/xchat-with-net.cfg --daemon -l /tmp/xchat.log"
+description: "./nsjail --config configs/xchat-with-net.cfg"
mode: ONCE
hostname: "XCHAT"
cwd: "/user"
+daemon: true
time_limit: 0
diff --git a/configs/znc-with-net.cfg b/configs/znc-with-net.cfg
new file mode 100644
index 0000000..bdcc53e
--- /dev/null
+++ b/configs/znc-with-net.cfg
@@ -0,0 +1,136 @@
+# Example config for nsjail
+
+name: "znc-with-net"
+
+description: "This policy allows running znc inside a jail. "
+description: "Networking is permitted with this setup (clone_newnet: false). "
+description: ""
+description: "The only permitted home directory is $HOME/.znc."
+description: ""
+description: "Run as: nsjail --config configs/znc-with-net.cfg"
+
+mode: ONCE
+hostname: "ZNC"
+cwd: "/home/znc"
+daemon: true
+
+time_limit: 0
+
+envar: "HOME=/home/znc"
+envar: "TMP=/tmp"
+
+log_fd: 2
+
+rlimit_as: 4096
+rlimit_cpu_type: INF
+rlimit_fsize: 4096
+rlimit_nofile: 128
+
+clone_newnet: false
+
+mount {
+ dst: "/proc"
+ fstype: "proc"
+}
+
+mount {
+ src: "/lib"
+ dst: "/lib"
+ is_bind: true
+}
+
+mount {
+ src: "/usr/lib"
+ dst: "/usr/lib"
+ is_bind: true
+}
+
+mount {
+ src: "/lib64"
+ dst: "/lib64"
+ is_bind: true
+ mandatory: false
+}
+
+mount {
+ src: "/lib32"
+ dst: "/lib32"
+ is_bind: true
+ mandatory: false
+}
+
+mount {
+ src: "/usr/share"
+ dst: "/usr/share"
+ is_bind: true
+}
+
+mount {
+ src: "/dev/urandom"
+ dst: "/dev/urandom"
+ is_bind: true
+ rw: true
+}
+
+mount {
+ src: "/dev/null"
+ dst: "/dev/null"
+ is_bind: true
+ rw: true
+}
+
+mount {
+ src: "/etc/resolv.conf"
+ dst: "/etc/resolv.conf"
+ is_bind: true
+ mandatory: false
+}
+
+mount {
+ src: "/etc/ssl"
+ dst: "/etc/ssl"
+ is_bind: true
+}
+
+mount {
+ dst: "/tmp"
+ fstype: "tmpfs"
+ rw: true
+ is_bind: false
+}
+
+mount {
+ dst: "/dev/shm"
+ fstype: "tmpfs"
+ rw: true
+ is_bind: false
+}
+
+mount {
+ dst: "/home/znc"
+ fstype: "tmpfs"
+ rw: true
+ is_bind: false
+}
+
+mount {
+ prefix_src_env: "HOME"
+ src: "/.znc"
+ dst: "/home/znc/.znc"
+ rw: true
+ is_bind: true
+ mandatory: true
+}
+
+seccomp_string: "KILL {"
+seccomp_string: " ptrace,"
+seccomp_string: " process_vm_readv,"
+seccomp_string: " process_vm_writev"
+seccomp_string: "}"
+seccomp_string: "DEFAULT ALLOW"
+
+exec_bin {
+ path: "/usr/bin/znc"
+ arg: "-f"
+ exec_fd: true
+}
diff --git a/contain.cc b/contain.cc
index 176f216..b5120cc 100644
--- a/contain.cc
+++ b/contain.cc
@@ -100,9 +100,10 @@ static bool containPrepareEnv(nsjconf_t* nsjconf) {
PLOG_E("personality(%lx)", nsjconf->personality);
return false;
}
+ LOG_D("setpriority(%d)", nsjconf->nice_level);
errno = 0;
- if (setpriority(PRIO_PROCESS, 0, 19) == -1 && errno != 0) {
- PLOG_W("setpriority(19)");
+ if (setpriority(PRIO_PROCESS, 0, nsjconf->nice_level) == -1 && errno != 0) {
+ PLOG_W("setpriority(%d)", nsjconf->nice_level);
}
if (!nsjconf->skip_setsid) {
setsid();
@@ -119,6 +120,10 @@ static bool containCPU(nsjconf_t* nsjconf) {
}
static bool containSetLimits(nsjconf_t* nsjconf) {
+ if (nsjconf->disable_rl) {
+ return true;
+ }
+
struct rlimit64 rl;
rl.rlim_cur = rl.rlim_max = nsjconf->rl_as;
if (setrlimit64(RLIMIT_AS, &rl) == -1) {
@@ -155,6 +160,21 @@ static bool containSetLimits(nsjconf_t* nsjconf) {
PLOG_E("setrlimit64(0, RLIMIT_STACK, %" PRIu64 ")", nsjconf->rl_stack);
return false;
}
+ rl.rlim_cur = rl.rlim_max = nsjconf->rl_mlock;
+ if (setrlimit64(RLIMIT_MEMLOCK, &rl) == -1) {
+ PLOG_E("setrlimit64(0, RLIMIT_MEMLOCK, %" PRIu64 ")", nsjconf->rl_mlock);
+ return false;
+ }
+ rl.rlim_cur = rl.rlim_max = nsjconf->rl_rtpr;
+ if (setrlimit64(RLIMIT_RTPRIO, &rl) == -1) {
+ PLOG_E("setrlimit64(0, RLIMIT_RTPRIO, %" PRIu64 ")", nsjconf->rl_rtpr);
+ return false;
+ }
+ rl.rlim_cur = rl.rlim_max = nsjconf->rl_msgq;
+ if (setrlimit64(RLIMIT_MSGQUEUE, &rl) == -1) {
+ PLOG_E("setrlimit64(0, RLIMIT_MSGQUEUE , %" PRIu64 ")", nsjconf->rl_msgq);
+ return false;
+ }
return true;
}
@@ -175,14 +195,14 @@ static bool containMakeFdsCOENaive(nsjconf_t* nsjconf) {
continue;
}
if (containPassFd(nsjconf, fd)) {
- LOG_D("FD=%d will be passed to the child process", fd);
+ LOG_D("fd=%d will be passed to the child process", fd);
if (TEMP_FAILURE_RETRY(fcntl(fd, F_SETFD, flags & ~(FD_CLOEXEC))) == -1) {
- PLOG_E("Could not set FD_CLOEXEC for FD=%d", fd);
+ PLOG_E("Could not set FD_CLOEXEC for fd=%d", fd);
return false;
}
} else {
if (TEMP_FAILURE_RETRY(fcntl(fd, F_SETFD, flags | FD_CLOEXEC)) == -1) {
- PLOG_E("Could not set FD_CLOEXEC for FD=%d", fd);
+ PLOG_E("Could not set FD_CLOEXEC for fd=%d", fd);
return false;
}
}
@@ -228,21 +248,21 @@ static bool containMakeFdsCOEProc(nsjconf_t* nsjconf) {
}
int flags = TEMP_FAILURE_RETRY(fcntl(fd, F_GETFD, 0));
if (flags == -1) {
- PLOG_D("fcntl(fd=%xld, F_GETFD, 0)", fd);
+ PLOG_D("fcntl(fd=%d, F_GETFD, 0)", fd);
closedir(dir);
return false;
}
if (containPassFd(nsjconf, fd)) {
- LOG_D("FD=%d will be passed to the child process", fd);
+ LOG_D("fd=%d will be passed to the child process", fd);
if (TEMP_FAILURE_RETRY(fcntl(fd, F_SETFD, flags & ~(FD_CLOEXEC))) == -1) {
- PLOG_E("Could not clear FD_CLOEXEC for FD=%d", fd);
+ PLOG_E("Could not clear FD_CLOEXEC for fd=%d", fd);
closedir(dir);
return false;
}
} else {
- LOG_D("FD=%d will be closed before execve()", fd);
+ LOG_D("fd=%d will be closed before execve()", fd);
if (TEMP_FAILURE_RETRY(fcntl(fd, F_SETFD, flags | FD_CLOEXEC)) == -1) {
- PLOG_E("Could not set FD_CLOEXEC for FD=%d", fd);
+ PLOG_E("Could not set FD_CLOEXEC for fd=%d", fd);
closedir(dir);
return false;
}
@@ -265,14 +285,14 @@ static bool containMakeFdsCOE(nsjconf_t* nsjconf) {
bool setupFD(nsjconf_t* nsjconf, int fd_in, int fd_out, int fd_err) {
if (nsjconf->stderr_to_null) {
- LOG_D("Redirecting FD=2 (STDERR_FILENO) to /dev/null");
+ LOG_D("Redirecting fd=2 (STDERR_FILENO) to /dev/null");
if ((fd_err = TEMP_FAILURE_RETRY(open("/dev/null", O_RDWR))) == -1) {
PLOG_E("open('/dev/null', O_RDWR");
return false;
}
}
if (nsjconf->is_silent) {
- LOG_D("Redirecting FD=0/1/2 (STDIN/OUT/ERR_FILENO) to /dev/null");
+ LOG_D("Redirecting fd=0-2 (STDIN/OUT/ERR_FILENO) to /dev/null");
if (TEMP_FAILURE_RETRY(fd_in = fd_out = fd_err = open("/dev/null", O_RDWR)) == -1) {
PLOG_E("open('/dev/null', O_RDWR)");
return false;
diff --git a/logs.cc b/logs.cc
index 70eca39..d377505 100644
--- a/logs.cc
+++ b/logs.cc
@@ -38,8 +38,6 @@
#include "macros.h"
#include "util.h"
-#include <string.h>
-
namespace logs {
static int _log_fd = STDERR_FILENO;
@@ -47,40 +45,51 @@ static bool _log_fd_isatty = true;
static enum llevel_t _log_level = INFO;
static bool _log_set = false;
-__attribute__((constructor)) static void log_init(void) {
- _log_fd = fcntl(_log_fd, F_DUPFD_CLOEXEC, 0);
+static void setDupLogFdOr(int fd, int orfd) {
+ int saved_errno = errno;
+ _log_fd = fcntl(fd, F_DUPFD_CLOEXEC, 0);
if (_log_fd == -1) {
- _log_fd = STDERR_FILENO;
+ _log_fd = fcntl(orfd, F_DUPFD_CLOEXEC, 0);
}
- _log_fd_isatty = isatty(_log_fd);
-}
-
-bool logSet() {
- return _log_set;
+ if (_log_fd == -1) {
+ _log_fd = orfd;
+ }
+ _log_fd_isatty = (isatty(_log_fd) == 1);
+ errno = saved_errno;
}
/*
* Log to stderr by default. Use a dup()d fd, because in the future we'll associate the
* connection socket with fd (0, 1, 2).
*/
+__attribute__((constructor)) static void log_init(void) {
+ setDupLogFdOr(STDERR_FILENO, STDERR_FILENO);
+}
+
+bool logSet() {
+ return _log_set;
+}
void logLevel(enum llevel_t ll) {
_log_level = ll;
}
-void logFile(const std::string& logfile) {
+void logFile(const std::string& log_file, int log_fd) {
_log_set = true;
+ int newlogfd = -1;
+ if (!log_file.empty()) {
+ newlogfd = TEMP_FAILURE_RETRY(
+ open(log_file.c_str(), O_CREAT | O_RDWR | O_APPEND | O_CLOEXEC, 0640));
+ if (newlogfd == -1) {
+ PLOG_W("Couldn't open('%s')", log_file.c_str());
+ }
+ }
/* Close previous log_fd */
if (_log_fd > STDERR_FILENO) {
close(_log_fd);
- _log_fd = STDERR_FILENO;
- }
- if (TEMP_FAILURE_RETRY(_log_fd = open(logfile.c_str(),
- O_CREAT | O_RDWR | O_APPEND | O_CLOEXEC, 0640)) == -1) {
- _log_fd = STDERR_FILENO;
- PLOG_W("Couldn't open logfile open('%s')", logfile.c_str());
}
- _log_fd_isatty = (isatty(_log_fd) == 1);
+ setDupLogFdOr(newlogfd, log_fd);
+ close(newlogfd);
}
void logMsg(enum llevel_t ll, const char* fn, int ln, bool perr, const char* fmt, ...) {
@@ -112,7 +121,9 @@ void logMsg(enum llevel_t ll, const char* fn, int ln, bool perr, const char* fmt
if (_log_fd_isatty) {
msg.append(logLevels[ll].prefix);
}
- msg.append("[").append(logLevels[ll].descr).append("]");
+ if (ll != HELP && ll != HELP_BOLD) {
+ msg.append("[").append(logLevels[ll].descr).append("]");
+ }
if (logLevels[ll].print_time) {
msg.append("[").append(util::timeToStr(time(NULL))).append("]");
}
diff --git a/logs.h b/logs.h
index 27727ef..36e9813 100644
--- a/logs.h
+++ b/logs.h
@@ -59,7 +59,7 @@ void logMsg(enum llevel_t ll, const char* fn, int ln, bool perr, const char* fmt
__attribute__((format(printf, 5, 6)));
void logStop(int sig);
void logLevel(enum llevel_t ll);
-void logFile(const std::string& logfile);
+void logFile(const std::string& log_file, int log_fd);
bool logSet();
} // namespace logs
diff --git a/macros.h b/macros.h
index d29b03b..80e4b61 100644
--- a/macros.h
+++ b/macros.h
@@ -28,8 +28,7 @@
#define TEMP_FAILURE_RETRY(expression) \
(__extension__({ \
long int __result; \
- do \
- __result = (long int)(expression); \
+ do __result = (long int)(expression); \
while (__result == -1L && errno == EINTR); \
__result; \
}))
@@ -59,7 +58,7 @@ static void __attribute__ ((unused)) __clang_cleanup_func(void (^*dfunc) (void))
#define _DEFER(a, count) \
auto void _STRMERGE(__defer_f_, count)(void* _defer_arg __attribute__((unused))); \
int _STRMERGE(__defer_var_, count) __attribute__((cleanup(_STRMERGE(__defer_f_, count)))) \
- __attribute__((unused)); \
+ __attribute__((unused)); \
void _STRMERGE(__defer_f_, count)(void* _defer_arg __attribute__((unused)))
#define defer _DEFER(a, __COUNTER__)
#endif
diff --git a/mnt.cc b/mnt.cc
index 4da78d6..1ccd626 100644
--- a/mnt.cc
+++ b/mnt.cc
@@ -37,7 +37,6 @@
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
-#include <syscall.h>
#include <unistd.h>
#include <memory>
@@ -179,8 +178,8 @@ static bool mountPt(mount_t* mpt, const char* newroot, const char* tmpdir) {
return false;
}
if (!util::writeToFd(fd, mpt->src_content.data(), mpt->src_content.length())) {
- LOG_W("Writting %zu bytes to '%s' failed", mpt->src_content.length(),
- srcpath);
+ LOG_W(
+ "Writing %zu bytes to '%s' failed", mpt->src_content.length(), srcpath);
close(fd);
return false;
}
@@ -283,6 +282,13 @@ static bool mkdirAndTest(const std::string& dir) {
static std::unique_ptr<std::string> getDir(nsjconf_t* nsjconf, const char* name) {
std::unique_ptr<std::string> dir(new std::string);
+ dir->assign("/run/user/").append(std::to_string(nsjconf->orig_uid)).append("/nsjail");
+ if (mkdirAndTest(*dir)) {
+ dir->append("/").append(name);
+ if (mkdirAndTest(*dir)) {
+ return dir;
+ }
+ }
dir->assign("/run/user/")
.append("/nsjail.")
.append(std::to_string(nsjconf->orig_uid))
@@ -331,29 +337,26 @@ static std::unique_ptr<std::string> getDir(nsjconf_t* nsjconf, const char* name)
return nullptr;
}
-static bool initNsInternal(nsjconf_t* nsjconf) {
+static bool initNoCloneNs(nsjconf_t* nsjconf) {
/*
* If CLONE_NEWNS is not used, we would be changing the global mount namespace, so simply
* use --chroot in this case
*/
- if (!nsjconf->clone_newns) {
- if (nsjconf->chroot.empty()) {
- PLOG_E(
- "--chroot was not specified, and it's required when not using "
- "CLONE_NEWNS");
- return false;
- }
- if (chroot(nsjconf->chroot.c_str()) == -1) {
- PLOG_E("chroot('%s')", nsjconf->chroot.c_str());
- return false;
- }
- if (chdir("/") == -1) {
- PLOG_E("chdir('/')");
- return false;
- }
+ if (nsjconf->chroot.empty()) {
return true;
}
+ if (chroot(nsjconf->chroot.c_str()) == -1) {
+ PLOG_E("chroot('%s')", nsjconf->chroot.c_str());
+ return false;
+ }
+ if (chdir("/") == -1) {
+ PLOG_E("chdir('/')");
+ return false;
+ }
+ return true;
+}
+static bool initCloneNs(nsjconf_t* nsjconf) {
if (chdir("/") == -1) {
PLOG_E("chdir('/')");
return false;
@@ -395,25 +398,63 @@ static bool initNsInternal(nsjconf_t* nsjconf) {
PLOG_E("umount2('%s', MNT_DETACH)", tmpdir->c_str());
return false;
}
- /*
- * This requires some explanation: It's actually possible to pivot_root('/', '/'). After
- * this operation has been completed, the old root is mounted over the new root, and it's OK
- * to simply umount('/') now, and to have new_root as '/'. This allows us not care about
- * providing any special directory for old_root, which is sometimes not easy, given that
- * e.g. /tmp might not always be present inside new_root
- */
- if (syscall(__NR_pivot_root, destdir->c_str(), destdir->c_str()) == -1) {
- PLOG_E("pivot_root('%s', '%s')", destdir->c_str(), destdir->c_str());
- return false;
- }
- if (umount2("/", MNT_DETACH) == -1) {
- PLOG_E("umount2('/', MNT_DETACH)");
- return false;
- }
- if (chdir(nsjconf->cwd.c_str()) == -1) {
- PLOG_E("chdir('%s')", nsjconf->cwd.c_str());
- return false;
+ if (!nsjconf->no_pivotroot) {
+ /*
+ * This requires some explanation: It's actually possible to pivot_root('/', '/').
+ * After this operation has been completed, the old root is mounted over the new
+ * root, and it's OK to simply umount('/') now, and to have new_root as '/'. This
+ * allows us not to care about providing any special directory for old_root, which is
+ * sometimes not easy, given that e.g. /tmp might not always be present inside
+ * new_root
+ */
+ if (util::syscall(__NR_pivot_root, (uintptr_t)destdir->c_str(),
+ (uintptr_t)destdir->c_str()) == -1) {
+ PLOG_E("pivot_root('%s', '%s')", destdir->c_str(), destdir->c_str());
+ return false;
+ }
+
+ if (umount2("/", MNT_DETACH) == -1) {
+ PLOG_E("umount2('/', MNT_DETACH)");
+ return false;
+ }
+ } else {
+ /*
+ * pivot_root would normally un-mount the old root, however in certain cases this
+ * operation is forbidden. There are systems (mainly embedded) that keep their root
+ * file system in RAM, when initially loaded by the kernel (e.g. initramfs),
+ * and there is no other file system that is mounted on top of it. In such systems,
+ * there is no option to pivot_root!
+ * For more information, see
+ * kernel.org/doc/Documentation/filesystems/ramfs-rootfs-initramfs.txt.
+ * switch_root alternative: Instead of un-mounting the old rootfs, it is over-mounted
+ * by moving the new root to it.
+ */
+
+ /* NOTE: Using mount move and chroot allows escaping back into the old root when
+ * proper capabilities are kept in the user namespace. It can be achieved by
+ * unmounting the new root and using setns to re-enter the mount namespace.
+ */
+ LOG_W(
+ "Using no_pivotroot is escapable when user posseses relevant capabilities, "
+ "Use it with care!");
+
+ if (chdir(destdir->c_str()) == -1) {
+ PLOG_E("chdir('%s')", destdir->c_str());
+ return false;
+ }
+
+ /* mount moving the new root on top of '/'. This operation is atomic and doesn't
+ involve un-mounting '/' at any stage */
+ if (mount(".", "/", NULL, MS_MOVE, NULL) == -1) {
+ PLOG_E("mount('/', %s, NULL, MS_MOVE, NULL)", destdir->c_str());
+ return false;
+ }
+
+ if (chroot(".") == -1) {
+ PLOG_E("chroot('%s')", destdir->c_str());
+ return false;
+ }
}
for (const auto& p : nsjconf->mountpts) {
@@ -425,6 +466,24 @@ static bool initNsInternal(nsjconf_t* nsjconf) {
return true;
}
+static bool initNsInternal(nsjconf_t* nsjconf) {
+ if (nsjconf->clone_newns) {
+ if (!initCloneNs(nsjconf)) {
+ return false;
+ }
+ } else {
+ if (!initNoCloneNs(nsjconf)) {
+ return false;
+ }
+ }
+
+ if (chdir(nsjconf->cwd.c_str()) == -1) {
+ PLOG_E("chdir('%s')", nsjconf->cwd.c_str());
+ return false;
+ }
+ return true;
+}
+
/*
* With mode MODE_STANDALONE_EXECVE it's required to mount /proc inside a new process,
* as the current process is still in the original PID namespace (man pid_namespaces)
@@ -434,7 +493,7 @@ bool initNs(nsjconf_t* nsjconf) {
return initNsInternal(nsjconf);
}
- pid_t pid = subproc::cloneProc(CLONE_FS | SIGCHLD);
+ pid_t pid = subproc::cloneProc(CLONE_FS, SIGCHLD);
if (pid == -1) {
return false;
}
@@ -459,7 +518,7 @@ static bool addMountPt(mount_t* mnt, const std::string& src, const std::string&
if (!src_env.empty()) {
const char* e = getenv(src_env.c_str());
if (e == NULL) {
- LOG_W("No such envvar:'%s'", src_env.c_str());
+ LOG_W("No such envar:'%s'", src_env.c_str());
return false;
}
mnt->src = e;
@@ -469,7 +528,7 @@ static bool addMountPt(mount_t* mnt, const std::string& src, const std::string&
if (!dst_env.empty()) {
const char* e = getenv(dst_env.c_str());
if (e == NULL) {
- LOG_W("No such envvar:'%s'", dst_env.c_str());
+ LOG_W("No such envar:'%s'", dst_env.c_str());
return false;
}
mnt->dst = e;
@@ -553,9 +612,9 @@ const std::string describeMountPt(const mount_t& mpt) {
.append("'");
if (mpt.is_dir) {
- descr.append(" is_dir:true");
+ descr.append(" dir:true");
} else {
- descr.append(" is_dir:false");
+ descr.append(" dir:false");
}
if (!mpt.is_mandatory) {
descr.append(" mandatory:false");
diff --git a/net.cc b/net.cc
index fb78e9b..87c3df9 100644
--- a/net.cc
+++ b/net.cc
@@ -23,6 +23,7 @@
#include <arpa/inet.h>
#include <errno.h>
+#include <fcntl.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
@@ -50,7 +51,6 @@ namespace net {
#define IFACE_NAME "vs"
-#if defined(NSJAIL_NL3_WITH_MACVLAN)
#include <netlink/route/link.h>
#include <netlink/route/link/macvlan.h>
@@ -85,6 +85,12 @@ static bool cloneIface(
nl_addr_put(nladdr);
}
+ if ((err = rtnl_link_macvlan_set_mode(
+ rmv, rtnl_link_macvlan_str2mode(nsjconf->iface_vs_mo.c_str()))) < 0) {
+ LOG_E("rtnl_link_macvlan_set_mode(mode:'%s') failed: %s",
+ nsjconf->iface_vs_mo.c_str(), nl_geterror(err));
+ }
+
if ((err = rtnl_link_add(sk, rmv, NLM_F_CREATE)) < 0) {
LOG_E("rtnl_link_add(name:'%s' link:'%s'): %s", IFACE_NAME,
nsjconf->iface_vs.c_str(), nl_geterror(err));
@@ -116,7 +122,7 @@ static bool moveToNs(
int err = rtnl_link_change(sk, orig_link, new_link, RTM_SETLINK);
if (err < 0) {
- LOG_E("rtnl_link_change(): set NS of interface '%s' to PID=%d: %s", iface.c_str(),
+ LOG_E("rtnl_link_change(): set NS of interface '%s' to pid=%d: %s", iface.c_str(),
(int)pid, nl_geterror(err));
rtnl_link_put(new_link);
rtnl_link_put(orig_link);
@@ -169,52 +175,6 @@ bool initNsFromParent(nsjconf_t* nsjconf, int pid) {
nl_socket_free(sk);
return true;
}
-#else // defined(NSJAIL_NL3_WITH_MACVLAN)
-
-static bool moveToNs(const std::string& iface, pid_t pid) {
- const std::vector<std::string> argv{
- "/sbin/ip", "link", "set", iface, "netns", std::to_string(pid)};
- if (subproc::systemExe(argv, environ) != 0) {
- LOG_E("Couldn't put interface '%s' into NET ns of the PID=%d", iface.c_str(),
- (int)pid);
- return false;
- }
- return true;
-}
-
-bool initNsFromParent(nsjconf_t* nsjconf, int pid) {
- if (!nsjconf->clone_newnet) {
- return true;
- }
- for (const auto& iface : nsjconf->ifaces) {
- if (!moveToNs(iface, pid)) {
- return false;
- }
- }
- if (nsjconf->iface_vs.empty()) {
- return true;
- }
-
- LOG_D("Putting iface:'%s' into namespace of PID:%d (with /sbin/ip)",
- nsjconf->iface_vs.c_str(), pid);
-
- std::vector<std::string> argv;
-
- if (nsjconf->iface_vs_ma != "") {
- argv = {"/sbin/ip", "link", "add", "link", nsjconf->iface_vs, "name", IFACE_NAME,
- "netns", std::to_string(pid), "address", nsjconf->iface_vs_ma, "type",
- "macvlan", "mode", "bridge"};
- } else {
- argv = {"/sbin/ip", "link", "add", "link", nsjconf->iface_vs, "name", IFACE_NAME,
- "netns", std::to_string(pid), "type", "macvlan", "mode", "bridge"};
- }
- if (subproc::systemExe(argv, environ) != 0) {
- LOG_E("Couldn't create MACVTAP interface for '%s'", nsjconf->iface_vs.c_str());
- return false;
- }
- return true;
-}
-#endif // defined(NSJAIL_NL3_WITH_MACVLAN)
static bool isSocket(int fd) {
int optval;
@@ -228,6 +188,12 @@ static bool isSocket(int fd) {
bool limitConns(nsjconf_t* nsjconf, int connsock) {
/* 0 means 'unlimited' */
+ if (nsjconf->max_conns != 0 && nsjconf->pids.size() >= nsjconf->max_conns) {
+ LOG_W("Rejecting connection, max_conns limit reached: %u", nsjconf->max_conns);
+ return false;
+ }
+
+ /* 0 means 'unlimited' */
if (nsjconf->max_conns_per_ip == 0) {
return true;
}
@@ -237,8 +203,8 @@ bool limitConns(nsjconf_t* nsjconf, int connsock) {
unsigned cnt = 0;
for (const auto& pid : nsjconf->pids) {
- if (memcmp(addr.sin6_addr.s6_addr, pid.remote_addr.sin6_addr.s6_addr,
- sizeof(pid.remote_addr.sin6_addr.s6_addr)) == 0) {
+ if (memcmp(addr.sin6_addr.s6_addr, pid.second.remote_addr.sin6_addr.s6_addr,
+ sizeof(pid.second.remote_addr.sin6_addr.s6_addr)) == 0) {
cnt++;
}
}
@@ -252,7 +218,7 @@ bool limitConns(nsjconf_t* nsjconf, int connsock) {
}
int getRecvSocket(const char* bindhost, int port) {
- if (port < 1 || port > 65535) {
+ if (port < 0 || port > 65535) {
LOG_F(
"TCP port %d out of bounds (0 <= port <= 65535), specify one with --port "
"<port>",
@@ -279,6 +245,10 @@ int getRecvSocket(const char* bindhost, int port) {
PLOG_E("socket(AF_INET6)");
return -1;
}
+ if (fcntl(sockfd, F_SETFL, O_NONBLOCK)) {
+ PLOG_E("fcntl(%d, F_SETFL, O_NONBLOCK)", sockfd);
+ return -1;
+ }
int so = 1;
if (setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &so, sizeof(so)) == -1) {
PLOG_E("setsockopt(%d, SO_REUSEADDR)", sockfd);
@@ -311,7 +281,7 @@ int getRecvSocket(const char* bindhost, int port) {
int acceptConn(int listenfd) {
struct sockaddr_in6 cli_addr;
socklen_t socklen = sizeof(cli_addr);
- int connfd = accept(listenfd, (struct sockaddr*)&cli_addr, &socklen);
+ int connfd = accept4(listenfd, (struct sockaddr*)&cli_addr, &socklen, SOCK_NONBLOCK);
if (connfd == -1) {
if (errno != EINTR) {
PLOG_E("accept(%d)", listenfd);
diff --git a/nsjail.1 b/nsjail.1
index f8ffca5..439f8e1 100644
--- a/nsjail.1
+++ b/nsjail.1
@@ -44,10 +44,10 @@ Directory containing / of the jail (default: none)
Mount chroot dir (/) R/W (default: R/O)
.TP
\fB\-\-user\fR|\fB\-u\fR VALUE
-Username/uid of processess inside the jail (default: your current uid). You can also use inside_ns_uid:outside_ns_uid:count convention here. Can be specified multiple times
+Username/uid of processes inside the jail (default: your current uid). You can also use inside_ns_uid:outside_ns_uid:count convention here. Can be specified multiple times
.TP
\fB\-\-group\fR|\fB\-g\fR VALUE
-Groupname/gid of processess inside the jail (default: your current gid). You can also use inside_ns_gid:global_ns_gid:count convention here. Can be specified multiple times
+Groupname/gid of processes inside the jail (default: your current gid). You can also use inside_ns_gid:global_ns_gid:count convention here. Can be specified multiple times
.TP
\fB\-\-hostname\fR|\fB\-H\fR VALUE
UTS name (hostname) of the jail (default: 'NSJAIL')
@@ -61,6 +61,9 @@ TCP port to bind to (enables MODE_LISTEN_TCP) (default: 0)
\fB\-\-bindhost\fR VALUE
IP address to bind the port to (only in [MODE_LISTEN_TCP]), (default: '::')
.TP
+\fB\-\-max_conns\fR VALUE
+Maximum number of connections across all IPs (only in [MODE_LISTEN_TCP]), (default: 0 (unlimited))
+.TP
\fB\-\-max_conns_per_ip\fR|\fB\-i\fR VALUE
Maximum number of connections per one IP (only in [MODE_LISTEN_TCP]), (default: 0 (unlimited))
.TP
@@ -89,10 +92,10 @@ Log warning and more important messages only
Log fatal messages only
.TP
\fB\-\-keep_env\fR|\fB\-e\fR
+Pass all environment variables to the child process (default: all envars are cleared)
+Pass all environment variables be passed process (default: all envars are cleared)
.TP
\fB\-\-env\fR|\fB\-E\fR VALUE
-Additional environment variable (can be used multiple times). If the envvar doesn't contain '=' (e.g. just the 'DISPLAY' string), the current envvar value will be used
+Additional environment variable (can be used multiple times). If the envar doesn't contain '=' (e.g. just the 'DISPLAY' string), the current envar value will be used
.TP
\fB\-\-keep_caps\fR
Don't drop any capabilities
@@ -136,6 +139,9 @@ RLIMIT_NPROC, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for th
\fB\-\-rlimit_stack\fR VALUE
RLIMIT_STACK in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM_INFINITY (default: 'soft')
.TP
+\fB\-\-disable_rlimits\fR
+Disable all rlimits, default to limits set by parent
+.TP
\fB\-\-persona_addr_compat_layout\fR
personality(ADDR_COMPAT_LAYOUT)
.TP
@@ -247,6 +253,12 @@ Location of cpu cgroup FS (default: '/sys/fs/cgroup/net_cls')
\fB\-\-cgroup_cpu_parent\fR VALUE
Which pre-existing cpu cgroup to use as a parent (default: 'NSJAIL')
.TP
+\fB\-\-cgroupv2_mount\fR VALUE
+Location of cgroup v2 directory (default: '/sys/fs/cgroup')
+.TP
+\fB\-\-use_cgroupv2\fR
+Use cgroup v2
+.TP
\fB\-\-iface_no_lo\fR
Don't bring the 'lo' interface up
.TP
diff --git a/nsjail.cc b/nsjail.cc
index 0b57033..be8b604 100644
--- a/nsjail.cc
+++ b/nsjail.cc
@@ -21,6 +21,8 @@
#include "nsjail.h"
+#include <fcntl.h>
+#include <poll.h>
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
@@ -31,7 +33,10 @@
#include <termios.h>
#include <unistd.h>
+#include <algorithm>
+#include <cerrno>
#include <memory>
+#include <vector>
#include "cmdline.h"
#include "logs.h"
@@ -47,10 +52,7 @@ static __thread int sigFatal = 0;
static __thread bool showProc = false;
static void sigHandler(int sig) {
- if (sig == SIGALRM) {
- return;
- }
- if (sig == SIGCHLD) {
+ if (sig == SIGALRM || sig == SIGCHLD || sig == SIGPIPE) {
return;
}
if (sig == SIGUSR1 || sig == SIGQUIT) {
@@ -74,7 +76,7 @@ static bool setSigHandler(int sig) {
if (sig == SIGTTIN || sig == SIGTTOU) {
sa.sa_handler = SIG_IGN;
- };
+ }
if (sigaction(sig, &sa, NULL) == -1) {
PLOG_E("sigaction(%d)", sig);
return false;
@@ -115,6 +117,104 @@ static bool setTimer(nsjconf_t* nsjconf) {
return true;
}
+static bool pipeTraffic(nsjconf_t* nsjconf, int listenfd) {
+ std::vector<struct pollfd> fds;
+ fds.reserve(nsjconf->pipes.size() * 3 + 1);
+ for (const auto& p : nsjconf->pipes) {
+ fds.push_back({
+ .fd = p.sock_fd,
+ .events = POLLIN | POLLOUT,
+ .revents = 0,
+ });
+ fds.push_back({
+ .fd = p.pipe_in,
+ .events = POLLOUT,
+ .revents = 0,
+ });
+ fds.push_back({
+ .fd = p.pipe_out,
+ .events = POLLIN,
+ .revents = 0,
+ });
+ }
+ fds.push_back({
+ .fd = listenfd,
+ .events = POLLIN,
+ .revents = 0,
+ });
+ LOG_D("Waiting for fd activity");
+ while (poll(fds.data(), fds.size(), -1) > 0) {
+ if (sigFatal > 0 || showProc) {
+ return false;
+ }
+ if (fds.back().revents != 0) {
+ LOG_D("New connection ready");
+ return true;
+ }
+ bool cleanup = false;
+ for (size_t i = 0; i < fds.size() - 1; ++i) {
+ if (fds[i].revents & POLLIN) {
+ fds[i].events &= ~POLLIN;
+ }
+ if (fds[i].revents & POLLOUT) {
+ fds[i].events &= ~POLLOUT;
+ }
+ }
+ for (size_t i = 0; i < fds.size() - 3; i += 3) {
+ const size_t pipe_no = i / 3;
+ int in, out;
+ const char* direction;
+ bool closed = false;
+ std::tuple<int, int, const char*> direction_map[] = {
+ std::make_tuple(i, i + 1, "in"),
+ std::make_tuple(i + 2, i, "out"),
+ };
+ for (const auto& entry : direction_map) {
+ std::tie(in, out, direction) = entry;
+ bool in_ready = (fds[in].events & POLLIN) == 0 ||
+ (fds[in].revents & POLLIN) == POLLIN;
+ bool out_ready = (fds[out].events & POLLOUT) == 0 ||
+ (fds[out].revents & POLLOUT) == POLLOUT;
+ if (in_ready && out_ready) {
+ LOG_D("#%zu piping data %s", pipe_no, direction);
+ ssize_t rv = splice(fds[in].fd, nullptr, fds[out].fd,
+ nullptr, 4096, SPLICE_F_NONBLOCK);
+ if (rv == -1 && errno != EAGAIN) {
+ PLOG_E("splice fd pair #%zu {%d, %d}\n", pipe_no,
+ fds[in].fd, fds[out].fd);
+ }
+ if (rv == 0) {
+ closed = true;
+ }
+ fds[in].events |= POLLIN;
+ fds[out].events |= POLLOUT;
+ }
+ if ((fds[in].revents & (POLLERR | POLLHUP)) != 0 ||
+ (fds[out].revents & (POLLERR | POLLHUP)) != 0) {
+ closed = true;
+ }
+ }
+ if (closed) {
+ LOG_D("#%zu connection closed", pipe_no);
+ cleanup = true;
+ close(nsjconf->pipes[pipe_no].sock_fd);
+ close(nsjconf->pipes[pipe_no].pipe_in);
+ close(nsjconf->pipes[pipe_no].pipe_out);
+ if (nsjconf->pipes[pipe_no].pid > 0) {
+ kill(nsjconf->pipes[pipe_no].pid, SIGKILL);
+ }
+ nsjconf->pipes[pipe_no] = {};
+ }
+ }
+ if (cleanup) {
+ break;
+ }
+ }
+ nsjconf->pipes.erase(std::remove(nsjconf->pipes.begin(), nsjconf->pipes.end(), pipemap_t{}),
+ nsjconf->pipes.end());
+ return false;
+}
+
static int listenMode(nsjconf_t* nsjconf) {
int listenfd = net::getRecvSocket(nsjconf->bindhost.c_str(), nsjconf->port);
if (listenfd == -1) {
@@ -131,10 +231,35 @@ static int listenMode(nsjconf_t* nsjconf) {
showProc = false;
subproc::displayProc(nsjconf);
}
- int connfd = net::acceptConn(listenfd);
- if (connfd >= 0) {
- subproc::runChild(nsjconf, connfd, connfd, connfd);
- close(connfd);
+ if (pipeTraffic(nsjconf, listenfd)) {
+ int connfd = net::acceptConn(listenfd);
+ if (connfd >= 0) {
+ int in[2];
+ int out[2];
+ if (pipe(in) != 0 || pipe(out) != 0) {
+ PLOG_E("pipe");
+ continue;
+ }
+
+ pid_t pid =
+ subproc::runChild(nsjconf, connfd, in[0], out[1], out[1]);
+
+ close(in[0]);
+ close(out[1]);
+
+ if (pid <= 0) {
+ close(in[1]);
+ close(out[0]);
+ close(connfd);
+ } else {
+ nsjconf->pipes.push_back({
+ .sock_fd = connfd,
+ .pipe_in = in[1],
+ .pipe_out = out[0],
+ .pid = pid,
+ });
+ }
+ }
}
subproc::reapProc(nsjconf);
}
@@ -142,7 +267,8 @@ static int listenMode(nsjconf_t* nsjconf) {
static int standaloneMode(nsjconf_t* nsjconf) {
for (;;) {
- if (!subproc::runChild(nsjconf, STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO)) {
+ if (subproc::runChild(nsjconf, /* netfd= */ -1, STDIN_FILENO, STDOUT_FILENO,
+ STDERR_FILENO) == -1) {
LOG_E("Couldn't launch the child process");
return 0xff;
}
@@ -188,7 +314,10 @@ void setTC(int fd, const struct termios* trm) {
PLOG_W("ioctl(fd=%d, TCSETS) failed", fd);
return;
}
- LOG_D("Restored the previous state of the TTY");
+ if (tcflush(fd, TCIFLUSH) == -1) {
+ PLOG_W("tcflush(fd=%d, TCIFLUSH) failed", fd);
+ return;
+ }
}
} // namespace nsjail
@@ -200,10 +329,7 @@ int main(int argc, char* argv[]) {
if (!nsjconf) {
LOG_F("Couldn't parse cmdline options");
}
- if (!nsjconf->clone_newuser && geteuid() != 0) {
- LOG_W("--disable_clone_newuser might require root() privs");
- }
- if (nsjconf->daemonize && (daemon(0, 0) == -1)) {
+ if (nsjconf->daemonize && (daemon(/* nochdir= */ 1, /* noclose= */ 0) == -1)) {
PLOG_F("daemon");
}
cmdline::logParams(nsjconf.get());
@@ -226,7 +352,9 @@ int main(int argc, char* argv[]) {
sandbox::closePolicy(nsjconf.get());
/* Try to restore the underlying console's params in case some program has changed it */
- nsjail::setTC(STDIN_FILENO, trm.get());
+ if (!nsjconf->daemonize) {
+ nsjail::setTC(STDIN_FILENO, trm.get());
+ }
LOG_D("Returning with %d", ret);
return ret;
diff --git a/nsjail.h b/nsjail.h
index f91b8fd..fc8bf70 100644
--- a/nsjail.h
+++ b/nsjail.h
@@ -32,6 +32,7 @@
#include <time.h>
#include <unistd.h>
+#include <map>
#include <string>
#include <vector>
@@ -44,10 +45,10 @@ static const int nssigs[] = {
SIGTERM,
SIGTTIN,
SIGTTOU,
+ SIGPIPE,
};
struct pids_t {
- pid_t pid;
time_t start;
std::string remote_txt;
struct sockaddr_in6 remote_addr;
@@ -81,6 +82,16 @@ enum ns_mode_t {
MODE_STANDALONE_RERUN
};
+struct pipemap_t { /* One proxied listen-mode connection: the client socket, the supervisor-side pipe ends and the child serving it. */
+ int sock_fd; /* accepted connection fd (listenMode() stores connfd here) */
+ int pipe_in; /* supervisor-side end of the child's input pipe (listenMode() stores in[1]; in[0] is handed to the child as fd_in) */
+ int pipe_out; /* supervisor-side end of the child's output pipe (listenMode() stores out[0]; out[1] is handed to the child as fd_out/fd_err) */
+ pid_t pid; /* value returned by subproc::runChild(); pipeTraffic() sends it SIGKILL on close when it is > 0 */
+ bool operator==(const pipemap_t& o) { /* Used by pipeTraffic() to std::remove() entries it reset to pipemap_t{}. NOTE(review): compares only the three fds (pid is ignored) and is not const-qualified. */
+ return sock_fd == o.sock_fd && pipe_in == o.pipe_in && pipe_out == o.pipe_out;
+ }
+};
+
struct nsjconf_t {
std::string exec_file;
bool use_execveat;
@@ -104,19 +115,26 @@ struct nsjconf_t {
uint64_t rl_nofile;
uint64_t rl_nproc;
uint64_t rl_stack;
+ uint64_t rl_mlock;
+ uint64_t rl_rtpr;
+ uint64_t rl_msgq;
+ bool disable_rl;
unsigned long personality;
bool clone_newnet;
bool clone_newuser;
bool clone_newns;
+ bool no_pivotroot;
bool clone_newpid;
bool clone_newipc;
bool clone_newuts;
bool clone_newcgroup;
+ bool clone_newtime;
enum ns_mode_t mode;
bool is_root_rw;
bool is_silent;
bool stderr_to_null;
bool skip_setsid;
+ unsigned int max_conns;
unsigned int max_conns_per_ip;
std::string proc_path;
bool is_proc_rw;
@@ -126,6 +144,7 @@ struct nsjconf_t {
std::string iface_vs_nm;
std::string iface_vs_gw;
std::string iface_vs_ma;
+ std::string iface_vs_mo;
std::string cgroup_mem_mount;
std::string cgroup_mem_parent;
size_t cgroup_mem_max;
@@ -138,20 +157,25 @@ struct nsjconf_t {
std::string cgroup_cpu_mount;
std::string cgroup_cpu_parent;
unsigned int cgroup_cpu_ms_per_sec;
+ std::string cgroupv2_mount;
+ bool use_cgroupv2;
std::string kafel_file_path;
std::string kafel_string;
struct sock_fprog seccomp_fprog;
bool seccomp_log;
+ int nice_level;
long num_cpus;
uid_t orig_uid;
+ uid_t orig_euid;
std::vector<mount_t> mountpts;
- std::vector<pids_t> pids;
+ std::map<pid_t, pids_t> pids;
std::vector<idmap_t> uids;
std::vector<idmap_t> gids;
std::vector<std::string> envs;
std::vector<int> openfds;
std::vector<int> caps;
std::vector<std::string> ifaces;
+ std::vector<pipemap_t> pipes;
};
#endif /* _NSJAIL_H */
diff --git a/pid.cc b/pid.cc
index 593018b..8165c03 100644
--- a/pid.cc
+++ b/pid.cc
@@ -48,7 +48,7 @@ bool initNs(nsjconf_t* nsjconf) {
* first clone/fork will work, and the rest will fail with ENOMEM (see 'man pid_namespaces'
* for details on this behavior)
*/
- pid_t pid = subproc::cloneProc(CLONE_FS);
+ pid_t pid = subproc::cloneProc(CLONE_FS, 0);
if (pid == -1) {
PLOG_E("Couldn't create a dummy init process");
return false;
diff --git a/sandbox.cc b/sandbox.cc
index d987bfb..1f1ac6c 100644
--- a/sandbox.cc
+++ b/sandbox.cc
@@ -33,6 +33,7 @@ extern "C" {
#include "kafel.h"
}
#include "logs.h"
+#include "util.h"
namespace sandbox {
@@ -65,9 +66,9 @@ static bool prepareAndCommit(nsjconf_t* nsjconf) {
"too old?)");
return false;
#else
- if (syscall(__NR_seccomp, (uintptr_t)SECCOMP_SET_MODE_FILTER,
+ if (util::syscall(__NR_seccomp, (uintptr_t)SECCOMP_SET_MODE_FILTER,
(uintptr_t)(SECCOMP_FILTER_FLAG_TSYNC | SECCOMP_FILTER_FLAG_LOG),
- &nsjconf->seccomp_fprog) == -1) {
+ (uintptr_t)&nsjconf->seccomp_fprog) == -1) {
PLOG_E(
"seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC | "
"SECCOMP_FILTER_FLAG_LOG) failed");
diff --git a/subproc.cc b/subproc.cc
index dc05383..bd2bdfe 100644
--- a/subproc.cc
+++ b/subproc.cc
@@ -45,6 +45,7 @@
#include <vector>
#include "cgroup.h"
+#include "cgroup2.h"
#include "contain.h"
#include "logs.h"
#include "macros.h"
@@ -58,6 +59,9 @@ namespace subproc {
#if !defined(CLONE_NEWCGROUP)
#define CLONE_NEWCGROUP 0x02000000
#endif /* !defined(CLONE_NEWCGROUP) */
+#if !defined(CLONE_NEWTIME)
+#define CLONE_NEWTIME 0x00000080
+#endif /* !defined(CLONE_NEWTIME) */
static const std::string cloneFlagsToStr(uintptr_t flags) {
std::string res;
@@ -66,43 +70,50 @@ static const std::string cloneFlagsToStr(uintptr_t flags) {
const uintptr_t flag;
const char* const name;
} static const cloneFlags[] = {
- NS_VALSTR_STRUCT(CLONE_VM),
- NS_VALSTR_STRUCT(CLONE_FS),
- NS_VALSTR_STRUCT(CLONE_FILES),
- NS_VALSTR_STRUCT(CLONE_SIGHAND),
- NS_VALSTR_STRUCT(CLONE_PTRACE),
- NS_VALSTR_STRUCT(CLONE_VFORK),
- NS_VALSTR_STRUCT(CLONE_PARENT),
- NS_VALSTR_STRUCT(CLONE_THREAD),
- NS_VALSTR_STRUCT(CLONE_NEWNS),
- NS_VALSTR_STRUCT(CLONE_SYSVSEM),
- NS_VALSTR_STRUCT(CLONE_SETTLS),
- NS_VALSTR_STRUCT(CLONE_PARENT_SETTID),
- NS_VALSTR_STRUCT(CLONE_CHILD_CLEARTID),
- NS_VALSTR_STRUCT(CLONE_DETACHED),
- NS_VALSTR_STRUCT(CLONE_UNTRACED),
- NS_VALSTR_STRUCT(CLONE_CHILD_SETTID),
- NS_VALSTR_STRUCT(CLONE_NEWCGROUP),
- NS_VALSTR_STRUCT(CLONE_NEWUTS),
- NS_VALSTR_STRUCT(CLONE_NEWIPC),
- NS_VALSTR_STRUCT(CLONE_NEWUSER),
- NS_VALSTR_STRUCT(CLONE_NEWPID),
- NS_VALSTR_STRUCT(CLONE_NEWNET),
- NS_VALSTR_STRUCT(CLONE_IO),
+ NS_VALSTR_STRUCT(CLONE_NEWTIME),
+ NS_VALSTR_STRUCT(CLONE_VM),
+ NS_VALSTR_STRUCT(CLONE_FS),
+ NS_VALSTR_STRUCT(CLONE_FILES),
+ NS_VALSTR_STRUCT(CLONE_SIGHAND),
+#if !defined(CLONE_PIDFD)
+#define CLONE_PIDFD 0x00001000
+#endif
+ NS_VALSTR_STRUCT(CLONE_PIDFD),
+ NS_VALSTR_STRUCT(CLONE_PTRACE),
+ NS_VALSTR_STRUCT(CLONE_VFORK),
+ NS_VALSTR_STRUCT(CLONE_PARENT),
+ NS_VALSTR_STRUCT(CLONE_THREAD),
+ NS_VALSTR_STRUCT(CLONE_NEWNS),
+ NS_VALSTR_STRUCT(CLONE_SYSVSEM),
+ NS_VALSTR_STRUCT(CLONE_SETTLS),
+ NS_VALSTR_STRUCT(CLONE_PARENT_SETTID),
+ NS_VALSTR_STRUCT(CLONE_CHILD_CLEARTID),
+ NS_VALSTR_STRUCT(CLONE_DETACHED),
+ NS_VALSTR_STRUCT(CLONE_UNTRACED),
+ NS_VALSTR_STRUCT(CLONE_CHILD_SETTID),
+ NS_VALSTR_STRUCT(CLONE_NEWCGROUP),
+ NS_VALSTR_STRUCT(CLONE_NEWUTS),
+ NS_VALSTR_STRUCT(CLONE_NEWIPC),
+ NS_VALSTR_STRUCT(CLONE_NEWUSER),
+ NS_VALSTR_STRUCT(CLONE_NEWPID),
+ NS_VALSTR_STRUCT(CLONE_NEWNET),
+ NS_VALSTR_STRUCT(CLONE_IO),
};
- uintptr_t knownFlagMask = CSIGNAL;
+ uintptr_t knownFlagMask = 0;
for (const auto& i : cloneFlags) {
if (flags & i.flag) {
- res.append(i.name).append("|");
+ if (!res.empty()) {
+ res.append("|");
+ }
+ res.append(i.name);
}
knownFlagMask |= i.flag;
}
if (flags & ~(knownFlagMask)) {
- util::StrAppend(&res, "%#tx|", flags & ~(knownFlagMask));
+ util::StrAppend(&res, "|%#tx", flags & ~(knownFlagMask));
}
- res.append(util::sigName(flags & CSIGNAL).c_str());
return res;
}
@@ -128,7 +139,8 @@ static bool resetEnv(void) {
static const char kSubprocDoneChar = 'D';
static const char kSubprocErrorChar = 'E';
-static void subprocNewProc(nsjconf_t* nsjconf, int fd_in, int fd_out, int fd_err, int pipefd) {
+static void subprocNewProc(
+ nsjconf_t* nsjconf, int netfd, int fd_in, int fd_out, int fd_err, int pipefd) {
if (!contain::setupFD(nsjconf, fd_in, fd_out, fd_err)) {
return;
}
@@ -141,7 +153,12 @@ static void subprocNewProc(nsjconf_t* nsjconf, int fd_in, int fd_out, int fd_err
LOG_E("Couldn't initialize net user namespace");
return;
}
- if (!cgroup::initNsFromParent(nsjconf, getpid())) {
+ if (nsjconf->use_cgroupv2) {
+ if (!cgroup2::initNsFromParent(nsjconf, getpid())) {
+ LOG_E("Couldn't initialize net user namespace");
+ return;
+ }
+ } else if (!cgroup::initNsFromParent(nsjconf, getpid())) {
LOG_E("Couldn't initialize net user namespace");
return;
}
@@ -164,7 +181,7 @@ static void subprocNewProc(nsjconf_t* nsjconf, int fd_in, int fd_out, int fd_err
putenv(const_cast<char*>(env.c_str()));
}
- auto connstr = net::connToText(fd_in, /* remote= */ true, NULL);
+ auto connstr = net::connToText(netfd, /* remote= */ true, NULL);
LOG_I("Executing '%s' for '%s'", nsjconf->exec_file.c_str(), connstr.c_str());
std::vector<const char*> argv;
@@ -181,8 +198,8 @@ static void subprocNewProc(nsjconf_t* nsjconf, int fd_in, int fd_out, int fd_err
if (nsjconf->use_execveat) {
#if defined(__NR_execveat)
- syscall(__NR_execveat, (uintptr_t)nsjconf->exec_fd, "", (char* const*)argv.data(),
- environ, (uintptr_t)AT_EMPTY_PATH);
+ util::syscall(__NR_execveat, nsjconf->exec_fd, (uintptr_t) "",
+ (uintptr_t)argv.data(), (uintptr_t)environ, AT_EMPTY_PATH);
#else /* defined(__NR_execveat) */
LOG_E("Your system doesn't support execveat() syscall");
return;
@@ -197,7 +214,6 @@ static void subprocNewProc(nsjconf_t* nsjconf, int fd_in, int fd_out, int fd_err
static void addProc(nsjconf_t* nsjconf, pid_t pid, int sock) {
pids_t p;
- p.pid = pid;
p.start = time(NULL);
p.remote_txt = net::connToText(sock, /* remote= */ true, &p.remote_addr);
@@ -205,24 +221,27 @@ static void addProc(nsjconf_t* nsjconf, pid_t pid, int sock) {
snprintf(fname, sizeof(fname), "/proc/%d/syscall", (int)pid);
p.pid_syscall_fd = TEMP_FAILURE_RETRY(open(fname, O_RDONLY | O_CLOEXEC));
- nsjconf->pids.push_back(p);
+ if (nsjconf->pids.find(pid) != nsjconf->pids.end()) {
+ LOG_F("pid=%d already exists", pid);
+ }
+ nsjconf->pids.insert(std::make_pair(pid, p));
- LOG_D("Added pid '%d' with start time '%u' to the queue for IP: '%s'", p.pid,
+ LOG_D("Added pid=%d with start time '%u' to the queue for IP: '%s'", pid,
(unsigned int)p.start, p.remote_txt.c_str());
}
static void removeProc(nsjconf_t* nsjconf, pid_t pid) { /* Drops the bookkeeping entry for pid from nsjconf->pids and closes the fd stored in it. */
- for (auto p = nsjconf->pids.begin(); p != nsjconf->pids.end(); ++p) {
- if (p->pid == pid) {
- LOG_D("Removing pid '%d' from the queue (IP:'%s', start time:'%s')", p->pid,
- p->remote_txt.c_str(), util::timeToStr(p->start).c_str());
- close(p->pid_syscall_fd);
- nsjconf->pids.erase(p);
-
- return;
- }
+ if (nsjconf->pids.find(pid) == nsjconf->pids.end()) { /* unknown pid: warn and leave the map untouched */
+ LOG_W("pid=%d doesn't exist ?", pid);
+ return;
}
- LOG_W("PID: %d not found (?)", pid);
+
+ const auto& p = nsjconf->pids[pid]; /* operator[] cannot default-insert here: the find() above established that the key exists */
+ LOG_D("Removed pid=%d from the queue (IP:'%s', start time:'%s')", pid, p.remote_txt.c_str(),
+ util::timeToStr(p.start).c_str());
+
+ close(p.pid_syscall_fd); /* the fd addProc() opened on /proc/<pid>/syscall */
+ nsjconf->pids.erase(pid); /* must stay last: p refers to the erased element and is invalid afterwards */
}
int countProc(nsjconf_t* nsjconf) {
@@ -233,37 +252,35 @@ void displayProc(nsjconf_t* nsjconf) {
LOG_I("Total number of spawned namespaces: %d", countProc(nsjconf));
time_t now = time(NULL);
for (const auto& pid : nsjconf->pids) {
- time_t diff = now - pid.start;
+ time_t diff = now - pid.second.start;
uint64_t left = nsjconf->tlimit ? nsjconf->tlimit - (uint64_t)diff : 0;
- LOG_I("PID: %d, Remote host: %s, Run time: %ld sec. (time left: %" PRId64 " sec.)",
- pid.pid, pid.remote_txt.c_str(), (long)diff, left);
+ LOG_I("pid=%d, Remote host: %s, Run time: %ld sec. (time left: %s s.)", pid.first,
+ pid.second.remote_txt.c_str(), (long)diff,
+ nsjconf->tlimit ? std::to_string(left).c_str() : "unlimited");
}
}
-static const pids_t* getPidElem(nsjconf_t* nsjconf, pid_t pid) {
- for (const auto& p : nsjconf->pids) {
- if (p.pid == pid) {
- return &p;
- }
- }
- return NULL;
-}
-
static void seccompViolation(nsjconf_t* nsjconf, siginfo_t* si) {
- LOG_W("PID: %d commited a syscall/seccomp violation and exited with SIGSYS", si->si_pid);
+ LOG_W("pid=%d committed a syscall/seccomp violation and exited with SIGSYS", si->si_pid);
- const pids_t* p = getPidElem(nsjconf, si->si_pid);
- if (p == NULL) {
- LOG_W("PID:%d SiSyscall: %d, SiCode: %d, SiErrno: %d, SiSigno: %d", (int)si->si_pid,
- si->si_syscall, si->si_code, si->si_errno, si->si_signo);
- LOG_E("Couldn't find pid element in the subproc list for PID: %d", (int)si->si_pid);
+ const auto& p = nsjconf->pids.find(si->si_pid);
+ if (p == nsjconf->pids.end()) {
+ LOG_W(
+ "pid=%d SiSyscall: %d, SiCode: %d, SiErrno: %d, SiSigno: %d. (If "
+ "SiSyscall==31, then it's most likely the SIGSYS value. See 'dmesg' or "
+ "'journalctl -ek' for possible auditd report with more data)",
+ (int)si->si_pid, si->si_syscall, si->si_code, si->si_errno, si->si_signo);
+ LOG_E("Couldn't find pid element in the subproc list for pid=%d", (int)si->si_pid);
return;
}
char buf[4096];
- ssize_t rdsize = util::readFromFd(p->pid_syscall_fd, buf, sizeof(buf) - 1);
+ ssize_t rdsize = util::readFromFd(p->second.pid_syscall_fd, buf, sizeof(buf) - 1);
if (rdsize < 1) {
- LOG_W("PID: %d, SiSyscall: %d, SiCode: %d, SiErrno: %d, SiSigno: %d",
+ LOG_W(
+ "pid=%d, SiSyscall: %d, SiCode: %d, SiErrno: %d, SiSigno: %d. (If "
+ "SiSyscall==31, then it's most likely the SIGSYS value. See 'dmesg' or "
+ "'journalctl -ek' for possible auditd report with more data)",
(int)si->si_pid, si->si_syscall, si->si_code, si->si_errno, si->si_signo);
return;
}
@@ -275,18 +292,22 @@ static void seccompViolation(nsjconf_t* nsjconf, siginfo_t* si) {
&arg4, &arg5, &arg6, &sp, &pc);
if (ret == 9) {
LOG_W(
- "PID: %d, Syscall number: %td, Arguments: %#tx, %#tx, %#tx, %#tx, %#tx, %#tx, "
+ "pid=%d, Syscall number: %td, Arguments: %#tx, %#tx, %#tx, %#tx, %#tx, %#tx, "
"SP: %#tx, PC: %#tx, si_syscall: %d, si_errno: %#x",
(int)si->si_pid, sc, arg1, arg2, arg3, arg4, arg5, arg6, sp, pc, si->si_syscall,
si->si_errno);
} else if (ret == 3) {
LOG_W(
- "PID: %d, SiSyscall: %d, SiCode: %d, SiErrno: %d, SiSigno: %d, SP: %#tx, PC: "
- "%#tx",
+ "pid=%d, SiSyscall: %d, SiCode: %d, SiErrno: %d, SiSigno: %d, SP: %#tx, PC: "
+ "%#tx (If SiSyscall==31, then it's most likely the SIGSYS value. See 'dmesg' "
+ "or 'journalctl -ek' for possible auditd report with more data)",
(int)si->si_pid, si->si_syscall, si->si_code, si->si_errno, si->si_signo, arg1,
arg2);
} else {
- LOG_W("PID: %d, SiSyscall: %d, SiCode: %d, SiErrno: %d, Syscall string '%s'",
+ LOG_W(
+ "pid=%d, SiSyscall: %d, SiCode: %d, SiErrno: %d, Syscall string '%s'. (If "
+ "SiSyscall==31, then it's most likely the SIGSYS value. See 'dmesg' or "
+ "'journalctl -ek' for possible auditd report with more data)",
(int)si->si_pid, si->si_syscall, si->si_code, si->si_errno, buf);
}
}
@@ -295,22 +316,26 @@ static int reapProc(nsjconf_t* nsjconf, pid_t pid, bool should_wait = false) {
int status;
if (wait4(pid, &status, should_wait ? 0 : WNOHANG, NULL) == pid) {
- cgroup::finishFromParent(nsjconf, pid);
+ if (nsjconf->use_cgroupv2) {
+ cgroup2::finishFromParent(nsjconf, pid);
+ } else {
+ cgroup::finishFromParent(nsjconf, pid);
+ }
std::string remote_txt = "[UNKNOWN]";
- const pids_t* elem = getPidElem(nsjconf, pid);
- if (elem) {
- remote_txt = elem->remote_txt;
+ const auto& p = nsjconf->pids.find(pid);
+ if (p != nsjconf->pids.end()) {
+ remote_txt = p->second.remote_txt;
}
if (WIFEXITED(status)) {
- LOG_I("PID: %d (%s) exited with status: %d, (PIDs left: %d)", pid,
+ LOG_I("pid=%d (%s) exited with status: %d, (PIDs left: %d)", pid,
remote_txt.c_str(), WEXITSTATUS(status), countProc(nsjconf) - 1);
removeProc(nsjconf, pid);
return WEXITSTATUS(status);
}
if (WIFSIGNALED(status)) {
- LOG_I("PID: %d (%s) terminated with signal: %s (%d), (PIDs left: %d)", pid,
+ LOG_I("pid=%d (%s) terminated with signal: %s (%d), (PIDs left: %d)", pid,
remote_txt.c_str(), util::sigName(WTERMSIG(status)).c_str(),
WTERMSIG(status), countProc(nsjconf) - 1);
removeProc(nsjconf, pid);
@@ -343,20 +368,19 @@ int reapProc(nsjconf_t* nsjconf) {
if (nsjconf->tlimit == 0) {
continue;
}
- pid_t pid = p.pid;
- time_t diff = now - p.start;
+ pid_t pid = p.first;
+ time_t diff = now - p.second.start;
if ((uint64_t)diff >= nsjconf->tlimit) {
- LOG_I("PID: %d run time >= time limit (%ld >= %" PRIu64
- ") (%s). Killing it",
- pid, (long)diff, nsjconf->tlimit, p.remote_txt.c_str());
+ LOG_I("pid=%d run time >= time limit (%ld >= %" PRIu64 ") (%s). Killing it",
+ pid, (long)diff, nsjconf->tlimit, p.second.remote_txt.c_str());
/*
* Probably a kernel bug - some processes cannot be killed with KILL if
* they're namespaced, and in a stopped state
*/
kill(pid, SIGCONT);
- LOG_D("Sent SIGCONT to PID: %d", pid);
+ LOG_D("Sent SIGCONT to pid=%d", pid);
kill(pid, SIGKILL);
- LOG_D("Sent SIGKILL to PID: %d", pid);
+ LOG_D("Sent SIGKILL to pid=%d", pid);
}
}
return rv;
@@ -364,7 +388,7 @@ int reapProc(nsjconf_t* nsjconf) {
void killAndReapAll(nsjconf_t* nsjconf) {
while (!nsjconf->pids.empty()) {
- pid_t pid = nsjconf->pids.front().pid;
+ pid_t pid = nsjconf->pids.begin()->first;
if (kill(pid, SIGKILL) == 0) {
reapProc(nsjconf, pid, true);
} else {
@@ -375,15 +399,22 @@ void killAndReapAll(nsjconf_t* nsjconf) {
static bool initParent(nsjconf_t* nsjconf, pid_t pid, int pipefd) {
if (!net::initNsFromParent(nsjconf, pid)) {
- LOG_E("Couldn't initialize net namespace for pid '%d'", pid);
+ LOG_E("Couldn't initialize net namespace for pid=%d", pid);
return false;
}
- if (!cgroup::initNsFromParent(nsjconf, pid)) {
- LOG_E("Couldn't initialize cgroup user namespace for pid '%d'", pid);
+
+ if (nsjconf->use_cgroupv2) {
+ if (!cgroup2::initNsFromParent(nsjconf, pid)) {
+ LOG_E("Couldn't initialize cgroup 2 user namespace for pid=%d", pid);
+ exit(0xff);
+ }
+ } else if (!cgroup::initNsFromParent(nsjconf, pid)) {
+ LOG_E("Couldn't initialize cgroup user namespace for pid=%d", pid);
exit(0xff);
}
+
if (!user::initNsFromParent(nsjconf, pid)) {
- LOG_E("Couldn't initialize user namespace for pid %d", pid);
+ LOG_E("Couldn't initialize user namespace for pid=%d", pid);
return false;
}
if (!util::writeToFd(pipefd, &kSubprocDoneChar, sizeof(kSubprocDoneChar))) {
@@ -393,9 +424,9 @@ static bool initParent(nsjconf_t* nsjconf, pid_t pid, int pipefd) {
return true;
}
-bool runChild(nsjconf_t* nsjconf, int fd_in, int fd_out, int fd_err) {
- if (!net::limitConns(nsjconf, fd_in)) {
- return true;
+pid_t runChild(nsjconf_t* nsjconf, int netfd, int fd_in, int fd_out, int fd_err) {
+ if (!net::limitConns(nsjconf, netfd)) {
+ return 0;
}
unsigned long flags = 0UL;
flags |= (nsjconf->clone_newnet ? CLONE_NEWNET : 0);
@@ -405,54 +436,48 @@ bool runChild(nsjconf_t* nsjconf, int fd_in, int fd_out, int fd_err) {
flags |= (nsjconf->clone_newipc ? CLONE_NEWIPC : 0);
flags |= (nsjconf->clone_newuts ? CLONE_NEWUTS : 0);
flags |= (nsjconf->clone_newcgroup ? CLONE_NEWCGROUP : 0);
+ flags |= (nsjconf->clone_newtime ? CLONE_NEWTIME : 0);
if (nsjconf->mode == MODE_STANDALONE_EXECVE) {
+ LOG_D("unshare(flags: %s)", cloneFlagsToStr(flags).c_str());
if (unshare(flags) == -1) {
PLOG_F("unshare(%s)", cloneFlagsToStr(flags).c_str());
}
- subprocNewProc(nsjconf, fd_in, fd_out, fd_err, -1);
+ subprocNewProc(nsjconf, netfd, fd_in, fd_out, fd_err, -1);
LOG_F("Launching new process failed");
}
- flags |= SIGCHLD;
- LOG_D("Creating new process with clone flags:%s", cloneFlagsToStr(flags).c_str());
+ LOG_D("Creating new process with clone flags:%s and exit_signal:SIGCHLD",
+ cloneFlagsToStr(flags).c_str());
int sv[2];
if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, sv) == -1) {
PLOG_E("socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC) failed");
- return false;
+ return -1;
}
int child_fd = sv[0];
int parent_fd = sv[1];
- pid_t pid = cloneProc(flags);
+ pid_t pid = cloneProc(flags, SIGCHLD);
if (pid == 0) {
close(parent_fd);
- subprocNewProc(nsjconf, fd_in, fd_out, fd_err, child_fd);
+ subprocNewProc(nsjconf, netfd, fd_in, fd_out, fd_err, child_fd);
util::writeToFd(child_fd, &kSubprocErrorChar, sizeof(kSubprocErrorChar));
LOG_F("Launching child process failed");
}
close(child_fd);
if (pid == -1) {
- if (flags & CLONE_NEWCGROUP) {
- PLOG_E(
- "nsjail tried to use the CLONE_NEWCGROUP clone flag, which is "
- "supported under kernel versions >= 4.6 only. Try disabling this flag");
- }
- PLOG_E(
- "clone(flags=%s) failed. You probably need root privileges if your system "
- "doesn't support CLONE_NEWUSER. Alternatively, you might want to recompile "
- "your kernel with support for namespaces or check the current value of the "
- "kernel.unprivileged_userns_clone sysctl",
- cloneFlagsToStr(flags).c_str());
+ auto saved_errno = errno;
+ PLOG_W("clone(flags=%s) failed", cloneFlagsToStr(flags).c_str());
close(parent_fd);
- return false;
+ errno = saved_errno;
+ return pid;
}
- addProc(nsjconf, pid, fd_in);
+ addProc(nsjconf, pid, netfd);
if (!initParent(nsjconf, pid, parent_fd)) {
close(parent_fd);
- return false;
+ return -1;
}
char rcvChar;
@@ -460,11 +485,11 @@ bool runChild(nsjconf_t* nsjconf, int fd_in, int fd_out, int fd_err) {
rcvChar == kSubprocErrorChar) {
LOG_W("Received error message from the child process before it has been executed");
close(parent_fd);
- return false;
+ return -1;
}
close(parent_fd);
- return true;
+ return pid;
}
/*
@@ -485,9 +510,45 @@ static int cloneFunc(void* arg __attribute__((unused))) {
* update the internal PID/TID caches, what can lead to invalid values being returned by getpid()
* or incorrect PID/TIDs used in raise()/abort() functions
*/
-pid_t cloneProc(uintptr_t flags) {
+pid_t cloneProc(uintptr_t flags, int exit_signal) {
+ exit_signal &= CSIGNAL;
+
if (flags & CLONE_VM) {
LOG_E("Cannot use clone(flags & CLONE_VM)");
+ errno = 0;
+ return -1;
+ }
+
+ if (flags & CLONE_NEWTIME) {
+ LOG_W(
+ "CLONE_NEWTIME reuqested, but it's only supported with the unshare() mode "
+ "(-Me)");
+ }
+
+#if defined(__NR_clone3)
+ struct clone_args ca = {
+ .flags = (uint64_t)flags,
+ .pidfd = 0,
+ .child_tid = 0,
+ .parent_tid = 0,
+ .exit_signal = (uint64_t)exit_signal,
+ .stack = 0,
+ .stack_size = 0,
+ .tls = 0,
+ .set_tid = 0,
+ .set_tid_size = 0,
+ .cgroup = 0,
+ };
+
+ pid_t ret = util::syscall(__NR_clone3, (uintptr_t)&ca, sizeof(ca));
+ if (ret != -1 || errno != ENOSYS) {
+ return ret;
+ }
+#endif /* defined(__NR_clone3) */
+
+ if (flags & CLONE_NEWTIME) {
+ LOG_E("CLONE_NEWTIME was requested but clone3() is not supported");
+ errno = 0;
return -1;
}
@@ -500,7 +561,7 @@ pid_t cloneProc(uintptr_t flags) {
*/
void* stack = &cloneStack[sizeof(cloneStack) / 2];
/* Parent */
- return clone(cloneFunc, stack, flags, NULL, NULL, NULL);
+ return clone(cloneFunc, stack, flags | exit_signal, NULL, NULL, NULL);
}
/* Child */
return 0;
@@ -557,7 +618,7 @@ int systemExe(const std::vector<std::string>& args, char** env) {
}
if (WIFEXITED(status)) {
int exit_code = WEXITSTATUS(status);
- LOG_D("PID %d exited with exit code: %d", pid, exit_code);
+ LOG_D("pid=%d exited with exit code: %d", pid, exit_code);
if (exec_failed) {
return -1;
} else if (exit_code == 0) {
@@ -568,7 +629,7 @@ int systemExe(const std::vector<std::string>& args, char** env) {
}
if (WIFSIGNALED(status)) {
int exit_signal = WTERMSIG(status);
- LOG_W("PID %d killed by signal: %d (%s)", pid, exit_signal,
+ LOG_W("pid=%d killed by signal: %d (%s)", pid, exit_signal,
util::sigName(exit_signal).c_str());
return 2;
}
diff --git a/subproc.h b/subproc.h
index 33e2b5c..d3e1696 100644
--- a/subproc.h
+++ b/subproc.h
@@ -33,14 +33,15 @@
namespace subproc {
-bool runChild(nsjconf_t* nsjconf, int fd_in, int fd_out, int fd_err);
+/* Returns the pid of the spawned child on success, 0 if the network connection limit was reached, -1 on error. The first fd argument (named netfd in subproc.cc) is the connection used for connection limiting and logging; standalone mode passes -1. */
+pid_t runChild(nsjconf_t* nsjconf, int listen_fd, int fd_in, int fd_out, int fd_err);
int countProc(nsjconf_t* nsjconf);
void displayProc(nsjconf_t* nsjconf);
void killAndReapAll(nsjconf_t* nsjconf);
/* Returns the exit code of the first failing subprocess, or 0 if none fail */
int reapProc(nsjconf_t* nsjconf);
int systemExe(const std::vector<std::string>& args, char** env);
-pid_t cloneProc(uintptr_t flags);
+pid_t cloneProc(uintptr_t flags, int exit_signal);
} // namespace subproc
diff --git a/user.cc b/user.cc
index 4053884..a335e2d 100644
--- a/user.cc
+++ b/user.cc
@@ -43,17 +43,33 @@
#include "subproc.h"
#include "util.h"
+#define STR_(x) #x
+#define STR(x) STR_(x)
+/* STR() goes through STR_() so that its argument is macro-expanded before '#' turns it into a string literal. */
+constexpr char kNewUidPath[] = /* path of the newuidmap helper run by uidMapExternal(); NEWUIDMAP_PATH overrides the default when defined (presumably as an unquoted path, since it is stringified - confirm against the build files) */
+#ifdef NEWUIDMAP_PATH
+ STR(NEWUIDMAP_PATH);
+#else
+ "/usr/bin/newuidmap";
+#endif
+constexpr char kNewGidPath[] = /* path of the newgidmap helper run by gidMapExternal(); NEWGIDMAP_PATH overrides the default when defined */
+#ifdef NEWGIDMAP_PATH
+ STR(NEWGIDMAP_PATH);
+#else
+ "/usr/bin/newgidmap";
+#endif
+
namespace user {
static bool setResGid(gid_t gid) {
LOG_D("setresgid(%d)", gid);
#if defined(__NR_setresgid32)
- if (syscall(__NR_setresgid32, (uintptr_t)gid, (uintptr_t)gid, (uintptr_t)gid) == -1) {
+ if (util::syscall(__NR_setresgid32, (uintptr_t)gid, (uintptr_t)gid, (uintptr_t)gid) == -1) {
PLOG_W("setresgid32(%d)", (int)gid);
return false;
}
#else /* defined(__NR_setresgid32) */
- if (syscall(__NR_setresgid, (uintptr_t)gid, (uintptr_t)gid, (uintptr_t)gid) == -1) {
+ if (util::syscall(__NR_setresgid, (uintptr_t)gid, (uintptr_t)gid, (uintptr_t)gid) == -1) {
PLOG_W("setresgid(%d)", gid);
return false;
}
@@ -64,12 +80,12 @@ static bool setResGid(gid_t gid) {
static bool setResUid(uid_t uid) {
LOG_D("setresuid(%d)", uid);
#if defined(__NR_setresuid32)
- if (syscall(__NR_setresuid32, (uintptr_t)uid, (uintptr_t)uid, (uintptr_t)uid) == -1) {
+ if (util::syscall(__NR_setresuid32, (uintptr_t)uid, (uintptr_t)uid, (uintptr_t)uid) == -1) {
PLOG_W("setresuid32(%d)", (int)uid);
return false;
}
#else /* defined(__NR_setresuid32) */
- if (syscall(__NR_setresuid, (uintptr_t)uid, (uintptr_t)uid, (uintptr_t)uid) == -1) {
+ if (util::syscall(__NR_setresuid, (uintptr_t)uid, (uintptr_t)uid, (uintptr_t)uid) == -1) {
PLOG_W("setresuid(%d)", uid);
return false;
}
@@ -77,18 +93,27 @@ static bool setResUid(uid_t uid) {
return true;
}
-static bool setGroups(pid_t pid) {
+static bool hasGidMapSelf(nsjconf_t* nsjconf) { /* Returns true if at least one entry of nsjconf->gids has is_newidmap unset. */
+ for (const auto& gid : nsjconf->gids) {
+ if (!gid.is_newidmap) { /* gidMapExternal() skips exactly these entries, so they are not passed to newgidmap */
+ return true;
+ }
+ }
+ return false; /* no entries at all, or every entry has is_newidmap set */
+}
+
+static bool setGroupsDeny(nsjconf_t* nsjconf, pid_t pid) {
/*
* No need to write 'deny' to /proc/pid/setgroups if our euid==0, as writing to
* uid_map/gid_map will succeed anyway
*/
- if (geteuid() == 0) {
+ if (!nsjconf->clone_newuser || nsjconf->orig_euid == 0 || !hasGidMapSelf(nsjconf)) {
return true;
}
char fname[PATH_MAX];
snprintf(fname, sizeof(fname), "/proc/%d/setgroups", pid);
- const char* denystr = "deny";
+ const char* const denystr = "deny";
if (!util::writeBufToFile(fname, denystr, strlen(denystr), O_WRONLY | O_CLOEXEC)) {
LOG_E("util::writeBufToFile('%s', '%s') failed", fname, denystr);
return false;
@@ -152,11 +177,11 @@ static bool gidMapSelf(nsjconf_t* nsjconf, pid_t pid) {
return true;
}
-/* Use /usr/bin/newgidmap for writing the gid map */
-static bool gidMapExternal(nsjconf_t* nsjconf, pid_t pid UNUSED) {
+/* Use newgidmap for writing the gid map */
+static bool gidMapExternal(nsjconf_t* nsjconf, pid_t pid) {
bool use = false;
- std::vector<std::string> argv = {"/usr/bin/newgidmap", std::to_string(pid)};
+ std::vector<std::string> argv = {kNewGidPath, std::to_string(pid)};
for (const auto& gid : nsjconf->gids) {
if (!gid.is_newidmap) {
continue;
@@ -171,18 +196,18 @@ static bool gidMapExternal(nsjconf_t* nsjconf, pid_t pid UNUSED) {
return true;
}
if (subproc::systemExe(argv, environ) != 0) {
- LOG_E("'/usr/bin/newgidmap' failed");
+ LOG_E("'%s' failed", kNewGidPath);
return false;
}
return true;
}
-/* Use /usr/bin/newuidmap for writing the uid map */
-static bool uidMapExternal(nsjconf_t* nsjconf, pid_t pid UNUSED) {
+/* Use newuidmap for writing the uid map */
+static bool uidMapExternal(nsjconf_t* nsjconf, pid_t pid) {
bool use = false;
- std::vector<std::string> argv = {"/usr/bin/newuidmap", std::to_string(pid)};
+ std::vector<std::string> argv = {kNewUidPath, std::to_string(pid)};
for (const auto& uid : nsjconf->uids) {
if (!uid.is_newidmap) {
continue;
@@ -197,7 +222,7 @@ static bool uidMapExternal(nsjconf_t* nsjconf, pid_t pid UNUSED) {
return true;
}
if (subproc::systemExe(argv, environ) != 0) {
- LOG_E("'/usr/bin/newuidmap' failed");
+ LOG_E("'%s' failed", kNewUidPath);
return false;
}
@@ -214,7 +239,7 @@ static bool uidGidMap(nsjconf_t* nsjconf, pid_t pid) {
}
bool initNsFromParent(nsjconf_t* nsjconf, pid_t pid) {
- if (!setGroups(pid)) {
+ if (!setGroupsDeny(nsjconf, pid)) {
return false;
}
if (!nsjconf->clone_newuser) {
@@ -227,13 +252,8 @@ bool initNsFromParent(nsjconf_t* nsjconf, pid_t pid) {
}
bool initNsFromChild(nsjconf_t* nsjconf) {
- /*
- * Best effort because of /proc/self/setgroups
- */
- LOG_D("setgroups(0, NULL)");
- const gid_t* group_list = NULL;
- if (setgroups(0, group_list) == -1) {
- PLOG_D("setgroups(NULL) failed");
+ if (!nsjconf->clone_newuser && nsjconf->orig_euid != 0) {
+ return true;
}
/*
@@ -246,12 +266,48 @@ bool initNsFromChild(nsjconf_t* nsjconf) {
return false;
}
+ /*
+ * Best effort because of /proc/self/setgroups. We deny
+ * setgroups(2) calls only if user namespaces are in use.
+ */
+ std::vector<gid_t> groups;
+ std::string groupsString = "[";
+ if (!nsjconf->clone_newuser && nsjconf->gids.size() > 1) {
+ for (auto it = nsjconf->gids.begin() + 1; it != nsjconf->gids.end(); it++) {
+ groups.push_back(it->inside_id);
+ groupsString += std::to_string(it->inside_id);
+ if (it < nsjconf->gids.end() - 1) groupsString += ", ";
+ }
+ }
+ groupsString += "]";
+
if (!setResGid(nsjconf->gids[0].inside_id)) {
- PLOG_E("setresgid(%u)", nsjconf->gids[0].inside_id);
+ PLOG_E("setresgid(%lu)", (unsigned long)nsjconf->gids[0].inside_id);
return false;
}
+
+ LOG_D("setgroups(%zu, %s)", groups.size(), groupsString.c_str());
+ if (setgroups(groups.size(), groups.data()) == -1) {
+ /* Indicate error if specific groups were requested */
+ if (groups.size() > 0) {
+ PLOG_E("setgroups(%zu, %s) failed", groups.size(), groupsString.c_str());
+ return false;
+ }
+ PLOG_D("setgroups(%zu, %s) failed", groups.size(), groupsString.c_str());
+ }
+
if (!setResUid(nsjconf->uids[0].inside_id)) {
- PLOG_E("setresuid(%u)", nsjconf->uids[0].inside_id);
+ PLOG_E("setresuid(%lu)", (unsigned long)nsjconf->uids[0].inside_id);
+ return false;
+ }
+
+ /*
+ * Disable securebits again to avoid spawned programs
+ * unexpectedly retaining capabilities after a UID/GID
+ * change.
+ */
+ if (prctl(PR_SET_SECUREBITS, 0UL, 0UL, 0UL, 0UL) == -1) {
+ PLOG_E("prctl(PR_SET_SECUREBITS, 0)");
return false;
}
diff --git a/util.cc b/util.cc
index 35e1749..c3088b6 100644
--- a/util.cc
+++ b/util.cc
@@ -64,8 +64,7 @@ ssize_t readFromFd(int fd, void* buf, size_t len) {
}
ssize_t readFromFile(const char* fname, void* buf, size_t len) {
- int fd;
- TEMP_FAILURE_RETRY(fd = open(fname, O_RDONLY | O_CLOEXEC));
+ int fd = TEMP_FAILURE_RETRY(open(fname, O_RDONLY | O_CLOEXEC));
if (fd == -1) {
LOG_E("open('%s', O_RDONLY|O_CLOEXEC)", fname);
return -1;
@@ -212,11 +211,11 @@ static const uint64_t c = 1442695040888963407ULL;
static void rndInitThread(void) {
#if defined(__NR_getrandom)
- if (syscall(__NR_getrandom, &rndX, sizeof(rndX), 0) == sizeof(rndX)) {
+ if (util::syscall(__NR_getrandom, (uintptr_t)&rndX, sizeof(rndX), 0) == sizeof(rndX)) {
return;
}
#endif /* defined(__NR_getrandom) */
- int fd = open("/dev/urandom", O_RDONLY | O_CLOEXEC);
+ int fd = TEMP_FAILURE_RETRY(open("/dev/urandom", O_RDONLY | O_CLOEXEC));
if (fd == -1) {
PLOG_D(
"Couldn't open /dev/urandom for reading. Using gettimeofday "
@@ -317,4 +316,9 @@ std::vector<std::string> strSplit(const std::string str, char delim) {
return vec;
}
+long syscall(long sysno, uintptr_t a0, uintptr_t a1, uintptr_t a2, uintptr_t a3, uintptr_t a4, /* util::syscall(): fixed-arity wrapper that always forwards six uintptr_t arguments */
+ uintptr_t a5) {
+ return ::syscall(sysno, a0, a1, a2, a3, a4, a5); /* '::' selects the global-namespace syscall(); an unqualified call inside namespace util would resolve to this wrapper itself */
+}
+
} // namespace util
diff --git a/util.h b/util.h
index 357b606..38bb48d 100644
--- a/util.h
+++ b/util.h
@@ -22,6 +22,7 @@
#ifndef NS_UTIL_H
#define NS_UTIL_H
+#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
@@ -53,6 +54,8 @@ uint64_t rnd64(void);
const std::string sigName(int signo);
const std::string timeToStr(time_t t);
std::vector<std::string> strSplit(const std::string str, char delim);
+long syscall(long sysno, uintptr_t a0 = 0, uintptr_t a1 = 0, uintptr_t a2 = 0, uintptr_t a3 = 0, /* every argument slot defaults to 0, so callers pass only the arguments they need */
+ uintptr_t a4 = 0, uintptr_t a5 = 0);
} // namespace util