aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--METADATA4
-rw-r--r--README.md202
-rw-r--r--cgroup.cc25
-rw-r--r--cgroup2.cc20
-rw-r--r--cmdline.cc30
-rw-r--r--config.cc8
-rw-r--r--config.proto6
-rw-r--r--configs/apache.cfg2
-rw-r--r--configs/bash-with-fake-geteuid.cfg2
-rw-r--r--configs/demo-dont-use-chrome-with-net.cfg2
-rw-r--r--configs/firefox-with-cloned-net.cfg2
-rw-r--r--configs/firefox-with-net-wayland.cfg2
-rw-r--r--configs/firefox-with-net.cfg2
-rw-r--r--configs/home-documents-with-xorg-no-net.cfg2
-rw-r--r--configs/imagemagick-convert.cfg4
-rw-r--r--configs/xchat-with-net.cfg2
-rw-r--r--configs/znc-with-net.cfg2
-rw-r--r--contain.cc17
-rw-r--r--logs.cc4
-rw-r--r--logs.h28
-rw-r--r--macros.h2
-rw-r--r--mnt.cc61
-rw-r--r--nsjail.18
-rw-r--r--nsjail.h3
-rw-r--r--subproc.cc65
25 files changed, 310 insertions, 195 deletions
diff --git a/METADATA b/METADATA
index 865cf83..3211db4 100644
--- a/METADATA
+++ b/METADATA
@@ -12,7 +12,7 @@ third_party {
type: GIT
value: "https://github.com/google/nsjail"
}
- version: "c7a313123b3dcb845ed3822b99ad9db69a6a82c8"
- last_upgrade_date { year: 2019 month: 1 day: 10 }
+ version: "6483728e2490c1fc497a81bba5682515eb489cf8"
+ last_upgrade_date { year: 2022 month: 3 day: 29 }
license_type: NOTICE
}
diff --git a/README.md b/README.md
index f3f2ccf..37e61a4 100644
--- a/README.md
+++ b/README.md
@@ -339,177 +339,203 @@ The command-line options should be self-explanatory, while the proto-buf config
Usage: ./nsjail [options] -- path_to_command [args]
Options:
--help|-h
- Help plz..
+ Help plz..
--mode|-M VALUE
- Execution mode (default: 'o' [MODE_STANDALONE_ONCE]):
- l: Wait for connections on a TCP port (specified with --port) [MODE_LISTEN_TCP]
- o: Launch a single process on the console using clone/execve [MODE_STANDALONE_ONCE]
- e: Launch a single process on the console using execve [MODE_STANDALONE_EXECVE]
- r: Launch a single process on the console with clone/execve, keep doing it forever [MODE_STANDALONE_RERUN]
+ Execution mode (default: 'o' [MODE_STANDALONE_ONCE]):
+ l: Wait for connections on a TCP port (specified with --port) [MODE_LISTEN_TCP]
+ o: Launch a single process on the console using clone/execve [MODE_STANDALONE_ONCE]
+ e: Launch a single process on the console using execve [MODE_STANDALONE_EXECVE]
+ r: Launch a single process on the console with clone/execve, keep doing it forever [MODE_STANDALONE_RERUN]
--config|-C VALUE
- Configuration file in the config.proto ProtoBuf format (see configs/ directory for examples)
+ Configuration file in the config.proto ProtoBuf format (see configs/ directory for examples)
--exec_file|-x VALUE
- File to exec (default: argv[0])
+ File to exec (default: argv[0])
--execute_fd
- Use execveat() to execute a file-descriptor instead of executing the binary path. In such case argv[0]/exec_file denotes a file path before mount namespacing
+ Use execveat() to execute a file-descriptor instead of executing the binary path. In such case argv[0]/exec_file denotes a file path before mount namespacing
--chroot|-c VALUE
- Directory containing / of the jail (default: none)
+ Directory containing / of the jail (default: none)
+ --no_pivotroot
+ When creating a mount namespace, use mount(MS_MOVE) and chroot rather than pivot_root. Useful when pivot_root is disallowed (e.g. initramfs). Note: escapable in some configurations
--rw
- Mount chroot dir (/) R/W (default: R/O)
+ Mount chroot dir (/) R/W (default: R/O)
--user|-u VALUE
- Username/uid of processes inside the jail (default: your current uid). You can also use inside_ns_uid:outside_ns_uid:count convention here. Can be specified multiple times
+ Username/uid of processes inside the jail (default: your current uid). You can also use inside_ns_uid:outside_ns_uid:count convention here. Can be specified multiple times
--group|-g VALUE
- Groupname/gid of processes inside the jail (default: your current gid). You can also use inside_ns_gid:global_ns_gid:count convention here. Can be specified multiple times
+ Groupname/gid of processes inside the jail (default: your current gid). You can also use inside_ns_gid:global_ns_gid:count convention here. Can be specified multiple times
--hostname|-H VALUE
- UTS name (hostname) of the jail (default: 'NSJAIL')
+ UTS name (hostname) of the jail (default: 'NSJAIL')
--cwd|-D VALUE
- Directory in the namespace the process will run (default: '/')
+ Directory in the namespace the process will run (default: '/')
--port|-p VALUE
- TCP port to bind to (enables MODE_LISTEN_TCP) (default: 0)
+ TCP port to bind to (enables MODE_LISTEN_TCP) (default: 0)
--bindhost VALUE
- IP address to bind the port to (only in [MODE_LISTEN_TCP]), (default: '::')
+ IP address to bind the port to (only in [MODE_LISTEN_TCP]), (default: '::')
--max_conns VALUE
- Maximum number of connections across all IPs (only in [MODE_LISTEN_TCP]), (default: 0 (unlimited))
+ Maximum number of connections across all IPs (only in [MODE_LISTEN_TCP]), (default: 0 (unlimited))
--max_conns_per_ip|-i VALUE
- Maximum number of connections per one IP (only in [MODE_LISTEN_TCP]), (default: 0 (unlimited))
+ Maximum number of connections per one IP (only in [MODE_LISTEN_TCP]), (default: 0 (unlimited))
--log|-l VALUE
- Log file (default: use log_fd)
+ Log file (default: use log_fd)
--log_fd|-L VALUE
- Log FD (default: 2)
+ Log FD (default: 2)
--time_limit|-t VALUE
- Maximum time that a jail can exist, in seconds (default: 600)
+ Maximum time that a jail can exist, in seconds (default: 600)
--max_cpus VALUE
- Maximum number of CPUs a single jailed process can use (default: 0 'no limit')
+ Maximum number of CPUs a single jailed process can use (default: 0 'no limit')
--daemon|-d
- Daemonize after start
+ Daemonize after start
--verbose|-v
- Verbose output
+ Verbose output
--quiet|-q
- Log warning and more important messages only
+ Log warning and more important messages only
--really_quiet|-Q
- Log fatal messages only
+ Log fatal messages only
--keep_env|-e
- Pass all environment variables to the child process (default: all envvars are cleared)
+ Pass all environment variables to the child process (default: all envars are cleared)
--env|-E VALUE
- Additional environment variable (can be used multiple times)
+ Additional environment variable (can be used multiple times). If the envar doesn't contain '=' (e.g. just the 'DISPLAY' string), the current envar value will be used
--keep_caps
- Don't drop any capabilities
+ Don't drop any capabilities
--cap VALUE
- Retain this capability, e.g. CAP_PTRACE (can be specified multiple times)
+ Retain this capability, e.g. CAP_PTRACE (can be specified multiple times)
--silent
- Redirect child process' fd:0/1/2 to /dev/null
+ Redirect child process' fd:0/1/2 to /dev/null
--stderr_to_null
- Redirect FD=2 (STDERR_FILENO) to /dev/null
+ Redirect child process' fd:2 (STDERR_FILENO) to /dev/null
--skip_setsid
- Don't call setsid(), allows for terminal signal handling in the sandboxed process. Dangerous
+ Don't call setsid(), allows for terminal signal handling in the sandboxed process. Dangerous
--pass_fd VALUE
- Don't close this FD before executing the child process (can be specified multiple times), by default: 0/1/2 are kept open
+ Don't close this FD before executing the child process (can be specified multiple times), by default: 0/1/2 are kept open
--disable_no_new_privs
- Don't set the prctl(NO_NEW_PRIVS, 1) (DANGEROUS)
+ Don't set the prctl(NO_NEW_PRIVS, 1) (DANGEROUS)
--rlimit_as VALUE
- RLIMIT_AS in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 512)
+ RLIMIT_AS in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 4096)
--rlimit_core VALUE
- RLIMIT_CORE in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 0)
+ RLIMIT_CORE in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 0)
--rlimit_cpu VALUE
- RLIMIT_CPU, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 600)
+ RLIMIT_CPU, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 600)
--rlimit_fsize VALUE
- RLIMIT_FSIZE in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 1)
+ RLIMIT_FSIZE in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 1)
--rlimit_nofile VALUE
- RLIMIT_NOFILE, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 32)
+ RLIMIT_NOFILE, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 32)
--rlimit_nproc VALUE
- RLIMIT_NPROC, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 'soft')
+ RLIMIT_NPROC, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 'soft')
--rlimit_stack VALUE
- RLIMIT_STACK in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 'soft')
+ RLIMIT_STACK in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 'soft')
+ --rlimit_memlock VALUE
+ RLIMIT_MEMLOCK in KB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 'soft')
+ --rlimit_rtprio VALUE
+ RLIMIT_RTPRIO, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 'soft')
+ --rlimit_msgqueue VALUE
+ RLIMIT_MSGQUEUE in bytes, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 'soft')
+ --disable_rlimits
+ Disable all rlimits, default to limits set by parent
--persona_addr_compat_layout
- personality(ADDR_COMPAT_LAYOUT)
+ personality(ADDR_COMPAT_LAYOUT)
--persona_mmap_page_zero
- personality(MMAP_PAGE_ZERO)
+ personality(MMAP_PAGE_ZERO)
--persona_read_implies_exec
- personality(READ_IMPLIES_EXEC)
+ personality(READ_IMPLIES_EXEC)
--persona_addr_limit_3gb
- personality(ADDR_LIMIT_3GB)
+ personality(ADDR_LIMIT_3GB)
--persona_addr_no_randomize
- personality(ADDR_NO_RANDOMIZE)
+ personality(ADDR_NO_RANDOMIZE)
--disable_clone_newnet|-N
- Don't use CLONE_NEWNET. Enable global networking inside the jail
+ Don't use CLONE_NEWNET. Enable global networking inside the jail
--disable_clone_newuser
- Don't use CLONE_NEWUSER. Requires euid==0
+ Don't use CLONE_NEWUSER. Requires euid==0
--disable_clone_newns
- Don't use CLONE_NEWNS
+ Don't use CLONE_NEWNS
--disable_clone_newpid
- Don't use CLONE_NEWPID
+ Don't use CLONE_NEWPID
--disable_clone_newipc
- Don't use CLONE_NEWIPC
+ Don't use CLONE_NEWIPC
--disable_clone_newuts
- Don't use CLONE_NEWUTS
+ Don't use CLONE_NEWUTS
--disable_clone_newcgroup
- Don't use CLONE_NEWCGROUP. Might be required for kernel versions < 4.6
+ Don't use CLONE_NEWCGROUP. Might be required for kernel versions < 4.6
+ --enable_clone_newtime
+ Use CLONE_NEWTIME. Supported with kernel versions >= 5.6
--uid_mapping|-U VALUE
- Add a custom uid mapping of the form inside_uid:outside_uid:count. Setting this requires newuidmap (set-uid) to be present
+ Add a custom uid mapping of the form inside_uid:outside_uid:count. Setting this requires newuidmap (set-uid) to be present
--gid_mapping|-G VALUE
- Add a custom gid mapping of the form inside_gid:outside_gid:count. Setting this requires newgidmap (set-uid) to be present
+ Add a custom gid mapping of the form inside_gid:outside_gid:count. Setting this requires newgidmap (set-uid) to be present
--bindmount_ro|-R VALUE
- List of mountpoints to be mounted --bind (ro) inside the container. Can be specified multiple times. Supports 'source' syntax, or 'source:dest'
+ List of mountpoints to be mounted --bind (ro) inside the container. Can be specified multiple times. Supports 'source' syntax, or 'source:dest'
--bindmount|-B VALUE
- List of mountpoints to be mounted --bind (rw) inside the container. Can be specified multiple times. Supports 'source' syntax, or 'source:dest'
+ List of mountpoints to be mounted --bind (rw) inside the container. Can be specified multiple times. Supports 'source' syntax, or 'source:dest'
--tmpfsmount|-T VALUE
- List of mountpoints to be mounted as tmpfs (R/W) inside the container. Can be specified multiple times. Supports 'dest' syntax. Alternatively, use '-m none:dest:tmpfs:size=8388608'
+ List of mountpoints to be mounted as tmpfs (R/W) inside the container. Can be specified multiple times. Supports 'dest' syntax. Alternatively, use '-m none:dest:tmpfs:size=8388608'
--mount|-m VALUE
- Arbitrary mount, format src:dst:fs_type:options
+ Arbitrary mount, format src:dst:fs_type:options
--symlink|-s VALUE
- Symlink, format src:dst
+ Symlink, format src:dst
--disable_proc
- Disable mounting procfs in the jail
+ Disable mounting procfs in the jail
--proc_path VALUE
- Path used to mount procfs (default: '/proc')
+ Path used to mount procfs (default: '/proc')
--proc_rw
- Is procfs mounted as R/W (default: R/O)
+ Is procfs mounted as R/W (default: R/O)
--seccomp_policy|-P VALUE
- Path to file containing seccomp-bpf policy (see kafel/)
+ Path to file containing seccomp-bpf policy (see kafel/)
--seccomp_string VALUE
- String with kafel seccomp-bpf policy (see kafel/)
+ String with kafel seccomp-bpf policy (see kafel/)
--seccomp_log
- Use SECCOMP_FILTER_FLAG_LOG. Log all actions except SECCOMP_RET_ALLOW). Supported since kernel version 4.14
+ Use SECCOMP_FILTER_FLAG_LOG. Log all actions except SECCOMP_RET_ALLOW. Supported since kernel version 4.14
+ --nice_level VALUE
+ Set jailed process niceness (-20 is highest priority, 19 is lowest). By default, set to 19
--cgroup_mem_max VALUE
- Maximum number of bytes to use in the group (default: '0' - disabled)
+ Maximum number of bytes to use in the group (default: '0' - disabled)
+ --cgroup_mem_memsw_max VALUE
+ Maximum number of memory+swap bytes to use (default: '0' - disabled)
+ --cgroup_mem_swap_max VALUE
+ Maximum number of swap bytes to use (default: '-1' - disabled)
--cgroup_mem_mount VALUE
- Location of memory cgroup FS (default: '/sys/fs/cgroup/memory')
+ Location of memory cgroup FS (default: '/sys/fs/cgroup/memory')
--cgroup_mem_parent VALUE
- Which pre-existing memory cgroup to use as a parent (default: 'NSJAIL')
+ Which pre-existing memory cgroup to use as a parent (default: 'NSJAIL')
--cgroup_pids_max VALUE
- Maximum number of pids in a cgroup (default: '0' - disabled)
+ Maximum number of pids in a cgroup (default: '0' - disabled)
--cgroup_pids_mount VALUE
- Location of pids cgroup FS (default: '/sys/fs/cgroup/pids')
+ Location of pids cgroup FS (default: '/sys/fs/cgroup/pids')
--cgroup_pids_parent VALUE
- Which pre-existing pids cgroup to use as a parent (default: 'NSJAIL')
+ Which pre-existing pids cgroup to use as a parent (default: 'NSJAIL')
--cgroup_net_cls_classid VALUE
- Class identifier of network packets in the group (default: '0' - disabled)
+ Class identifier of network packets in the group (default: '0' - disabled)
--cgroup_net_cls_mount VALUE
- Location of net_cls cgroup FS (default: '/sys/fs/cgroup/net_cls')
+ Location of net_cls cgroup FS (default: '/sys/fs/cgroup/net_cls')
--cgroup_net_cls_parent VALUE
- Which pre-existing net_cls cgroup to use as a parent (default: 'NSJAIL')
+ Which pre-existing net_cls cgroup to use as a parent (default: 'NSJAIL')
--cgroup_cpu_ms_per_sec VALUE
- Number of milliseconds of CPU time per second that the process group can use (default: '0' - no limit)
+ Number of milliseconds of CPU time per second that the process group can use (default: '0' - no limit)
--cgroup_cpu_mount VALUE
- Location of cpu cgroup FS (default: '/sys/fs/cgroup/cpu')
+ Location of cpu cgroup FS (default: '/sys/fs/cgroup/cpu')
--cgroup_cpu_parent VALUE
- Which pre-existing cpu cgroup to use as a parent (default: 'NSJAIL')
+ Which pre-existing cpu cgroup to use as a parent (default: 'NSJAIL')
+ --cgroupv2_mount VALUE
+ Location of cgroupv2 directory (default: '/sys/fs/cgroup')
+ --use_cgroupv2
+ Use cgroup v2
--iface_no_lo
- Don't bring the 'lo' interface up
+ Don't bring the 'lo' interface up
--iface_own VALUE
- Move this existing network interface into the new NET namespace. Can be specified multiple times
+ Move this existing network interface into the new NET namespace. Can be specified multiple times
--macvlan_iface|-I VALUE
- Interface which will be cloned (MACVLAN) and put inside the subprocess' namespace as 'vs'
+ Interface which will be cloned (MACVLAN) and put inside the subprocess' namespace as 'vs'
--macvlan_vs_ip VALUE
- IP of the 'vs' interface (e.g. "192.168.0.1")
+ IP of the 'vs' interface (e.g. "192.168.0.1")
--macvlan_vs_nm VALUE
- Netmask of the 'vs' interface (e.g. "255.255.255.0")
+ Netmask of the 'vs' interface (e.g. "255.255.255.0")
--macvlan_vs_gw VALUE
- Default GW for the 'vs' interface (e.g. "192.168.0.1")
+ Default GW for the 'vs' interface (e.g. "192.168.0.1")
--macvlan_vs_ma VALUE
- MAC-address of the 'vs' interface (e.g. "ba:ad:ba:be:45:00")
+ MAC-address of the 'vs' interface (e.g. "ba:ad:ba:be:45:00")
+ --macvlan_vs_mo VALUE
+ Mode of the 'vs' interface. Can be either 'private', 'vepa', 'bridge' or 'passthru' (default: 'private')
+ --disable_tsc
+ Disable rdtsc and rdtscp instructions. WARNING: To make it effective, you also need to forbid `prctl(PR_SET_TSC, PR_TSC_ENABLE, ...)` in seccomp rules! (x86 and x86_64 only). Dynamic binaries produced by GCC seem to rely on RDTSC, but static ones should work.
- Examples:
+Examples:
Wait on a port 31337 for connections, and run /bin/sh
nsjail -Ml --port 31337 --chroot / -- /bin/sh -i
Re-run echo command as a sub-process
diff --git a/cgroup.cc b/cgroup.cc
index 15c7649..c5ce485 100644
--- a/cgroup.cc
+++ b/cgroup.cc
@@ -65,7 +65,12 @@ static bool addPidToTaskList(const std::string& cgroup_path, pid_t pid) {
}
static bool initNsFromParentMem(nsjconf_t* nsjconf, pid_t pid) {
- if (nsjconf->cgroup_mem_max == (size_t)0) {
+ size_t memsw_max = nsjconf->cgroup_mem_memsw_max;
+ if (nsjconf->cgroup_mem_swap_max >= (ssize_t)0) {
+ memsw_max = nsjconf->cgroup_mem_swap_max + nsjconf->cgroup_mem_max;
+ }
+
+ if (nsjconf->cgroup_mem_max == (size_t)0 && memsw_max == (size_t)0) {
return true;
}
@@ -73,16 +78,24 @@ static bool initNsFromParentMem(nsjconf_t* nsjconf, pid_t pid) {
"/NSJAIL." + std::to_string(pid);
RETURN_ON_FAILURE(createCgroup(mem_cgroup_path, pid));
- std::string mem_max_str = std::to_string(nsjconf->cgroup_mem_max);
- RETURN_ON_FAILURE(writeToCgroup(
- mem_cgroup_path + "/memory.limit_in_bytes", mem_max_str, "memory cgroup max limit"));
-
/*
* Use OOM-killer instead of making processes hang/sleep
*/
RETURN_ON_FAILURE(writeToCgroup(
mem_cgroup_path + "/memory.oom_control", "0", "memory cgroup oom control"));
+ if (nsjconf->cgroup_mem_max > (size_t)0) {
+ std::string mem_max_str = std::to_string(nsjconf->cgroup_mem_max);
+ RETURN_ON_FAILURE(writeToCgroup(mem_cgroup_path + "/memory.limit_in_bytes",
+ mem_max_str, "memory cgroup max limit"));
+ }
+
+ if (memsw_max > (size_t)0) {
+ std::string mem_memsw_max_str = std::to_string(memsw_max);
+ RETURN_ON_FAILURE(writeToCgroup(mem_cgroup_path + "/memory.memsw.limit_in_bytes",
+ mem_memsw_max_str, "memory+Swap cgroup max limit"));
+ }
+
return addPidToTaskList(mem_cgroup_path, pid);
}
@@ -159,7 +172,7 @@ static void removeCgroup(const std::string& cgroup_path) {
}
void finishFromParent(nsjconf_t* nsjconf, pid_t pid) {
- if (nsjconf->cgroup_mem_max != (size_t)0) {
+ if (nsjconf->cgroup_mem_max != (size_t)0 || nsjconf->cgroup_mem_memsw_max != (size_t)0) {
std::string mem_cgroup_path = nsjconf->cgroup_mem_mount + '/' +
nsjconf->cgroup_mem_parent + "/NSJAIL." +
std::to_string(pid);
diff --git a/cgroup2.cc b/cgroup2.cc
index 6b0dc09..1902c5e 100644
--- a/cgroup2.cc
+++ b/cgroup2.cc
@@ -84,14 +84,30 @@ static void removeCgroup(const std::string &cgroup_path) {
}
static bool initNsFromParentMem(nsjconf_t *nsjconf, pid_t pid) {
- if (nsjconf->cgroup_mem_max == (size_t)0) {
+ ssize_t swap_max = nsjconf->cgroup_mem_swap_max;
+ if (nsjconf->cgroup_mem_memsw_max > (size_t)0) {
+ swap_max = nsjconf->cgroup_mem_memsw_max - nsjconf->cgroup_mem_max;
+ }
+
+ if (nsjconf->cgroup_mem_max == (size_t)0 && swap_max < (ssize_t)0) {
return true;
}
std::string cgroup_path = getCgroupPath(nsjconf, pid);
RETURN_ON_FAILURE(createCgroup(cgroup_path, pid));
RETURN_ON_FAILURE(addPidToProcList(cgroup_path, pid));
- return writeToCgroup(cgroup_path, "memory.max", std::to_string(nsjconf->cgroup_mem_max));
+
+ if (nsjconf->cgroup_mem_max > (size_t)0) {
+ RETURN_ON_FAILURE(writeToCgroup(
+ cgroup_path, "memory.max", std::to_string(nsjconf->cgroup_mem_max)));
+ }
+
+ if (swap_max >= (ssize_t)0) {
+ RETURN_ON_FAILURE(
+ writeToCgroup(cgroup_path, "memory.swap.max", std::to_string(swap_max)));
+ }
+
+ return true;
}
static bool initNsFromParentPids(nsjconf_t *nsjconf, pid_t pid) {
diff --git a/cmdline.cc b/cmdline.cc
index a2b825b..9d3ab12 100644
--- a/cmdline.cc
+++ b/cmdline.cc
@@ -43,6 +43,7 @@
#include <unistd.h>
#include <memory>
+#include <sstream>
#include <string>
#include <vector>
@@ -103,7 +104,7 @@ struct custom_option custom_opts[] = {
{ { "skip_setsid", no_argument, NULL, 0x0504 }, "Don't call setsid(), allows for terminal signal handling in the sandboxed process. Dangerous" },
{ { "pass_fd", required_argument, NULL, 0x0505 }, "Don't close this FD before executing the child process (can be specified multiple times), by default: 0/1/2 are kept open" },
{ { "disable_no_new_privs", no_argument, NULL, 0x0507 }, "Don't set the prctl(NO_NEW_PRIVS, 1) (DANGEROUS)" },
- { { "rlimit_as", required_argument, NULL, 0x0201 }, "RLIMIT_AS in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 512)" },
+ { { "rlimit_as", required_argument, NULL, 0x0201 }, "RLIMIT_AS in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 4096)" },
{ { "rlimit_core", required_argument, NULL, 0x0202 }, "RLIMIT_CORE in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 0)" },
{ { "rlimit_cpu", required_argument, NULL, 0x0203 }, "RLIMIT_CPU, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 600)" },
{ { "rlimit_fsize", required_argument, NULL, 0x0204 }, "RLIMIT_FSIZE in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 1)" },
@@ -142,6 +143,8 @@ struct custom_option custom_opts[] = {
{ { "seccomp_log", no_argument, NULL, 0x0902 }, "Use SECCOMP_FILTER_FLAG_LOG. Log all actions except SECCOMP_RET_ALLOW). Supported since kernel version 4.14" },
{ { "nice_level", required_argument, NULL, 0x0903 }, "Set jailed process niceness (-20 is highest -priority, 19 is lowest). By default, set to 19" },
{ { "cgroup_mem_max", required_argument, NULL, 0x0801 }, "Maximum number of bytes to use in the group (default: '0' - disabled)" },
+ { { "cgroup_mem_memsw_max", required_argument, NULL, 0x0804 }, "Maximum number of memory+swap bytes to use (default: '0' - disabled)" },
+ { { "cgroup_mem_swap_max", required_argument, NULL, 0x0805 }, "Maximum number of swap bytes to use (default: '-1' - disabled)" },
{ { "cgroup_mem_mount", required_argument, NULL, 0x0802 }, "Location of memory cgroup FS (default: '/sys/fs/cgroup/memory')" },
{ { "cgroup_mem_parent", required_argument, NULL, 0x0803 }, "Which pre-existing memory cgroup to use as a parent (default: 'NSJAIL')" },
{ { "cgroup_pids_max", required_argument, NULL, 0x0811 }, "Maximum number of pids in a cgroup (default: '0' - disabled)" },
@@ -163,6 +166,7 @@ struct custom_option custom_opts[] = {
{ { "macvlan_vs_gw", required_argument, NULL, 0x703 }, "Default GW for the 'vs' interface (e.g. \"192.168.0.1\")" },
{ { "macvlan_vs_ma", required_argument, NULL, 0x705 }, "MAC-address of the 'vs' interface (e.g. \"ba:ad:ba:be:45:00\")" },
{ { "macvlan_vs_mo", required_argument, NULL, 0x706 }, "Mode of the 'vs' interface. Can be either 'private', 'vepa', 'bridge' or 'passthru' (default: 'private')" },
+ { { "disable_tsc", no_argument, NULL, 0x707 }, "Disable rdtsc and rdtscp instructions. WARNING: To make it effective, you also need to forbid `prctl(PR_SET_TSC, PR_TSC_ENABLE, ...)` in seccomp rules! (x86 and x86_64 only). Dynamic binaries produced by GCC seem to rely on RDTSC, but static ones should work." },
};
// clang-format on
@@ -456,6 +460,8 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) {
nsjconf->cgroup_mem_mount = "/sys/fs/cgroup/memory";
nsjconf->cgroup_mem_parent = "NSJAIL";
nsjconf->cgroup_mem_max = (size_t)0;
+ nsjconf->cgroup_mem_memsw_max = (size_t)0;
+ nsjconf->cgroup_mem_swap_max = (ssize_t)-1;
nsjconf->cgroup_pids_mount = "/sys/fs/cgroup/pids";
nsjconf->cgroup_pids_parent = "NSJAIL";
nsjconf->cgroup_pids_max = 0U;
@@ -473,6 +479,7 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) {
nsjconf->iface_vs_gw = "0.0.0.0";
nsjconf->iface_vs_ma = "";
nsjconf->iface_vs_mo = "private";
+ nsjconf->disable_tsc = false;
nsjconf->orig_uid = getuid();
nsjconf->orig_euid = geteuid();
nsjconf->num_cpus = sysconf(_SC_NPROCESSORS_ONLN);
@@ -777,7 +784,12 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) {
dst = src;
}
std::string fs_type = argFromVec(subopts, 2);
- std::string options = argFromVec(subopts, 3);
+ std::stringstream optionsStream;
+ optionsStream << argFromVec(subopts, 3);
+ for (std::size_t i = 4; i < subopts.size(); ++i) {
+ optionsStream << ":" << subopts[i];
+ }
+ std::string options = optionsStream.str();
if (!mnt::addMountPtTail(nsjconf.get(), src, dst, /* fstype= */ fs_type,
/* options= */ options, /* flags= */ 0,
/* is_dir= */ mnt::NS_DIR_MAYBE, /* is_mandatory= */ true,
@@ -846,6 +858,9 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) {
case 0x706:
nsjconf->iface_vs_mo = parseMACVlanMode(optarg);
break;
+ case 0x707:
+ nsjconf->disable_tsc = true;
+ break;
case 0x801:
nsjconf->cgroup_mem_max = (size_t)strtoull(optarg, NULL, 0);
break;
@@ -855,6 +870,12 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) {
case 0x803:
nsjconf->cgroup_mem_parent = optarg;
break;
+ case 0x804:
+ nsjconf->cgroup_mem_memsw_max = (size_t)strtoull(optarg, NULL, 0);
+ break;
+ case 0x805:
+ nsjconf->cgroup_mem_swap_max = (ssize_t)strtoll(optarg, NULL, 0);
+ break;
case 0x811:
nsjconf->cgroup_pids_max = (unsigned int)strtoul(optarg, NULL, 0);
break;
@@ -918,6 +939,11 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) {
}
setupUsers(nsjconf.get());
+ if (nsjconf->cgroup_mem_memsw_max > (size_t)0 &&
+ nsjconf->cgroup_mem_swap_max >= (ssize_t)0) {
+ LOG_F("cannot set both cgroup_mem_memsw_max and cgroup_mem_swap_max");
+ }
+
return nsjconf;
}
diff --git a/config.cc b/config.cc
index 551e59d..0179697 100644
--- a/config.cc
+++ b/config.cc
@@ -251,6 +251,8 @@ static bool configParseInternal(nsjconf_t* nsjconf, const nsjail::NsJailConfig&
nsjconf->nice_level = njc.nice_level();
nsjconf->cgroup_mem_max = njc.cgroup_mem_max();
+ nsjconf->cgroup_mem_memsw_max = njc.cgroup_mem_memsw_max();
+ nsjconf->cgroup_mem_swap_max = njc.cgroup_mem_swap_max();
nsjconf->cgroup_mem_mount = njc.cgroup_mem_mount();
nsjconf->cgroup_mem_parent = njc.cgroup_mem_parent();
nsjconf->cgroup_pids_max = njc.cgroup_pids_max();
@@ -278,6 +280,8 @@ static bool configParseInternal(nsjconf_t* nsjconf, const nsjail::NsJailConfig&
nsjconf->iface_vs_ma = njc.macvlan_vs_ma();
nsjconf->iface_vs_mo = njc.macvlan_vs_mo();
+ nsjconf->disable_tsc = njc.disable_tsc();
+
if (njc.has_exec_bin()) {
if (njc.exec_bin().has_path()) {
nsjconf->exec_file = njc.exec_bin().path();
@@ -322,11 +326,11 @@ bool parseFile(nsjconf_t* nsjconf, const char* file) {
return false;
}
if (!configParseInternal(nsjconf, nsc)) {
- LOG_W("Couldn't parse the ProtoBuf");
+ LOG_W("Couldn't parse the ProtoBuf from '%s'", file);
return false;
}
- LOG_D("Parsed config:\n'%s'", nsc.DebugString().c_str());
+ LOG_D("Parsed config from '%s':\n'%s'", file, nsc.DebugString().c_str());
return true;
}
diff --git a/config.proto b/config.proto
index 96a2b10..47847cc 100644
--- a/config.proto
+++ b/config.proto
@@ -211,6 +211,10 @@ message NsJailConfig {
/* If > 0, maximum cumulative size of RAM used inside any jail */
optional uint64 cgroup_mem_max = 67 [default = 0]; /* In bytes */
+ /* If > 0, maximum cumulative size of RAM + swap used inside any jail */
+ optional uint64 cgroup_mem_memsw_max = 91 [default = 0]; /* In bytes */
+ /* If >= 0, maximum cumulative size of swap used inside any jail */
+ optional int64 cgroup_mem_swap_max = 92 [default = -1]; /* In bytes */
/* Mount point for cgroups-memory in your system */
optional string cgroup_mem_mount = 68 [default = "/sys/fs/cgroup/memory"];
/* Writeable directory (for the nsjail user) under cgroup_mem_mount */
@@ -262,4 +266,6 @@ message NsJailConfig {
/* Binary path (with arguments) to be executed. If not specified here, it
can be specified with cmd-line as "-- /path/to/command arg1 arg2" */
optional Exe exec_bin = 90;
+
+ optional bool disable_tsc = 93 [default = false];
}
diff --git a/configs/apache.cfg b/configs/apache.cfg
index a1f2ff6..7f954c8 100644
--- a/configs/apache.cfg
+++ b/configs/apache.cfg
@@ -120,7 +120,7 @@ mount {
is_bind: true
}
-seccomp_string: " KILL {"
+seccomp_string: " KILL_PROCESS {"
seccomp_string: " ptrace,"
seccomp_string: " process_vm_readv,"
seccomp_string: " process_vm_writev"
diff --git a/configs/bash-with-fake-geteuid.cfg b/configs/bash-with-fake-geteuid.cfg
index 99a36af..ef873e1 100644
--- a/configs/bash-with-fake-geteuid.cfg
+++ b/configs/bash-with-fake-geteuid.cfg
@@ -177,7 +177,7 @@ mount {
seccomp_string: "ERRNO(1337) { geteuid } "
seccomp_string: "ERRNO(0) { ptrace } "
-seccomp_string: "KILL { syslog } "
+seccomp_string: "KILL_PROCESS { syslog } "
seccomp_string: "DEFAULT ALLOW "
exec_bin {
diff --git a/configs/demo-dont-use-chrome-with-net.cfg b/configs/demo-dont-use-chrome-with-net.cfg
index c6c6a5f..bf96ea3 100644
--- a/configs/demo-dont-use-chrome-with-net.cfg
+++ b/configs/demo-dont-use-chrome-with-net.cfg
@@ -167,7 +167,7 @@ mount {
is_bind: true
}
-seccomp_string: " KILL {"
+seccomp_string: " KILL_PROCESS {"
seccomp_string: " ptrace,"
seccomp_string: " process_vm_readv,"
seccomp_string: " process_vm_writev"
diff --git a/configs/firefox-with-cloned-net.cfg b/configs/firefox-with-cloned-net.cfg
index 180ed9a..b949018 100644
--- a/configs/firefox-with-cloned-net.cfg
+++ b/configs/firefox-with-cloned-net.cfg
@@ -168,7 +168,7 @@ mount {
is_bind: true
}
-seccomp_string: "KILL {"
+seccomp_string: "KILL_PROCESS {"
seccomp_string: " ptrace,"
seccomp_string: " process_vm_readv,"
seccomp_string: " process_vm_writev"
diff --git a/configs/firefox-with-net-wayland.cfg b/configs/firefox-with-net-wayland.cfg
index b132018..4a2edf6 100644
--- a/configs/firefox-with-net-wayland.cfg
+++ b/configs/firefox-with-net-wayland.cfg
@@ -29,7 +29,7 @@ envar: "FONTCONFIG_FILE=/etc/fonts/fonts.conf"
envar: "FC_CONFIG_FILE=/etc/fonts/fonts.conf"
envar: "MOZ_ENABLE_WAYLAND=1"
envar: "XDG_RUNTIME_DIR=/user/run/"
-envar: "WAYLAND_DISPLAY=wayland-0"
+envar: "WAYLAND_DISPLAY"
rlimit_as: 4096
rlimit_cpu: 1000
diff --git a/configs/firefox-with-net.cfg b/configs/firefox-with-net.cfg
index b88f8ea..633a5c1 100644
--- a/configs/firefox-with-net.cfg
+++ b/configs/firefox-with-net.cfg
@@ -160,7 +160,7 @@ mount {
is_bind: true
}
-seccomp_string: "KILL {"
+seccomp_string: "KILL_PROCESS {"
seccomp_string: " ptrace,"
seccomp_string: " process_vm_readv,"
seccomp_string: " process_vm_writev"
diff --git a/configs/home-documents-with-xorg-no-net.cfg b/configs/home-documents-with-xorg-no-net.cfg
index 83cfb42..a701b1e 100644
--- a/configs/home-documents-with-xorg-no-net.cfg
+++ b/configs/home-documents-with-xorg-no-net.cfg
@@ -142,7 +142,7 @@ mount {
is_bind: true
}
-seccomp_string: "KILL {"
+seccomp_string: "KILL_PROCESS {"
seccomp_string: " ptrace,"
seccomp_string: " process_vm_readv,"
seccomp_string: " process_vm_writev"
diff --git a/configs/imagemagick-convert.cfg b/configs/imagemagick-convert.cfg
index 479b293..8a3fe57 100644
--- a/configs/imagemagick-convert.cfg
+++ b/configs/imagemagick-convert.cfg
@@ -9,7 +9,7 @@ description: "Run as:"
description: ""
description: "./nsjail --config imagemagick-convert.cfg -- /usr/bin/convert jpg:/user/Documents/input.jpg png:/user/Documents/output.png "
description: "or "
-description: "./nsjail --config imagemagick-convert.cfg -- /usr/bin/convert jpg:- png:- <file.jpg >file.png
+description: "./nsjail --config imagemagick-convert.cfg -- /usr/bin/convert jpg:- png:- <file.jpg >file.png"
mode: ONCE
hostname: "IM-CONVERT"
@@ -83,7 +83,7 @@ seccomp_string: " arch_prctl, sched_getaffinity, set_tid_address,"
seccomp_string: " clock_gettime, set_robust_list, exit_group,"
seccomp_string: " clone, getcwd, pread64, readlink, prlimit64, madvise"
seccomp_string: "}"
-seccomp_string: "DEFAULT KILL"
+seccomp_string: "DEFAULT KILL_PROCESS"
exec_bin {
path: ""
diff --git a/configs/xchat-with-net.cfg b/configs/xchat-with-net.cfg
index 04c361b..19725d1 100644
--- a/configs/xchat-with-net.cfg
+++ b/configs/xchat-with-net.cfg
@@ -132,7 +132,7 @@ mount {
is_bind: true
}
-seccomp_string: "KILL {"
+seccomp_string: "KILL_PROCESS {"
seccomp_string: " ptrace,"
seccomp_string: " process_vm_readv,"
seccomp_string: " process_vm_writev"
diff --git a/configs/znc-with-net.cfg b/configs/znc-with-net.cfg
index bdcc53e..13c5107 100644
--- a/configs/znc-with-net.cfg
+++ b/configs/znc-with-net.cfg
@@ -122,7 +122,7 @@ mount {
mandatory: true
}
-seccomp_string: "KILL {"
+seccomp_string: "KILL_PROCESS {"
seccomp_string: " ptrace,"
seccomp_string: " process_vm_readv,"
seccomp_string: " process_vm_writev"
diff --git a/contain.cc b/contain.cc
index b5120cc..5a27cae 100644
--- a/contain.cc
+++ b/contain.cc
@@ -119,6 +119,22 @@ static bool containCPU(nsjconf_t* nsjconf) {
return cpu::initCpu(nsjconf);
}
+static bool containTSC(nsjconf_t* nsjconf) {
+ if (nsjconf->disable_tsc) {
+#if defined(__x86_64__) || defined(__i386__)
+ if (prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0) == -1) {
+ PLOG_E("prctl(PR_SET_TSC, PR_TSC_SIGSEGV)");
+ return false;
+ }
+#else /* defined(__x86_64__) || defined(__i386__) */
+ LOG_W(
+ "prctl(PR_SET_TSC, PR_TSC_SIGSEGV) requested, but it's supported under "
+ "x86/x86-64 CPU architectures only. Ignoring it!");
+#endif /* defined(__x86_64__) || defined(__i386__) */
+ }
+ return true;
+}
+
static bool containSetLimits(nsjconf_t* nsjconf) {
if (nsjconf->disable_rl) {
return true;
@@ -326,6 +342,7 @@ bool containProc(nsjconf_t* nsjconf) {
/* */
/* As non-root */
RETURN_ON_FAILURE(containCPU(nsjconf));
+ RETURN_ON_FAILURE(containTSC(nsjconf));
RETURN_ON_FAILURE(containSetLimits(nsjconf));
RETURN_ON_FAILURE(containPrepareEnv(nsjconf));
RETURN_ON_FAILURE(containMakeFdsCOE(nsjconf));
diff --git a/logs.cc b/logs.cc
index d377505..bc3f502 100644
--- a/logs.cc
+++ b/logs.cc
@@ -156,9 +156,7 @@ void logMsg(enum llevel_t ll, const char* fn, int ln, bool perr, const char* fmt
msg.append("\n");
/* End printing logs */
- if (write(_log_fd, msg.c_str(), msg.size()) == -1) {
- dprintf(_log_fd, "%s", msg.c_str());
- }
+ TEMP_FAILURE_RETRY(write(_log_fd, msg.c_str(), msg.size()));
if (ll == FATAL) {
exit(0xff);
diff --git a/logs.h b/logs.h
index 36e9813..e2e9d4b 100644
--- a/logs.h
+++ b/logs.h
@@ -29,21 +29,21 @@
namespace logs {
-#define LOG_HELP(...) logs::logMsg(logs::HELP, __PRETTY_FUNCTION__, __LINE__, false, __VA_ARGS__);
+#define LOG_HELP(...) logs::logMsg(logs::HELP, __FUNCTION__, __LINE__, false, __VA_ARGS__);
#define LOG_HELP_BOLD(...) \
- logs::logMsg(logs::HELP_BOLD, __PRETTY_FUNCTION__, __LINE__, false, __VA_ARGS__);
-
-#define LOG_D(...) logs::logMsg(logs::DEBUG, __PRETTY_FUNCTION__, __LINE__, false, __VA_ARGS__);
-#define LOG_I(...) logs::logMsg(logs::INFO, __PRETTY_FUNCTION__, __LINE__, false, __VA_ARGS__);
-#define LOG_W(...) logs::logMsg(logs::WARNING, __PRETTY_FUNCTION__, __LINE__, false, __VA_ARGS__);
-#define LOG_E(...) logs::logMsg(logs::ERROR, __PRETTY_FUNCTION__, __LINE__, false, __VA_ARGS__);
-#define LOG_F(...) logs::logMsg(logs::FATAL, __PRETTY_FUNCTION__, __LINE__, false, __VA_ARGS__);
-
-#define PLOG_D(...) logs::logMsg(logs::DEBUG, __PRETTY_FUNCTION__, __LINE__, true, __VA_ARGS__);
-#define PLOG_I(...) logs::logMsg(logs::INFO, __PRETTY_FUNCTION__, __LINE__, true, __VA_ARGS__);
-#define PLOG_W(...) logs::logMsg(logs::WARNING, __PRETTY_FUNCTION__, __LINE__, true, __VA_ARGS__);
-#define PLOG_E(...) logs::logMsg(logs::ERROR, __PRETTY_FUNCTION__, __LINE__, true, __VA_ARGS__);
-#define PLOG_F(...) logs::logMsg(logs::FATAL, __PRETTY_FUNCTION__, __LINE__, true, __VA_ARGS__);
+ logs::logMsg(logs::HELP_BOLD, __FUNCTION__, __LINE__, false, __VA_ARGS__);
+
+#define LOG_D(...) logs::logMsg(logs::DEBUG, __FUNCTION__, __LINE__, false, __VA_ARGS__);
+#define LOG_I(...) logs::logMsg(logs::INFO, __FUNCTION__, __LINE__, false, __VA_ARGS__);
+#define LOG_W(...) logs::logMsg(logs::WARNING, __FUNCTION__, __LINE__, false, __VA_ARGS__);
+#define LOG_E(...) logs::logMsg(logs::ERROR, __FUNCTION__, __LINE__, false, __VA_ARGS__);
+#define LOG_F(...) logs::logMsg(logs::FATAL, __FUNCTION__, __LINE__, false, __VA_ARGS__);
+
+#define PLOG_D(...) logs::logMsg(logs::DEBUG, __FUNCTION__, __LINE__, true, __VA_ARGS__);
+#define PLOG_I(...) logs::logMsg(logs::INFO, __FUNCTION__, __LINE__, true, __VA_ARGS__);
+#define PLOG_W(...) logs::logMsg(logs::WARNING, __FUNCTION__, __LINE__, true, __VA_ARGS__);
+#define PLOG_E(...) logs::logMsg(logs::ERROR, __FUNCTION__, __LINE__, true, __VA_ARGS__);
+#define PLOG_F(...) logs::logMsg(logs::FATAL, __FUNCTION__, __LINE__, true, __VA_ARGS__);
enum llevel_t {
DEBUG = 0,
diff --git a/macros.h b/macros.h
index 80e4b61..c9a7ca1 100644
--- a/macros.h
+++ b/macros.h
@@ -65,6 +65,6 @@ static void __attribute__ ((unused)) __clang_cleanup_func(void (^*dfunc) (void))
#endif
#define NS_VALSTR_STRUCT(x) \
- { x, #x }
+ { (uint64_t) x, #x }
#endif /* NS_COMMON_H */
diff --git a/mnt.cc b/mnt.cc
index 1ccd626..13952c7 100644
--- a/mnt.cc
+++ b/mnt.cc
@@ -57,36 +57,42 @@ static const std::string flagsToStr(uintptr_t flags) {
std::string res;
struct {
- const uintptr_t flag;
+ const uint64_t flag;
const char* const name;
} static const mountFlags[] = {
- NS_VALSTR_STRUCT(MS_RDONLY),
- NS_VALSTR_STRUCT(MS_NOSUID),
- NS_VALSTR_STRUCT(MS_NODEV),
- NS_VALSTR_STRUCT(MS_NOEXEC),
- NS_VALSTR_STRUCT(MS_SYNCHRONOUS),
- NS_VALSTR_STRUCT(MS_REMOUNT),
- NS_VALSTR_STRUCT(MS_MANDLOCK),
- NS_VALSTR_STRUCT(MS_DIRSYNC),
- NS_VALSTR_STRUCT(MS_NOATIME),
- NS_VALSTR_STRUCT(MS_NODIRATIME),
- NS_VALSTR_STRUCT(MS_BIND),
- NS_VALSTR_STRUCT(MS_MOVE),
- NS_VALSTR_STRUCT(MS_REC),
- NS_VALSTR_STRUCT(MS_SILENT),
- NS_VALSTR_STRUCT(MS_POSIXACL),
- NS_VALSTR_STRUCT(MS_UNBINDABLE),
- NS_VALSTR_STRUCT(MS_PRIVATE),
- NS_VALSTR_STRUCT(MS_SLAVE),
- NS_VALSTR_STRUCT(MS_SHARED),
- NS_VALSTR_STRUCT(MS_RELATIME),
- NS_VALSTR_STRUCT(MS_KERNMOUNT),
- NS_VALSTR_STRUCT(MS_I_VERSION),
- NS_VALSTR_STRUCT(MS_STRICTATIME),
- NS_VALSTR_STRUCT(MS_LAZYTIME),
+ NS_VALSTR_STRUCT(MS_RDONLY),
+ NS_VALSTR_STRUCT(MS_NOSUID),
+ NS_VALSTR_STRUCT(MS_NODEV),
+ NS_VALSTR_STRUCT(MS_NOEXEC),
+ NS_VALSTR_STRUCT(MS_SYNCHRONOUS),
+ NS_VALSTR_STRUCT(MS_REMOUNT),
+ NS_VALSTR_STRUCT(MS_MANDLOCK),
+ NS_VALSTR_STRUCT(MS_DIRSYNC),
+ NS_VALSTR_STRUCT(MS_NOATIME),
+ NS_VALSTR_STRUCT(MS_NODIRATIME),
+ NS_VALSTR_STRUCT(MS_BIND),
+ NS_VALSTR_STRUCT(MS_MOVE),
+ NS_VALSTR_STRUCT(MS_REC),
+ NS_VALSTR_STRUCT(MS_SILENT),
+ NS_VALSTR_STRUCT(MS_POSIXACL),
+ NS_VALSTR_STRUCT(MS_UNBINDABLE),
+ NS_VALSTR_STRUCT(MS_PRIVATE),
+ NS_VALSTR_STRUCT(MS_SLAVE),
+ NS_VALSTR_STRUCT(MS_SHARED),
+ NS_VALSTR_STRUCT(MS_RELATIME),
+ NS_VALSTR_STRUCT(MS_KERNMOUNT),
+ NS_VALSTR_STRUCT(MS_I_VERSION),
+ NS_VALSTR_STRUCT(MS_STRICTATIME),
+ NS_VALSTR_STRUCT(MS_LAZYTIME),
+#if defined(MS_ACTIVE)
+ NS_VALSTR_STRUCT(MS_ACTIVE),
+#endif /* defined(MS_ACTIVE) */
+#if defined(MS_NOUSER)
+ NS_VALSTR_STRUCT(MS_NOUSER),
+#endif /* defined(MS_NOUSER) */
};
- uintptr_t knownFlagMask = 0U;
+ uint64_t knownFlagMask = 0U;
for (const auto& i : mountFlags) {
if (flags & i.flag) {
if (!res.empty()) {
@@ -144,7 +150,7 @@ static bool mountPt(mount_t* mpt, const char* newroot, const char* tmpdir) {
LOG_D("symlink('%s', '%s')", srcpath, dstpath);
if (symlink(srcpath, dstpath) == -1) {
if (mpt->is_mandatory) {
- PLOG_W("symlink('%s', '%s')", srcpath, dstpath);
+ PLOG_E("symlink('%s', '%s')", srcpath, dstpath);
return false;
} else {
PLOG_W("symlink('%s', '%s'), but it's not mandatory, continuing",
@@ -390,6 +396,7 @@ static bool initCloneNs(nsjconf_t* nsjconf) {
for (auto& p : nsjconf->mountpts) {
if (!mountPt(&p, destdir->c_str(), tmpdir->c_str()) && p.is_mandatory) {
+ LOG_E("Couldn't mount '%s'", p.dst.c_str());
return false;
}
}
diff --git a/nsjail.1 b/nsjail.1
index 439f8e1..4b7d4c4 100644
--- a/nsjail.1
+++ b/nsjail.1
@@ -119,7 +119,7 @@ Don't close this FD before executing the child process (can be specified multipl
Don't set the prctl(NO_NEW_PRIVS, 1) (DANGEROUS)
.TP
\fB\-\-rlimit_as\fR VALUE
-RLIMIT_AS in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM_INFINITY (default: 512)
+RLIMIT_AS in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM_INFINITY (default: 4096)
.TP
\fB\-\-rlimit_core\fR VALUE
RLIMIT_CORE in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current limit, 'inf' for RLIM_INFINITY (default: 0)
@@ -220,6 +220,12 @@ Use SECCOMP_FILTER_FLAG_LOG. Log all actions except SECCOMP_RET_ALLOW. Supported
\fB\-\-cgroup_mem_max\fR VALUE
Maximum number of bytes to use in the group (default: '0' \- disabled)
.TP
+\fB\-\-cgroup_mem_memsw_max\fR VALUE
+Maximum number of memory+Swap bytes to use in the group (default: '0' \- disabled)
+.TP
+\fB\-\-cgroup_mem_swap_max\fR VALUE
+Maximum number of swap bytes to use in the group (default: '-1' \- disabled)
+.TP
\fB\-\-cgroup_mem_mount\fR VALUE
Location of memory cgroup FS (default: '/sys/fs/cgroup/memory')
.TP
diff --git a/nsjail.h b/nsjail.h
index fc8bf70..9203143 100644
--- a/nsjail.h
+++ b/nsjail.h
@@ -145,9 +145,12 @@ struct nsjconf_t {
std::string iface_vs_gw;
std::string iface_vs_ma;
std::string iface_vs_mo;
+ bool disable_tsc;
std::string cgroup_mem_mount;
std::string cgroup_mem_parent;
size_t cgroup_mem_max;
+ size_t cgroup_mem_memsw_max;
+ ssize_t cgroup_mem_swap_max;
std::string cgroup_pids_mount;
std::string cgroup_pids_parent;
unsigned int cgroup_pids_max;
diff --git a/subproc.cc b/subproc.cc
index bd2bdfe..4d20975 100644
--- a/subproc.cc
+++ b/subproc.cc
@@ -67,7 +67,7 @@ static const std::string cloneFlagsToStr(uintptr_t flags) {
std::string res;
struct {
- const uintptr_t flag;
+ const uint64_t flag;
const char* const name;
} static const cloneFlags[] = {
NS_VALSTR_STRUCT(CLONE_NEWTIME),
@@ -100,7 +100,7 @@ static const std::string cloneFlagsToStr(uintptr_t flags) {
NS_VALSTR_STRUCT(CLONE_IO),
};
- uintptr_t knownFlagMask = 0;
+ uint64_t knownFlagMask = 0;
for (const auto& i : cloneFlags) {
if (flags & i.flag) {
if (!res.empty()) {
@@ -266,10 +266,11 @@ static void seccompViolation(nsjconf_t* nsjconf, siginfo_t* si) {
const auto& p = nsjconf->pids.find(si->si_pid);
if (p == nsjconf->pids.end()) {
LOG_W(
- "pid=%d SiSyscall: %d, SiCode: %d, SiErrno: %d, SiSigno: %d. (If "
- "SiSyscall==31, then it's most likely the SIGSYS value. See 'dmesg' or "
- "'journalctl -ek' for possible auditd report with more data)",
- (int)si->si_pid, si->si_syscall, si->si_code, si->si_errno, si->si_signo);
+ "pid=%d SiStatus:%d SiUid:%d SiUtime:%ld SiStime:%ld (If "
+ "SiStatus==31 (SIGSYS), then see 'dmesg' or 'journalctl -ek' for possible "
+ "auditd report with more data)",
+ (int)si->si_pid, si->si_status, si->si_uid, (long)si->si_utime,
+ (long)si->si_stime);
LOG_E("Couldn't find pid element in the subproc list for pid=%d", (int)si->si_pid);
return;
}
@@ -278,10 +279,11 @@ static void seccompViolation(nsjconf_t* nsjconf, siginfo_t* si) {
ssize_t rdsize = util::readFromFd(p->second.pid_syscall_fd, buf, sizeof(buf) - 1);
if (rdsize < 1) {
LOG_W(
- "pid=%d, SiSyscall: %d, SiCode: %d, SiErrno: %d, SiSigno: %d. (If "
- "SiSyscall==31, then it's most likely the SIGSYS value. See 'dmesg' or "
- "'journalctl -ek' for possible auditd report with more data)",
- (int)si->si_pid, si->si_syscall, si->si_code, si->si_errno, si->si_signo);
+ "pid=%d SiStatus:%d SiUid:%d SiUtime:%ld SiStime:%ld (If "
+ "SiStatus==31 (SIGSYS), then see 'dmesg' or 'journalctl -ek' for possible "
+ "auditd report with more data)",
+ (int)si->si_pid, si->si_status, si->si_uid, (long)si->si_utime,
+ (long)si->si_stime);
return;
}
buf[rdsize - 1] = '\0';
@@ -292,23 +294,24 @@ static void seccompViolation(nsjconf_t* nsjconf, siginfo_t* si) {
&arg4, &arg5, &arg6, &sp, &pc);
if (ret == 9) {
LOG_W(
- "pid=%d, Syscall number: %td, Arguments: %#tx, %#tx, %#tx, %#tx, %#tx, %#tx, "
- "SP: %#tx, PC: %#tx, si_syscall: %d, si_errno: %#x",
- (int)si->si_pid, sc, arg1, arg2, arg3, arg4, arg5, arg6, sp, pc, si->si_syscall,
- si->si_errno);
+ "pid=%d, Syscall number:%td, Arguments:%#tx, %#tx, %#tx, %#tx, %#tx, %#tx, "
+ "SP:%#tx, PC:%#tx, si_status:%d",
+ (int)si->si_pid, sc, arg1, arg2, arg3, arg4, arg5, arg6, sp, pc, si->si_status);
} else if (ret == 3) {
LOG_W(
- "pid=%d, SiSyscall: %d, SiCode: %d, SiErrno: %d, SiSigno: %d, SP: %#tx, PC: "
- "%#tx (If SiSyscall==31, then it's most likely the SIGSYS value. See 'dmesg' "
- "or 'journalctl -ek' for possible auditd report with more data)",
- (int)si->si_pid, si->si_syscall, si->si_code, si->si_errno, si->si_signo, arg1,
- arg2);
+ "pid=%d SiStatus:%d SiUid:%d SiUtime:%ld SiStime:%ld SP:%#tx, PC:%#tx (If "
+ "SiStatus==31 (SIGSYS), then see 'dmesg' or 'journalctl -ek' for possible "
+ "auditd report with more data)",
+ (int)si->si_pid, si->si_status, si->si_uid, (long)si->si_utime,
+ (long)si->si_stime, arg1, arg2);
+ return;
} else {
LOG_W(
- "pid=%d, SiSyscall: %d, SiCode: %d, SiErrno: %d, Syscall string '%s'. (If "
- "SiSyscall==31, then it's most likely the SIGSYS value. See 'dmesg' or "
- "'journalctl -ek' for possible auditd report with more data)",
- (int)si->si_pid, si->si_syscall, si->si_code, si->si_errno, buf);
+ "pid=%d SiStatus:%d SiUid:%d SiUtime:%ld SiStime:%ld (If "
+ "SiStatus==31 (SIGSYS), then see 'dmesg' or 'journalctl -ek' for possible "
+ "auditd report with more data)",
+ (int)si->si_pid, si->si_status, si->si_uid, (long)si->si_utime,
+ (long)si->si_stime);
}
}
@@ -526,19 +529,9 @@ pid_t cloneProc(uintptr_t flags, int exit_signal) {
}
#if defined(__NR_clone3)
- struct clone_args ca = {
- .flags = (uint64_t)flags,
- .pidfd = 0,
- .child_tid = 0,
- .parent_tid = 0,
- .exit_signal = (uint64_t)exit_signal,
- .stack = 0,
- .stack_size = 0,
- .tls = 0,
- .set_tid = 0,
- .set_tid_size = 0,
- .cgroup = 0,
- };
+ struct clone_args ca = {};
+ ca.flags = (uint64_t)flags;
+ ca.exit_signal = (uint64_t)exit_signal;
pid_t ret = util::syscall(__NR_clone3, (uintptr_t)&ca, sizeof(ca));
if (ret != -1 || errno != ENOSYS) {