diff options
-rw-r--r-- | METADATA | 4 | ||||
-rw-r--r-- | README.md | 202 | ||||
-rw-r--r-- | cgroup.cc | 25 | ||||
-rw-r--r-- | cgroup2.cc | 20 | ||||
-rw-r--r-- | cmdline.cc | 30 | ||||
-rw-r--r-- | config.cc | 8 | ||||
-rw-r--r-- | config.proto | 6 | ||||
-rw-r--r-- | configs/apache.cfg | 2 | ||||
-rw-r--r-- | configs/bash-with-fake-geteuid.cfg | 2 | ||||
-rw-r--r-- | configs/demo-dont-use-chrome-with-net.cfg | 2 | ||||
-rw-r--r-- | configs/firefox-with-cloned-net.cfg | 2 | ||||
-rw-r--r-- | configs/firefox-with-net-wayland.cfg | 2 | ||||
-rw-r--r-- | configs/firefox-with-net.cfg | 2 | ||||
-rw-r--r-- | configs/home-documents-with-xorg-no-net.cfg | 2 | ||||
-rw-r--r-- | configs/imagemagick-convert.cfg | 4 | ||||
-rw-r--r-- | configs/xchat-with-net.cfg | 2 | ||||
-rw-r--r-- | configs/znc-with-net.cfg | 2 | ||||
-rw-r--r-- | contain.cc | 17 | ||||
-rw-r--r-- | logs.cc | 4 | ||||
-rw-r--r-- | logs.h | 28 | ||||
-rw-r--r-- | macros.h | 2 | ||||
-rw-r--r-- | mnt.cc | 61 | ||||
-rw-r--r-- | nsjail.1 | 8 | ||||
-rw-r--r-- | nsjail.h | 3 | ||||
-rw-r--r-- | subproc.cc | 65 |
25 files changed, 310 insertions, 195 deletions
@@ -12,7 +12,7 @@ third_party { type: GIT value: "https://github.com/google/nsjail" } - version: "c7a313123b3dcb845ed3822b99ad9db69a6a82c8" - last_upgrade_date { year: 2019 month: 1 day: 10 } + version: "6483728e2490c1fc497a81bba5682515eb489cf8" + last_upgrade_date { year: 2022 month: 3 day: 29 } license_type: NOTICE } @@ -339,177 +339,203 @@ The command-line options should be self-explanatory, while the proto-buf config Usage: ./nsjail [options] -- path_to_command [args] Options: --help|-h - Help plz.. + Help plz.. --mode|-M VALUE - Execution mode (default: 'o' [MODE_STANDALONE_ONCE]): - l: Wait for connections on a TCP port (specified with --port) [MODE_LISTEN_TCP] - o: Launch a single process on the console using clone/execve [MODE_STANDALONE_ONCE] - e: Launch a single process on the console using execve [MODE_STANDALONE_EXECVE] - r: Launch a single process on the console with clone/execve, keep doing it forever [MODE_STANDALONE_RERUN] + Execution mode (default: 'o' [MODE_STANDALONE_ONCE]): + l: Wait for connections on a TCP port (specified with --port) [MODE_LISTEN_TCP] + o: Launch a single process on the console using clone/execve [MODE_STANDALONE_ONCE] + e: Launch a single process on the console using execve [MODE_STANDALONE_EXECVE] + r: Launch a single process on the console with clone/execve, keep doing it forever [MODE_STANDALONE_RERUN] --config|-C VALUE - Configuration file in the config.proto ProtoBuf format (see configs/ directory for examples) + Configuration file in the config.proto ProtoBuf format (see configs/ directory for examples) --exec_file|-x VALUE - File to exec (default: argv[0]) + File to exec (default: argv[0]) --execute_fd - Use execveat() to execute a file-descriptor instead of executing the binary path. In such case argv[0]/exec_file denotes a file path before mount namespacing + Use execveat() to execute a file-descriptor instead of executing the binary path. 
In such case argv[0]/exec_file denotes a file path before mount namespacing --chroot|-c VALUE - Directory containing / of the jail (default: none) + Directory containing / of the jail (default: none) + --no_pivotroot + When creating a mount namespace, use mount(MS_MOVE) and chroot rather than pivot_root. Useful when pivot_root is disallowed (e.g. initramfs). Note: escapable in some configurations --rw - Mount chroot dir (/) R/W (default: R/O) + Mount chroot dir (/) R/W (default: R/O) --user|-u VALUE - Username/uid of processes inside the jail (default: your current uid). You can also use inside_ns_uid:outside_ns_uid:count convention here. Can be specified multiple times + Username/uid of processes inside the jail (default: your current uid). You can also use inside_ns_uid:outside_ns_uid:count convention here. Can be specified multiple times --group|-g VALUE - Groupname/gid of processes inside the jail (default: your current gid). You can also use inside_ns_gid:global_ns_gid:count convention here. Can be specified multiple times + Groupname/gid of processes inside the jail (default: your current gid). You can also use inside_ns_gid:global_ns_gid:count convention here. 
Can be specified multiple times --hostname|-H VALUE - UTS name (hostname) of the jail (default: 'NSJAIL') + UTS name (hostname) of the jail (default: 'NSJAIL') --cwd|-D VALUE - Directory in the namespace the process will run (default: '/') + Directory in the namespace the process will run (default: '/') --port|-p VALUE - TCP port to bind to (enables MODE_LISTEN_TCP) (default: 0) + TCP port to bind to (enables MODE_LISTEN_TCP) (default: 0) --bindhost VALUE - IP address to bind the port to (only in [MODE_LISTEN_TCP]), (default: '::') + IP address to bind the port to (only in [MODE_LISTEN_TCP]), (default: '::') --max_conns VALUE - Maximum number of connections across all IPs (only in [MODE_LISTEN_TCP]), (default: 0 (unlimited)) + Maximum number of connections across all IPs (only in [MODE_LISTEN_TCP]), (default: 0 (unlimited)) --max_conns_per_ip|-i VALUE - Maximum number of connections per one IP (only in [MODE_LISTEN_TCP]), (default: 0 (unlimited)) + Maximum number of connections per one IP (only in [MODE_LISTEN_TCP]), (default: 0 (unlimited)) --log|-l VALUE - Log file (default: use log_fd) + Log file (default: use log_fd) --log_fd|-L VALUE - Log FD (default: 2) + Log FD (default: 2) --time_limit|-t VALUE - Maximum time that a jail can exist, in seconds (default: 600) + Maximum time that a jail can exist, in seconds (default: 600) --max_cpus VALUE - Maximum number of CPUs a single jailed process can use (default: 0 'no limit') + Maximum number of CPUs a single jailed process can use (default: 0 'no limit') --daemon|-d - Daemonize after start + Daemonize after start --verbose|-v - Verbose output + Verbose output --quiet|-q - Log warning and more important messages only + Log warning and more important messages only --really_quiet|-Q - Log fatal messages only + Log fatal messages only --keep_env|-e - Pass all environment variables to the child process (default: all envvars are cleared) + Pass all environment variables to the child process (default: all envars are 
cleared) --env|-E VALUE - Additional environment variable (can be used multiple times) + Additional environment variable (can be used multiple times). If the envar doesn't contain '=' (e.g. just the 'DISPLAY' string), the current envar value will be used --keep_caps - Don't drop any capabilities + Don't drop any capabilities --cap VALUE - Retain this capability, e.g. CAP_PTRACE (can be specified multiple times) + Retain this capability, e.g. CAP_PTRACE (can be specified multiple times) --silent - Redirect child process' fd:0/1/2 to /dev/null + Redirect child process' fd:0/1/2 to /dev/null --stderr_to_null - Redirect FD=2 (STDERR_FILENO) to /dev/null + Redirect child process' fd:2 (STDERR_FILENO) to /dev/null --skip_setsid - Don't call setsid(), allows for terminal signal handling in the sandboxed process. Dangerous + Don't call setsid(), allows for terminal signal handling in the sandboxed process. Dangerous --pass_fd VALUE - Don't close this FD before executing the child process (can be specified multiple times), by default: 0/1/2 are kept open + Don't close this FD before executing the child process (can be specified multiple times), by default: 0/1/2 are kept open --disable_no_new_privs - Don't set the prctl(NO_NEW_PRIVS, 1) (DANGEROUS) + Don't set the prctl(NO_NEW_PRIVS, 1) (DANGEROUS) --rlimit_as VALUE - RLIMIT_AS in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 512) + RLIMIT_AS in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 4096) --rlimit_core VALUE - RLIMIT_CORE in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 0) + RLIMIT_CORE in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 0) --rlimit_cpu VALUE - RLIMIT_CPU, 'max' or 'hard' for the 
current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 600) + RLIMIT_CPU, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 600) --rlimit_fsize VALUE - RLIMIT_FSIZE in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 1) + RLIMIT_FSIZE in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 1) --rlimit_nofile VALUE - RLIMIT_NOFILE, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 32) + RLIMIT_NOFILE, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 32) --rlimit_nproc VALUE - RLIMIT_NPROC, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 'soft') + RLIMIT_NPROC, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 'soft') --rlimit_stack VALUE - RLIMIT_STACK in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 'soft') + RLIMIT_STACK in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 'soft') + --rlimit_memlock VALUE + RLIMIT_MEMLOCK in KB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 'soft') + --rlimit_rtprio VALUE + RLIMIT_RTPRIO, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 'soft') + --rlimit_msgqueue VALUE + RLIMIT_MSGQUEUE in bytes, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for 
the current soft limit, 'inf' for RLIM64_INFINITY (default: 'soft') + --disable_rlimits + Disable all rlimits, default to limits set by parent --persona_addr_compat_layout - personality(ADDR_COMPAT_LAYOUT) + personality(ADDR_COMPAT_LAYOUT) --persona_mmap_page_zero - personality(MMAP_PAGE_ZERO) + personality(MMAP_PAGE_ZERO) --persona_read_implies_exec - personality(READ_IMPLIES_EXEC) + personality(READ_IMPLIES_EXEC) --persona_addr_limit_3gb - personality(ADDR_LIMIT_3GB) + personality(ADDR_LIMIT_3GB) --persona_addr_no_randomize - personality(ADDR_NO_RANDOMIZE) + personality(ADDR_NO_RANDOMIZE) --disable_clone_newnet|-N - Don't use CLONE_NEWNET. Enable global networking inside the jail + Don't use CLONE_NEWNET. Enable global networking inside the jail --disable_clone_newuser - Don't use CLONE_NEWUSER. Requires euid==0 + Don't use CLONE_NEWUSER. Requires euid==0 --disable_clone_newns - Don't use CLONE_NEWNS + Don't use CLONE_NEWNS --disable_clone_newpid - Don't use CLONE_NEWPID + Don't use CLONE_NEWPID --disable_clone_newipc - Don't use CLONE_NEWIPC + Don't use CLONE_NEWIPC --disable_clone_newuts - Don't use CLONE_NEWUTS + Don't use CLONE_NEWUTS --disable_clone_newcgroup - Don't use CLONE_NEWCGROUP. Might be required for kernel versions < 4.6 + Don't use CLONE_NEWCGROUP. Might be required for kernel versions < 4.6 + --enable_clone_newtime + Use CLONE_NEWTIME. Supported with kernel versions >= 5.3 --uid_mapping|-U VALUE - Add a custom uid mapping of the form inside_uid:outside_uid:count. Setting this requires newuidmap (set-uid) to be present + Add a custom uid mapping of the form inside_uid:outside_uid:count. Setting this requires newuidmap (set-uid) to be present --gid_mapping|-G VALUE - Add a custom gid mapping of the form inside_gid:outside_gid:count. Setting this requires newgidmap (set-uid) to be present + Add a custom gid mapping of the form inside_gid:outside_gid:count. 
Setting this requires newgidmap (set-uid) to be present --bindmount_ro|-R VALUE - List of mountpoints to be mounted --bind (ro) inside the container. Can be specified multiple times. Supports 'source' syntax, or 'source:dest' + List of mountpoints to be mounted --bind (ro) inside the container. Can be specified multiple times. Supports 'source' syntax, or 'source:dest' --bindmount|-B VALUE - List of mountpoints to be mounted --bind (rw) inside the container. Can be specified multiple times. Supports 'source' syntax, or 'source:dest' + List of mountpoints to be mounted --bind (rw) inside the container. Can be specified multiple times. Supports 'source' syntax, or 'source:dest' --tmpfsmount|-T VALUE - List of mountpoints to be mounted as tmpfs (R/W) inside the container. Can be specified multiple times. Supports 'dest' syntax. Alternatively, use '-m none:dest:tmpfs:size=8388608' + List of mountpoints to be mounted as tmpfs (R/W) inside the container. Can be specified multiple times. Supports 'dest' syntax. Alternatively, use '-m none:dest:tmpfs:size=8388608' --mount|-m VALUE - Arbitrary mount, format src:dst:fs_type:options + Arbitrary mount, format src:dst:fs_type:options --symlink|-s VALUE - Symlink, format src:dst + Symlink, format src:dst --disable_proc - Disable mounting procfs in the jail + Disable mounting procfs in the jail --proc_path VALUE - Path used to mount procfs (default: '/proc') + Path used to mount procfs (default: '/proc') --proc_rw - Is procfs mounted as R/W (default: R/O) + Is procfs mounted as R/W (default: R/O) --seccomp_policy|-P VALUE - Path to file containing seccomp-bpf policy (see kafel/) + Path to file containing seccomp-bpf policy (see kafel/) --seccomp_string VALUE - String with kafel seccomp-bpf policy (see kafel/) + String with kafel seccomp-bpf policy (see kafel/) --seccomp_log - Use SECCOMP_FILTER_FLAG_LOG. Log all actions except SECCOMP_RET_ALLOW). Supported since kernel version 4.14 + Use SECCOMP_FILTER_FLAG_LOG. 
Log all actions except SECCOMP_RET_ALLOW). Supported since kernel version 4.14 + --nice_level VALUE + Set jailed process niceness (-20 is highest -priority, 19 is lowest). By default, set to 19 --cgroup_mem_max VALUE - Maximum number of bytes to use in the group (default: '0' - disabled) + Maximum number of bytes to use in the group (default: '0' - disabled) + --cgroup_mem_memsw_max VALUE + Maximum number of memory+swap bytes to use (default: '0' - disabled) + --cgroup_mem_swap_max VALUE + Maximum number of swap bytes to use (default: '-1' - disabled) --cgroup_mem_mount VALUE - Location of memory cgroup FS (default: '/sys/fs/cgroup/memory') + Location of memory cgroup FS (default: '/sys/fs/cgroup/memory') --cgroup_mem_parent VALUE - Which pre-existing memory cgroup to use as a parent (default: 'NSJAIL') + Which pre-existing memory cgroup to use as a parent (default: 'NSJAIL') --cgroup_pids_max VALUE - Maximum number of pids in a cgroup (default: '0' - disabled) + Maximum number of pids in a cgroup (default: '0' - disabled) --cgroup_pids_mount VALUE - Location of pids cgroup FS (default: '/sys/fs/cgroup/pids') + Location of pids cgroup FS (default: '/sys/fs/cgroup/pids') --cgroup_pids_parent VALUE - Which pre-existing pids cgroup to use as a parent (default: 'NSJAIL') + Which pre-existing pids cgroup to use as a parent (default: 'NSJAIL') --cgroup_net_cls_classid VALUE - Class identifier of network packets in the group (default: '0' - disabled) + Class identifier of network packets in the group (default: '0' - disabled) --cgroup_net_cls_mount VALUE - Location of net_cls cgroup FS (default: '/sys/fs/cgroup/net_cls') + Location of net_cls cgroup FS (default: '/sys/fs/cgroup/net_cls') --cgroup_net_cls_parent VALUE - Which pre-existing net_cls cgroup to use as a parent (default: 'NSJAIL') + Which pre-existing net_cls cgroup to use as a parent (default: 'NSJAIL') --cgroup_cpu_ms_per_sec VALUE - Number of milliseconds of CPU time per second that the process group can use 
(default: '0' - no limit) + Number of milliseconds of CPU time per second that the process group can use (default: '0' - no limit) --cgroup_cpu_mount VALUE - Location of cpu cgroup FS (default: '/sys/fs/cgroup/cpu') + Location of cpu cgroup FS (default: '/sys/fs/cgroup/cpu') --cgroup_cpu_parent VALUE - Which pre-existing cpu cgroup to use as a parent (default: 'NSJAIL') + Which pre-existing cpu cgroup to use as a parent (default: 'NSJAIL') + --cgroupv2_mount VALUE + Location of cgroupv2 directory (default: '/sys/fs/cgroup') + --use_cgroupv2 + Use cgroup v2 --iface_no_lo - Don't bring the 'lo' interface up + Don't bring the 'lo' interface up --iface_own VALUE - Move this existing network interface into the new NET namespace. Can be specified multiple times + Move this existing network interface into the new NET namespace. Can be specified multiple times --macvlan_iface|-I VALUE - Interface which will be cloned (MACVLAN) and put inside the subprocess' namespace as 'vs' + Interface which will be cloned (MACVLAN) and put inside the subprocess' namespace as 'vs' --macvlan_vs_ip VALUE - IP of the 'vs' interface (e.g. "192.168.0.1") + IP of the 'vs' interface (e.g. "192.168.0.1") --macvlan_vs_nm VALUE - Netmask of the 'vs' interface (e.g. "255.255.255.0") + Netmask of the 'vs' interface (e.g. "255.255.255.0") --macvlan_vs_gw VALUE - Default GW for the 'vs' interface (e.g. "192.168.0.1") + Default GW for the 'vs' interface (e.g. "192.168.0.1") --macvlan_vs_ma VALUE - MAC-address of the 'vs' interface (e.g. "ba:ad:ba:be:45:00") + MAC-address of the 'vs' interface (e.g. "ba:ad:ba:be:45:00") + --macvlan_vs_mo VALUE + Mode of the 'vs' interface. Can be either 'private', 'vepa', 'bridge' or 'passthru' (default: 'private') + --disable_tsc + Disable rdtsc and rdtscp instructions. WARNING: To make it effective, you also need to forbid `prctl(PR_SET_TSC, PR_TSC_ENABLE, ...)` in seccomp rules! (x86 and x86_64 only). 
Dynamic binaries produced by GCC seem to rely on RDTSC, but static ones should work. - Examples: +Examples: Wait on a port 31337 for connections, and run /bin/sh nsjail -Ml --port 31337 --chroot / -- /bin/sh -i Re-run echo command as a sub-process @@ -65,7 +65,12 @@ static bool addPidToTaskList(const std::string& cgroup_path, pid_t pid) { } static bool initNsFromParentMem(nsjconf_t* nsjconf, pid_t pid) { - if (nsjconf->cgroup_mem_max == (size_t)0) { + size_t memsw_max = nsjconf->cgroup_mem_memsw_max; + if (nsjconf->cgroup_mem_swap_max >= (ssize_t)0) { + memsw_max = nsjconf->cgroup_mem_swap_max + nsjconf->cgroup_mem_max; + } + + if (nsjconf->cgroup_mem_max == (size_t)0 && memsw_max == (size_t)0) { return true; } @@ -73,16 +78,24 @@ static bool initNsFromParentMem(nsjconf_t* nsjconf, pid_t pid) { "/NSJAIL." + std::to_string(pid); RETURN_ON_FAILURE(createCgroup(mem_cgroup_path, pid)); - std::string mem_max_str = std::to_string(nsjconf->cgroup_mem_max); - RETURN_ON_FAILURE(writeToCgroup( - mem_cgroup_path + "/memory.limit_in_bytes", mem_max_str, "memory cgroup max limit")); - /* * Use OOM-killer instead of making processes hang/sleep */ RETURN_ON_FAILURE(writeToCgroup( mem_cgroup_path + "/memory.oom_control", "0", "memory cgroup oom control")); + if (nsjconf->cgroup_mem_max > (size_t)0) { + std::string mem_max_str = std::to_string(nsjconf->cgroup_mem_max); + RETURN_ON_FAILURE(writeToCgroup(mem_cgroup_path + "/memory.limit_in_bytes", + mem_max_str, "memory cgroup max limit")); + } + + if (memsw_max > (size_t)0) { + std::string mem_memsw_max_str = std::to_string(memsw_max); + RETURN_ON_FAILURE(writeToCgroup(mem_cgroup_path + "/memory.memsw.limit_in_bytes", + mem_memsw_max_str, "memory+Swap cgroup max limit")); + } + return addPidToTaskList(mem_cgroup_path, pid); } @@ -159,7 +172,7 @@ static void removeCgroup(const std::string& cgroup_path) { } void finishFromParent(nsjconf_t* nsjconf, pid_t pid) { - if (nsjconf->cgroup_mem_max != (size_t)0) { + if 
(nsjconf->cgroup_mem_max != (size_t)0 || nsjconf->cgroup_mem_memsw_max != (size_t)0) { std::string mem_cgroup_path = nsjconf->cgroup_mem_mount + '/' + nsjconf->cgroup_mem_parent + "/NSJAIL." + std::to_string(pid); @@ -84,14 +84,30 @@ static void removeCgroup(const std::string &cgroup_path) { } static bool initNsFromParentMem(nsjconf_t *nsjconf, pid_t pid) { - if (nsjconf->cgroup_mem_max == (size_t)0) { + ssize_t swap_max = nsjconf->cgroup_mem_swap_max; + if (nsjconf->cgroup_mem_memsw_max > (size_t)0) { + swap_max = nsjconf->cgroup_mem_memsw_max - nsjconf->cgroup_mem_max; + } + + if (nsjconf->cgroup_mem_max == (size_t)0 && swap_max < (ssize_t)0) { return true; } std::string cgroup_path = getCgroupPath(nsjconf, pid); RETURN_ON_FAILURE(createCgroup(cgroup_path, pid)); RETURN_ON_FAILURE(addPidToProcList(cgroup_path, pid)); - return writeToCgroup(cgroup_path, "memory.max", std::to_string(nsjconf->cgroup_mem_max)); + + if (nsjconf->cgroup_mem_max > (size_t)0) { + RETURN_ON_FAILURE(writeToCgroup( + cgroup_path, "memory.max", std::to_string(nsjconf->cgroup_mem_max))); + } + + if (swap_max >= (ssize_t)0) { + RETURN_ON_FAILURE( + writeToCgroup(cgroup_path, "memory.swap.max", std::to_string(swap_max))); + } + + return true; } static bool initNsFromParentPids(nsjconf_t *nsjconf, pid_t pid) { @@ -43,6 +43,7 @@ #include <unistd.h> #include <memory> +#include <sstream> #include <string> #include <vector> @@ -103,7 +104,7 @@ struct custom_option custom_opts[] = { { { "skip_setsid", no_argument, NULL, 0x0504 }, "Don't call setsid(), allows for terminal signal handling in the sandboxed process. 
Dangerous" }, { { "pass_fd", required_argument, NULL, 0x0505 }, "Don't close this FD before executing the child process (can be specified multiple times), by default: 0/1/2 are kept open" }, { { "disable_no_new_privs", no_argument, NULL, 0x0507 }, "Don't set the prctl(NO_NEW_PRIVS, 1) (DANGEROUS)" }, - { { "rlimit_as", required_argument, NULL, 0x0201 }, "RLIMIT_AS in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 512)" }, + { { "rlimit_as", required_argument, NULL, 0x0201 }, "RLIMIT_AS in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 4096)" }, { { "rlimit_core", required_argument, NULL, 0x0202 }, "RLIMIT_CORE in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 0)" }, { { "rlimit_cpu", required_argument, NULL, 0x0203 }, "RLIMIT_CPU, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 600)" }, { { "rlimit_fsize", required_argument, NULL, 0x0204 }, "RLIMIT_FSIZE in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM64_INFINITY (default: 1)" }, @@ -142,6 +143,8 @@ struct custom_option custom_opts[] = { { { "seccomp_log", no_argument, NULL, 0x0902 }, "Use SECCOMP_FILTER_FLAG_LOG. Log all actions except SECCOMP_RET_ALLOW). Supported since kernel version 4.14" }, { { "nice_level", required_argument, NULL, 0x0903 }, "Set jailed process niceness (-20 is highest -priority, 19 is lowest). 
By default, set to 19" }, { { "cgroup_mem_max", required_argument, NULL, 0x0801 }, "Maximum number of bytes to use in the group (default: '0' - disabled)" }, + { { "cgroup_mem_memsw_max", required_argument, NULL, 0x0804 }, "Maximum number of memory+swap bytes to use (default: '0' - disabled)" }, + { { "cgroup_mem_swap_max", required_argument, NULL, 0x0805 }, "Maximum number of swap bytes to use (default: '-1' - disabled)" }, { { "cgroup_mem_mount", required_argument, NULL, 0x0802 }, "Location of memory cgroup FS (default: '/sys/fs/cgroup/memory')" }, { { "cgroup_mem_parent", required_argument, NULL, 0x0803 }, "Which pre-existing memory cgroup to use as a parent (default: 'NSJAIL')" }, { { "cgroup_pids_max", required_argument, NULL, 0x0811 }, "Maximum number of pids in a cgroup (default: '0' - disabled)" }, @@ -163,6 +166,7 @@ struct custom_option custom_opts[] = { { { "macvlan_vs_gw", required_argument, NULL, 0x703 }, "Default GW for the 'vs' interface (e.g. \"192.168.0.1\")" }, { { "macvlan_vs_ma", required_argument, NULL, 0x705 }, "MAC-address of the 'vs' interface (e.g. \"ba:ad:ba:be:45:00\")" }, { { "macvlan_vs_mo", required_argument, NULL, 0x706 }, "Mode of the 'vs' interface. Can be either 'private', 'vepa', 'bridge' or 'passthru' (default: 'private')" }, + { { "disable_tsc", no_argument, NULL, 0x707 }, "Disable rdtsc and rdtscp instructions. WARNING: To make it effective, you also need to forbid `prctl(PR_SET_TSC, PR_TSC_ENABLE, ...)` in seccomp rules! (x86 and x86_64 only). Dynamic binaries produced by GCC seem to rely on RDTSC, but static ones should work." 
}, }; // clang-format on @@ -456,6 +460,8 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) { nsjconf->cgroup_mem_mount = "/sys/fs/cgroup/memory"; nsjconf->cgroup_mem_parent = "NSJAIL"; nsjconf->cgroup_mem_max = (size_t)0; + nsjconf->cgroup_mem_memsw_max = (size_t)0; + nsjconf->cgroup_mem_swap_max = (ssize_t)-1; nsjconf->cgroup_pids_mount = "/sys/fs/cgroup/pids"; nsjconf->cgroup_pids_parent = "NSJAIL"; nsjconf->cgroup_pids_max = 0U; @@ -473,6 +479,7 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) { nsjconf->iface_vs_gw = "0.0.0.0"; nsjconf->iface_vs_ma = ""; nsjconf->iface_vs_mo = "private"; + nsjconf->disable_tsc = false; nsjconf->orig_uid = getuid(); nsjconf->orig_euid = geteuid(); nsjconf->num_cpus = sysconf(_SC_NPROCESSORS_ONLN); @@ -777,7 +784,12 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) { dst = src; } std::string fs_type = argFromVec(subopts, 2); - std::string options = argFromVec(subopts, 3); + std::stringstream optionsStream; + optionsStream << argFromVec(subopts, 3); + for (std::size_t i = 4; i < subopts.size(); ++i) { + optionsStream << ":" << subopts[i]; + } + std::string options = optionsStream.str(); if (!mnt::addMountPtTail(nsjconf.get(), src, dst, /* fstype= */ fs_type, /* options= */ options, /* flags= */ 0, /* is_dir= */ mnt::NS_DIR_MAYBE, /* is_mandatory= */ true, @@ -846,6 +858,9 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) { case 0x706: nsjconf->iface_vs_mo = parseMACVlanMode(optarg); break; + case 0x707: + nsjconf->disable_tsc = true; + break; case 0x801: nsjconf->cgroup_mem_max = (size_t)strtoull(optarg, NULL, 0); break; @@ -855,6 +870,12 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) { case 0x803: nsjconf->cgroup_mem_parent = optarg; break; + case 0x804: + nsjconf->cgroup_mem_memsw_max = (size_t)strtoull(optarg, NULL, 0); + break; + case 0x805: + nsjconf->cgroup_mem_swap_max = (ssize_t)strtoll(optarg, NULL, 0); + break; case 0x811: nsjconf->cgroup_pids_max 
= (unsigned int)strtoul(optarg, NULL, 0); break; @@ -918,6 +939,11 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) { } setupUsers(nsjconf.get()); + if (nsjconf->cgroup_mem_memsw_max > (size_t)0 && + nsjconf->cgroup_mem_swap_max >= (ssize_t)0) { + LOG_F("cannot set both cgroup_mem_memsw_max and cgroup_mem_swap_max"); + } + return nsjconf; } @@ -251,6 +251,8 @@ static bool configParseInternal(nsjconf_t* nsjconf, const nsjail::NsJailConfig& nsjconf->nice_level = njc.nice_level(); nsjconf->cgroup_mem_max = njc.cgroup_mem_max(); + nsjconf->cgroup_mem_memsw_max = njc.cgroup_mem_memsw_max(); + nsjconf->cgroup_mem_swap_max = njc.cgroup_mem_swap_max(); nsjconf->cgroup_mem_mount = njc.cgroup_mem_mount(); nsjconf->cgroup_mem_parent = njc.cgroup_mem_parent(); nsjconf->cgroup_pids_max = njc.cgroup_pids_max(); @@ -278,6 +280,8 @@ static bool configParseInternal(nsjconf_t* nsjconf, const nsjail::NsJailConfig& nsjconf->iface_vs_ma = njc.macvlan_vs_ma(); nsjconf->iface_vs_mo = njc.macvlan_vs_mo(); + nsjconf->disable_tsc = njc.disable_tsc(); + if (njc.has_exec_bin()) { if (njc.exec_bin().has_path()) { nsjconf->exec_file = njc.exec_bin().path(); @@ -322,11 +326,11 @@ bool parseFile(nsjconf_t* nsjconf, const char* file) { return false; } if (!configParseInternal(nsjconf, nsc)) { - LOG_W("Couldn't parse the ProtoBuf"); + LOG_W("Couldn't parse the ProtoBuf from '%s'", file); return false; } - LOG_D("Parsed config:\n'%s'", nsc.DebugString().c_str()); + LOG_D("Parsed config from '%s':\n'%s'", file, nsc.DebugString().c_str()); return true; } diff --git a/config.proto b/config.proto index 96a2b10..47847cc 100644 --- a/config.proto +++ b/config.proto @@ -211,6 +211,10 @@ message NsJailConfig { /* If > 0, maximum cumulative size of RAM used inside any jail */ optional uint64 cgroup_mem_max = 67 [default = 0]; /* In bytes */ + /* If > 0, maximum cumulative size of RAM + swap used inside any jail */ + optional uint64 cgroup_mem_memsw_max = 91 [default = 0]; /* In bytes */ + /* 
If >= 0, maximum cumulative size of swap used inside any jail */ + optional int64 cgroup_mem_swap_max = 92 [default = -1]; /* In bytes */ /* Mount point for cgroups-memory in your system */ optional string cgroup_mem_mount = 68 [default = "/sys/fs/cgroup/memory"]; /* Writeable directory (for the nsjail user) under cgroup_mem_mount */ @@ -262,4 +266,6 @@ message NsJailConfig { /* Binary path (with arguments) to be executed. If not specified here, it can be specified with cmd-line as "-- /path/to/command arg1 arg2" */ optional Exe exec_bin = 90; + + optional bool disable_tsc = 93 [default = false]; } diff --git a/configs/apache.cfg b/configs/apache.cfg index a1f2ff6..7f954c8 100644 --- a/configs/apache.cfg +++ b/configs/apache.cfg @@ -120,7 +120,7 @@ mount { is_bind: true } -seccomp_string: " KILL {" +seccomp_string: " KILL_PROCESS {" seccomp_string: " ptrace," seccomp_string: " process_vm_readv," seccomp_string: " process_vm_writev" diff --git a/configs/bash-with-fake-geteuid.cfg b/configs/bash-with-fake-geteuid.cfg index 99a36af..ef873e1 100644 --- a/configs/bash-with-fake-geteuid.cfg +++ b/configs/bash-with-fake-geteuid.cfg @@ -177,7 +177,7 @@ mount { seccomp_string: "ERRNO(1337) { geteuid } " seccomp_string: "ERRNO(0) { ptrace } " -seccomp_string: "KILL { syslog } " +seccomp_string: "KILL_PROCESS { syslog } " seccomp_string: "DEFAULT ALLOW " exec_bin { diff --git a/configs/demo-dont-use-chrome-with-net.cfg b/configs/demo-dont-use-chrome-with-net.cfg index c6c6a5f..bf96ea3 100644 --- a/configs/demo-dont-use-chrome-with-net.cfg +++ b/configs/demo-dont-use-chrome-with-net.cfg @@ -167,7 +167,7 @@ mount { is_bind: true } -seccomp_string: " KILL {" +seccomp_string: " KILL_PROCESS {" seccomp_string: " ptrace," seccomp_string: " process_vm_readv," seccomp_string: " process_vm_writev" diff --git a/configs/firefox-with-cloned-net.cfg b/configs/firefox-with-cloned-net.cfg index 180ed9a..b949018 100644 --- a/configs/firefox-with-cloned-net.cfg +++ 
b/configs/firefox-with-cloned-net.cfg @@ -168,7 +168,7 @@ mount { is_bind: true } -seccomp_string: "KILL {" +seccomp_string: "KILL_PROCESS {" seccomp_string: " ptrace," seccomp_string: " process_vm_readv," seccomp_string: " process_vm_writev" diff --git a/configs/firefox-with-net-wayland.cfg b/configs/firefox-with-net-wayland.cfg index b132018..4a2edf6 100644 --- a/configs/firefox-with-net-wayland.cfg +++ b/configs/firefox-with-net-wayland.cfg @@ -29,7 +29,7 @@ envar: "FONTCONFIG_FILE=/etc/fonts/fonts.conf" envar: "FC_CONFIG_FILE=/etc/fonts/fonts.conf" envar: "MOZ_ENABLE_WAYLAND=1" envar: "XDG_RUNTIME_DIR=/user/run/" -envar: "WAYLAND_DISPLAY=wayland-0" +envar: "WAYLAND_DISPLAY" rlimit_as: 4096 rlimit_cpu: 1000 diff --git a/configs/firefox-with-net.cfg b/configs/firefox-with-net.cfg index b88f8ea..633a5c1 100644 --- a/configs/firefox-with-net.cfg +++ b/configs/firefox-with-net.cfg @@ -160,7 +160,7 @@ mount { is_bind: true } -seccomp_string: "KILL {" +seccomp_string: "KILL_PROCESS {" seccomp_string: " ptrace," seccomp_string: " process_vm_readv," seccomp_string: " process_vm_writev" diff --git a/configs/home-documents-with-xorg-no-net.cfg b/configs/home-documents-with-xorg-no-net.cfg index 83cfb42..a701b1e 100644 --- a/configs/home-documents-with-xorg-no-net.cfg +++ b/configs/home-documents-with-xorg-no-net.cfg @@ -142,7 +142,7 @@ mount { is_bind: true } -seccomp_string: "KILL {" +seccomp_string: "KILL_PROCESS {" seccomp_string: " ptrace," seccomp_string: " process_vm_readv," seccomp_string: " process_vm_writev" diff --git a/configs/imagemagick-convert.cfg b/configs/imagemagick-convert.cfg index 479b293..8a3fe57 100644 --- a/configs/imagemagick-convert.cfg +++ b/configs/imagemagick-convert.cfg @@ -9,7 +9,7 @@ description: "Run as:" description: "" description: "./nsjail --config imagemagick-convert.cfg -- /usr/bin/convert jpg:/user/Documents/input.jpg png:/user/Documents/output.png " description: "or " -description: "./nsjail --config imagemagick-convert.cfg -- 
/usr/bin/convert jpg:- png:- <file.jpg >file.png +description: "./nsjail --config imagemagick-convert.cfg -- /usr/bin/convert jpg:- png:- <file.jpg >file.png" mode: ONCE hostname: "IM-CONVERT" @@ -83,7 +83,7 @@ seccomp_string: " arch_prctl, sched_getaffinity, set_tid_address," seccomp_string: " clock_gettime, set_robust_list, exit_group," seccomp_string: " clone, getcwd, pread64, readlink, prlimit64, madvise" seccomp_string: "}" -seccomp_string: "DEFAULT KILL" +seccomp_string: "DEFAULT KILL_PROCESS" exec_bin { path: "" diff --git a/configs/xchat-with-net.cfg b/configs/xchat-with-net.cfg index 04c361b..19725d1 100644 --- a/configs/xchat-with-net.cfg +++ b/configs/xchat-with-net.cfg @@ -132,7 +132,7 @@ mount { is_bind: true } -seccomp_string: "KILL {" +seccomp_string: "KILL_PROCESS {" seccomp_string: " ptrace," seccomp_string: " process_vm_readv," seccomp_string: " process_vm_writev" diff --git a/configs/znc-with-net.cfg b/configs/znc-with-net.cfg index bdcc53e..13c5107 100644 --- a/configs/znc-with-net.cfg +++ b/configs/znc-with-net.cfg @@ -122,7 +122,7 @@ mount { mandatory: true } -seccomp_string: "KILL {" +seccomp_string: "KILL_PROCESS {" seccomp_string: " ptrace," seccomp_string: " process_vm_readv," seccomp_string: " process_vm_writev" @@ -119,6 +119,22 @@ static bool containCPU(nsjconf_t* nsjconf) { return cpu::initCpu(nsjconf); } +static bool containTSC(nsjconf_t* nsjconf) { + if (nsjconf->disable_tsc) { +#if defined(__x86_64__) || defined(__i386__) + if (prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0) == -1) { + PLOG_E("prctl(PR_SET_TSC, PR_TSC_SIGSEGV)"); + return false; + } +#else /* defined(__x86_64__) || defined(__i386__) */ + LOG_W( + "prctl(PR_SET_TSC, PR_TSC_SIGSEGV) requested, but it's supported under " + "x86/x86-64 CPU architectures only. 
Ignoring it!"); +#endif /* defined(__x86_64__) || defined(__i386__) */ + } + return true; +} + static bool containSetLimits(nsjconf_t* nsjconf) { if (nsjconf->disable_rl) { return true; @@ -326,6 +342,7 @@ bool containProc(nsjconf_t* nsjconf) { /* */ /* As non-root */ RETURN_ON_FAILURE(containCPU(nsjconf)); + RETURN_ON_FAILURE(containTSC(nsjconf)); RETURN_ON_FAILURE(containSetLimits(nsjconf)); RETURN_ON_FAILURE(containPrepareEnv(nsjconf)); RETURN_ON_FAILURE(containMakeFdsCOE(nsjconf)); @@ -156,9 +156,7 @@ void logMsg(enum llevel_t ll, const char* fn, int ln, bool perr, const char* fmt msg.append("\n"); /* End printing logs */ - if (write(_log_fd, msg.c_str(), msg.size()) == -1) { - dprintf(_log_fd, "%s", msg.c_str()); - } + TEMP_FAILURE_RETRY(write(_log_fd, msg.c_str(), msg.size())); if (ll == FATAL) { exit(0xff); @@ -29,21 +29,21 @@ namespace logs { -#define LOG_HELP(...) logs::logMsg(logs::HELP, __PRETTY_FUNCTION__, __LINE__, false, __VA_ARGS__); +#define LOG_HELP(...) logs::logMsg(logs::HELP, __FUNCTION__, __LINE__, false, __VA_ARGS__); #define LOG_HELP_BOLD(...) \ - logs::logMsg(logs::HELP_BOLD, __PRETTY_FUNCTION__, __LINE__, false, __VA_ARGS__); - -#define LOG_D(...) logs::logMsg(logs::DEBUG, __PRETTY_FUNCTION__, __LINE__, false, __VA_ARGS__); -#define LOG_I(...) logs::logMsg(logs::INFO, __PRETTY_FUNCTION__, __LINE__, false, __VA_ARGS__); -#define LOG_W(...) logs::logMsg(logs::WARNING, __PRETTY_FUNCTION__, __LINE__, false, __VA_ARGS__); -#define LOG_E(...) logs::logMsg(logs::ERROR, __PRETTY_FUNCTION__, __LINE__, false, __VA_ARGS__); -#define LOG_F(...) logs::logMsg(logs::FATAL, __PRETTY_FUNCTION__, __LINE__, false, __VA_ARGS__); - -#define PLOG_D(...) logs::logMsg(logs::DEBUG, __PRETTY_FUNCTION__, __LINE__, true, __VA_ARGS__); -#define PLOG_I(...) logs::logMsg(logs::INFO, __PRETTY_FUNCTION__, __LINE__, true, __VA_ARGS__); -#define PLOG_W(...) logs::logMsg(logs::WARNING, __PRETTY_FUNCTION__, __LINE__, true, __VA_ARGS__); -#define PLOG_E(...) 
logs::logMsg(logs::ERROR, __PRETTY_FUNCTION__, __LINE__, true, __VA_ARGS__); -#define PLOG_F(...) logs::logMsg(logs::FATAL, __PRETTY_FUNCTION__, __LINE__, true, __VA_ARGS__); + logs::logMsg(logs::HELP_BOLD, __FUNCTION__, __LINE__, false, __VA_ARGS__); + +#define LOG_D(...) logs::logMsg(logs::DEBUG, __FUNCTION__, __LINE__, false, __VA_ARGS__); +#define LOG_I(...) logs::logMsg(logs::INFO, __FUNCTION__, __LINE__, false, __VA_ARGS__); +#define LOG_W(...) logs::logMsg(logs::WARNING, __FUNCTION__, __LINE__, false, __VA_ARGS__); +#define LOG_E(...) logs::logMsg(logs::ERROR, __FUNCTION__, __LINE__, false, __VA_ARGS__); +#define LOG_F(...) logs::logMsg(logs::FATAL, __FUNCTION__, __LINE__, false, __VA_ARGS__); + +#define PLOG_D(...) logs::logMsg(logs::DEBUG, __FUNCTION__, __LINE__, true, __VA_ARGS__); +#define PLOG_I(...) logs::logMsg(logs::INFO, __FUNCTION__, __LINE__, true, __VA_ARGS__); +#define PLOG_W(...) logs::logMsg(logs::WARNING, __FUNCTION__, __LINE__, true, __VA_ARGS__); +#define PLOG_E(...) logs::logMsg(logs::ERROR, __FUNCTION__, __LINE__, true, __VA_ARGS__); +#define PLOG_F(...) 
logs::logMsg(logs::FATAL, __FUNCTION__, __LINE__, true, __VA_ARGS__); enum llevel_t { DEBUG = 0, @@ -65,6 +65,6 @@ static void __attribute__ ((unused)) __clang_cleanup_func(void (^*dfunc) (void)) #endif #define NS_VALSTR_STRUCT(x) \ - { x, #x } + { (uint64_t) x, #x } #endif /* NS_COMMON_H */ @@ -57,36 +57,42 @@ static const std::string flagsToStr(uintptr_t flags) { std::string res; struct { - const uintptr_t flag; + const uint64_t flag; const char* const name; } static const mountFlags[] = { - NS_VALSTR_STRUCT(MS_RDONLY), - NS_VALSTR_STRUCT(MS_NOSUID), - NS_VALSTR_STRUCT(MS_NODEV), - NS_VALSTR_STRUCT(MS_NOEXEC), - NS_VALSTR_STRUCT(MS_SYNCHRONOUS), - NS_VALSTR_STRUCT(MS_REMOUNT), - NS_VALSTR_STRUCT(MS_MANDLOCK), - NS_VALSTR_STRUCT(MS_DIRSYNC), - NS_VALSTR_STRUCT(MS_NOATIME), - NS_VALSTR_STRUCT(MS_NODIRATIME), - NS_VALSTR_STRUCT(MS_BIND), - NS_VALSTR_STRUCT(MS_MOVE), - NS_VALSTR_STRUCT(MS_REC), - NS_VALSTR_STRUCT(MS_SILENT), - NS_VALSTR_STRUCT(MS_POSIXACL), - NS_VALSTR_STRUCT(MS_UNBINDABLE), - NS_VALSTR_STRUCT(MS_PRIVATE), - NS_VALSTR_STRUCT(MS_SLAVE), - NS_VALSTR_STRUCT(MS_SHARED), - NS_VALSTR_STRUCT(MS_RELATIME), - NS_VALSTR_STRUCT(MS_KERNMOUNT), - NS_VALSTR_STRUCT(MS_I_VERSION), - NS_VALSTR_STRUCT(MS_STRICTATIME), - NS_VALSTR_STRUCT(MS_LAZYTIME), + NS_VALSTR_STRUCT(MS_RDONLY), + NS_VALSTR_STRUCT(MS_NOSUID), + NS_VALSTR_STRUCT(MS_NODEV), + NS_VALSTR_STRUCT(MS_NOEXEC), + NS_VALSTR_STRUCT(MS_SYNCHRONOUS), + NS_VALSTR_STRUCT(MS_REMOUNT), + NS_VALSTR_STRUCT(MS_MANDLOCK), + NS_VALSTR_STRUCT(MS_DIRSYNC), + NS_VALSTR_STRUCT(MS_NOATIME), + NS_VALSTR_STRUCT(MS_NODIRATIME), + NS_VALSTR_STRUCT(MS_BIND), + NS_VALSTR_STRUCT(MS_MOVE), + NS_VALSTR_STRUCT(MS_REC), + NS_VALSTR_STRUCT(MS_SILENT), + NS_VALSTR_STRUCT(MS_POSIXACL), + NS_VALSTR_STRUCT(MS_UNBINDABLE), + NS_VALSTR_STRUCT(MS_PRIVATE), + NS_VALSTR_STRUCT(MS_SLAVE), + NS_VALSTR_STRUCT(MS_SHARED), + NS_VALSTR_STRUCT(MS_RELATIME), + NS_VALSTR_STRUCT(MS_KERNMOUNT), + NS_VALSTR_STRUCT(MS_I_VERSION), + 
NS_VALSTR_STRUCT(MS_STRICTATIME), + NS_VALSTR_STRUCT(MS_LAZYTIME), +#if defined(MS_ACTIVE) + NS_VALSTR_STRUCT(MS_ACTIVE), +#endif /* defined(MS_ACTIVE) */ +#if defined(MS_NOUSER) + NS_VALSTR_STRUCT(MS_NOUSER), +#endif /* defined(MS_NOUSER) */ }; - uintptr_t knownFlagMask = 0U; + uint64_t knownFlagMask = 0U; for (const auto& i : mountFlags) { if (flags & i.flag) { if (!res.empty()) { @@ -144,7 +150,7 @@ static bool mountPt(mount_t* mpt, const char* newroot, const char* tmpdir) { LOG_D("symlink('%s', '%s')", srcpath, dstpath); if (symlink(srcpath, dstpath) == -1) { if (mpt->is_mandatory) { - PLOG_W("symlink('%s', '%s')", srcpath, dstpath); + PLOG_E("symlink('%s', '%s')", srcpath, dstpath); return false; } else { PLOG_W("symlink('%s', '%s'), but it's not mandatory, continuing", @@ -390,6 +396,7 @@ static bool initCloneNs(nsjconf_t* nsjconf) { for (auto& p : nsjconf->mountpts) { if (!mountPt(&p, destdir->c_str(), tmpdir->c_str()) && p.is_mandatory) { + LOG_E("Couldn't mount '%s'", p.dst.c_str()); return false; } } @@ -119,7 +119,7 @@ Don't close this FD before executing the child process (can be specified multipl Don't set the prctl(NO_NEW_PRIVS, 1) (DANGEROUS) .TP \fB\-\-rlimit_as\fR VALUE -RLIMIT_AS in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM_INFINITY (default: 512) +RLIMIT_AS in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current soft limit, 'inf' for RLIM_INFINITY (default: 4096) .TP \fB\-\-rlimit_core\fR VALUE RLIMIT_CORE in MB, 'max' or 'hard' for the current hard limit, 'def' or 'soft' for the current limit, 'inf' for RLIM_INFINITY (default: 0) @@ -220,6 +220,12 @@ Use SECCOMP_FILTER_FLAG_LOG. Log all actions except SECCOMP_RET_ALLOW. 
Supported \fB\-\-cgroup_mem_max\fR VALUE Maximum number of bytes to use in the group (default: '0' \- disabled) .TP +\fB\-\-cgroup_mem_memsw_max\fR VALUE +Maximum number of memory+Swap bytes to use in the group (default: '0' \- disabled) +.TP +\fB\-\-cgroup_mem_swap_max\fR VALUE +Maximum number of swap bytes to use in the group (default: '-1' \- disabled) +.TP \fB\-\-cgroup_mem_mount\fR VALUE Location of memory cgroup FS (default: '/sys/fs/cgroup/memory') .TP @@ -145,9 +145,12 @@ struct nsjconf_t { std::string iface_vs_gw; std::string iface_vs_ma; std::string iface_vs_mo; + bool disable_tsc; std::string cgroup_mem_mount; std::string cgroup_mem_parent; size_t cgroup_mem_max; + size_t cgroup_mem_memsw_max; + ssize_t cgroup_mem_swap_max; std::string cgroup_pids_mount; std::string cgroup_pids_parent; unsigned int cgroup_pids_max; @@ -67,7 +67,7 @@ static const std::string cloneFlagsToStr(uintptr_t flags) { std::string res; struct { - const uintptr_t flag; + const uint64_t flag; const char* const name; } static const cloneFlags[] = { NS_VALSTR_STRUCT(CLONE_NEWTIME), @@ -100,7 +100,7 @@ static const std::string cloneFlagsToStr(uintptr_t flags) { NS_VALSTR_STRUCT(CLONE_IO), }; - uintptr_t knownFlagMask = 0; + uint64_t knownFlagMask = 0; for (const auto& i : cloneFlags) { if (flags & i.flag) { if (!res.empty()) { @@ -266,10 +266,11 @@ static void seccompViolation(nsjconf_t* nsjconf, siginfo_t* si) { const auto& p = nsjconf->pids.find(si->si_pid); if (p == nsjconf->pids.end()) { LOG_W( - "pid=%d SiSyscall: %d, SiCode: %d, SiErrno: %d, SiSigno: %d. (If " - "SiSyscall==31, then it's most likely the SIGSYS value. 
See 'dmesg' or " - "'journalctl -ek' for possible auditd report with more data)", - (int)si->si_pid, si->si_syscall, si->si_code, si->si_errno, si->si_signo); + "pid=%d SiStatus:%d SiUid:%d SiUtime:%ld SiStime:%ld (If " + "SiStatus==31 (SIGSYS), then see 'dmesg' or 'journalctl -ek' for possible " + "auditd report with more data)", + (int)si->si_pid, si->si_status, si->si_uid, (long)si->si_utime, + (long)si->si_stime); LOG_E("Couldn't find pid element in the subproc list for pid=%d", (int)si->si_pid); return; } @@ -278,10 +279,11 @@ static void seccompViolation(nsjconf_t* nsjconf, siginfo_t* si) { ssize_t rdsize = util::readFromFd(p->second.pid_syscall_fd, buf, sizeof(buf) - 1); if (rdsize < 1) { LOG_W( - "pid=%d, SiSyscall: %d, SiCode: %d, SiErrno: %d, SiSigno: %d. (If " - "SiSyscall==31, then it's most likely the SIGSYS value. See 'dmesg' or " - "'journalctl -ek' for possible auditd report with more data)", - (int)si->si_pid, si->si_syscall, si->si_code, si->si_errno, si->si_signo); + "pid=%d SiStatus:%d SiUid:%d SiUtime:%ld SiStime:%ld (If " + "SiStatus==31 (SIGSYS), then see 'dmesg' or 'journalctl -ek' for possible " + "auditd report with more data)", + (int)si->si_pid, si->si_status, si->si_uid, (long)si->si_utime, + (long)si->si_stime); return; } buf[rdsize - 1] = '\0'; @@ -292,23 +294,24 @@ static void seccompViolation(nsjconf_t* nsjconf, siginfo_t* si) { &arg4, &arg5, &arg6, &sp, &pc); if (ret == 9) { LOG_W( - "pid=%d, Syscall number: %td, Arguments: %#tx, %#tx, %#tx, %#tx, %#tx, %#tx, " - "SP: %#tx, PC: %#tx, si_syscall: %d, si_errno: %#x", - (int)si->si_pid, sc, arg1, arg2, arg3, arg4, arg5, arg6, sp, pc, si->si_syscall, - si->si_errno); + "pid=%d, Syscall number:%td, Arguments:%#tx, %#tx, %#tx, %#tx, %#tx, %#tx, " + "SP:%#tx, PC:%#tx, si_status:%d", + (int)si->si_pid, sc, arg1, arg2, arg3, arg4, arg5, arg6, sp, pc, si->si_status); } else if (ret == 3) { LOG_W( - "pid=%d, SiSyscall: %d, SiCode: %d, SiErrno: %d, SiSigno: %d, SP: %#tx, PC: " - "%#tx (If 
SiSyscall==31, then it's most likely the SIGSYS value. See 'dmesg' " - "or 'journalctl -ek' for possible auditd report with more data)", - (int)si->si_pid, si->si_syscall, si->si_code, si->si_errno, si->si_signo, arg1, - arg2); + "pid=%d SiStatus:%d SiUid:%d SiUtime:%ld SiStime:%ld SP:%#tx, PC:%#tx (If " + "SiStatus==31 (SIGSYS), then see 'dmesg' or 'journalctl -ek' for possible " + "auditd report with more data)", + (int)si->si_pid, si->si_status, si->si_uid, (long)si->si_utime, + (long)si->si_stime, arg1, arg2); + return; } else { LOG_W( - "pid=%d, SiSyscall: %d, SiCode: %d, SiErrno: %d, Syscall string '%s'. (If " - "SiSyscall==31, then it's most likely the SIGSYS value. See 'dmesg' or " - "'journalctl -ek' for possible auditd report with more data)", - (int)si->si_pid, si->si_syscall, si->si_code, si->si_errno, buf); + "pid=%d SiStatus:%d SiUid:%d SiUtime:%ld SiStime:%ld (If " + "SiStatus==31 (SIGSYS), then see 'dmesg' or 'journalctl -ek' for possible " + "auditd report with more data)", + (int)si->si_pid, si->si_status, si->si_uid, (long)si->si_utime, + (long)si->si_stime); } } @@ -526,19 +529,9 @@ pid_t cloneProc(uintptr_t flags, int exit_signal) { } #if defined(__NR_clone3) - struct clone_args ca = { - .flags = (uint64_t)flags, - .pidfd = 0, - .child_tid = 0, - .parent_tid = 0, - .exit_signal = (uint64_t)exit_signal, - .stack = 0, - .stack_size = 0, - .tls = 0, - .set_tid = 0, - .set_tid_size = 0, - .cgroup = 0, - }; + struct clone_args ca = {}; + ca.flags = (uint64_t)flags; + ca.exit_signal = (uint64_t)exit_signal; pid_t ret = util::syscall(__NR_clone3, (uintptr_t)&ca, sizeof(ca)); if (ret != -1 || errno != ENOSYS) { |