aboutsummaryrefslogtreecommitdiff
path: root/sandboxed_api/sandbox2/forkserver.cc
blob: 1106e02de51317c30ce73275dc71f1914390a000 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Implementation of the sandbox2::ForkServer class.

#include "sandboxed_api/sandbox2/forkserver.h"

#include <fcntl.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <sched.h>
#include <sys/eventfd.h>
#include <sys/prctl.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/wait.h>
#include <syscall.h>
#include <unistd.h>

#include <cerrno>
#include <csignal>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <initializer_list>
#include <string>
#include <utility>
#include <vector>

#include "absl/base/attributes.h"
#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "absl/strings/match.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_join.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "sys/capability.h" // AOSP: match libcap exported includes
#include "sandboxed_api/sandbox2/client.h"
#include "sandboxed_api/sandbox2/comms.h"
#include "sandboxed_api/sandbox2/fork_client.h"
#include "sandboxed_api/sandbox2/forkserver.pb.h"
#include "sandboxed_api/sandbox2/namespace.h"
#include "sandboxed_api/sandbox2/policy.h"
#include "sandboxed_api/sandbox2/sanitizer.h"
#include "sandboxed_api/sandbox2/syscall.h"
#include "sandboxed_api/sandbox2/util.h"
#include "sandboxed_api/sandbox2/util/bpf_helper.h"
#include "sandboxed_api/util/fileops.h"
#include "sandboxed_api/util/raw_logging.h"
#include "sandboxed_api/util/strerror.h"

namespace sandbox2 {
namespace {

using ::sapi::StrError;
using ::sapi::file_util::fileops::FDCloser;

// "Moves" FDs in move_fds from current to target FD number while keeping FDs
// in keep_fds open - potentially moving them to another FD number as well in
// case of collisions.
// Ignores invalid (-1) fds.
void MoveFDs(std::initializer_list<std::pair<int*, int>> move_fds,
             std::initializer_list<int*> keep_fds) {
  // Maps every FD number currently occupied by a tracked descriptor to the
  // variable holding that number, so the variable can be patched whenever the
  // descriptor has to be relocated out of the way.
  absl::flat_hash_map<int, int*> fd_map;
  for (int* fd : keep_fds) {
    if (*fd != -1) {
      fd_map.emplace(*fd, fd);
    }
  }

  for (const auto& move : move_fds) {
    int* old_fd = move.first;
    if (*old_fd != -1) {
      fd_map.emplace(*old_fd, old_fd);
    }
  }

  for (auto [old_fd, new_fd] : move_fds) {
    // Nothing to do for invalid FDs or FDs that already sit at their target.
    if (*old_fd == -1 || *old_fd == new_fd) {
      continue;
    }

    // Make sure we won't override another tracked fd: if the target number is
    // in use, duplicate that descriptor elsewhere and update its owner.
    if (auto it = fd_map.find(new_fd); it != fd_map.end()) {
      // Copy the owner pointer out of the map before mutating the map.
      int* owner = it->second;
      int fd = dup(new_fd);
      SAPI_RAW_CHECK(fd != -1, "Duplicating an FD failed.");
      *owner = fd;
      // Erase before inserting: an insertion may rehash the table, which
      // would invalidate `it` (and any reference obtained through it).
      fd_map.erase(it);
      fd_map.emplace(fd, owner);
    }

    // dup2() implicitly closes whatever was previously open at new_fd (which,
    // if it was tracked, has just been duplicated above).
    if (dup2(*old_fd, new_fd) == -1) {
      SAPI_RAW_PLOG(FATAL, "Moving temporary to proper FD failed.");
    }

    close(*old_fd);
    fd_map.erase(*old_fd);
    *old_fd = new_fd;
  }
}

// Body of the custom init process: renames itself, installs a minimal seccomp
// filter and then reaps children until the process `main_pid` exits.
//
// If `pipe_fd` holds a valid descriptor, the exit information of `main_pid`
// (si_code, si_status) followed by the children's rusage is written to it
// before exiting. Never returns: exits with 0 once `main_pid` has been reaped,
// or with 1 if waitid() fails.
ABSL_ATTRIBUTE_NORETURN void RunInitProcess(pid_t main_pid, FDCloser pipe_fd) {
  // Failing to set the process name is not fatal, hence only a warning.
  if (prctl(PR_SET_NAME, "S2-INIT-PROC", 0, 0, 0) != 0) {
    SAPI_RAW_PLOG(WARNING, "prctl(PR_SET_NAME, 'S2-INIT-PROC')");
  }

  // Clear SA_NOCLDWAIT (installed by ForkServer::Initialize()) so that
  // terminated children can be waited for below.
  struct sigaction sa;
  sa.sa_handler = SIG_DFL;
  sa.sa_flags = 0;
  sigemptyset(&sa.sa_mask);
  SAPI_RAW_CHECK(sigaction(SIGCHLD, &sa, nullptr) == 0,
                 "clearing SA_NOCLDWAIT");

  // Apply seccomp. The filter only admits the syscalls used by the reaping
  // loop below; everything else (and any foreign architecture) ends in DENY.
  std::vector<sock_filter> code = {
      LOAD_ARCH,
      JNE32(sandbox2::Syscall::GetHostAuditArch(), DENY),

      LOAD_SYSCALL_NR,
      SYSCALL(__NR_waitid, ALLOW),
      SYSCALL(__NR_exit, ALLOW),
  };
  // getrusage/write are only required when a status pipe was handed over.
  if (pipe_fd.get() >= 0) {
    code.insert(code.end(),
                {SYSCALL(__NR_getrusage, ALLOW), SYSCALL(__NR_write, ALLOW)});
  }
  code.push_back(DENY);

  struct sock_fprog prog {
    .len = static_cast<uint16_t>(code.size()), .filter = code.data(),
  };

  SAPI_RAW_CHECK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == 0,
                 "Denying new privs");
  SAPI_RAW_CHECK(prctl(PR_SET_KEEPCAPS, 0) == 0, "Dropping caps");
  SAPI_RAW_CHECK(
      syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
              reinterpret_cast<uintptr_t>(&prog)) == 0,
      "Enabling seccomp filter");

  // From here on only the allow-listed syscalls above may be issued.
  siginfo_t info;
  // Reap children.
  for (;;) {
    int rv = TEMP_FAILURE_RETRY(waitid(P_ALL, -1, &info, WEXITED | __WALL));
    if (rv != 0) {
      _exit(1);
    }

    // Only the termination of the main process ends the init process; any
    // other reaped child is simply discarded.
    if (info.si_pid == main_pid) {
      if (pipe_fd.get() >= 0) {
        // Best-effort reporting: the results of these writes are not checked.
        write(pipe_fd.get(), &info.si_code, sizeof(info.si_code));
        write(pipe_fd.get(), &info.si_status, sizeof(info.si_status));

        rusage usage{};
        getrusage(RUSAGE_CHILDREN, &usage);
        write(pipe_fd.get(), &usage, sizeof(usage));
      }
      _exit(0);
    }
  }
}

// Announces the calling process' PID to the peer of `signaling_fd`.
// Returns OkStatus() when the single placeholder byte was sent, otherwise a
// status derived from errno.
absl::Status SendPid(int signaling_fd) {
  // Only one placeholder byte travels as regular payload. The PID of the
  // sender (the actual sandboxee process) reaches the receiver as an
  // SCM_CREDENTIALS ancillary message, which gets attached because
  // SO_PASSCRED is set on the socket.
  const char placeholder = ' ';
  const ssize_t sent =
      TEMP_FAILURE_RETRY(send(signaling_fd, &placeholder, 1, 0));
  if (sent == 1) {
    return absl::OkStatus();
  }
  return absl::ErrnoToStatus(errno, "Sending PID: send()");
}

// Receives the one-byte message sent by SendPid() on `signaling_fd` and
// returns the sender's PID taken from the attached SCM_CREDENTIALS ancillary
// message.
//
// Returns an error status if recvmsg() fails, if the peer closed the socket
// before sending anything, or if the expected credentials message is missing
// or malformed.
absl::StatusOr<pid_t> ReceivePid(int signaling_fd) {
  // The union guarantees that the control buffer is suitably aligned for a
  // struct cmsghdr.
  union {
    struct cmsghdr cmh;
    char ctrl[CMSG_SPACE(sizeof(struct ucred))];
  } ucred_msg{};

  struct msghdr msgh {};
  struct iovec iov {};

  msgh.msg_iov = &iov;
  msgh.msg_iovlen = 1;
  msgh.msg_control = ucred_msg.ctrl;
  msgh.msg_controllen = sizeof(ucred_msg);

  char dummy;
  iov.iov_base = &dummy;
  iov.iov_len = sizeof(char);

  const ssize_t received =
      TEMP_FAILURE_RETRY(recvmsg(signaling_fd, &msgh, MSG_WAITALL));
  if (received < 0) {
    return absl::ErrnoToStatus(errno, "Receiving pid failed: recvmsg");
  }
  if (received != 1) {
    // With a one-byte buffer this means the peer closed the socket without
    // sending. errno is not meaningful here, so it must not be used to build
    // the status.
    return absl::InternalError(
        "Receiving pid failed: recvmsg: connection closed by peer");
  }
  // CMSG_FIRSTHDR yields nullptr when no (complete) control message arrived.
  struct cmsghdr* cmsgp = CMSG_FIRSTHDR(&msgh);
  if (cmsgp == nullptr || cmsgp->cmsg_len != CMSG_LEN(sizeof(struct ucred)) ||
      cmsgp->cmsg_level != SOL_SOCKET || cmsgp->cmsg_type != SCM_CREDENTIALS) {
    return absl::InternalError("Receiving pid failed");
  }
  auto* ucredp = reinterpret_cast<struct ucred*>(CMSG_DATA(cmsgp));
  return ucredp->pid;
}

// Scans /proc/<proc_id>/mountinfo and returns the first field (the mount ID)
// of the first line whose fourth space-separated field equals "/".
//
// Returns InternalError if the file cannot be opened and NotFoundError if no
// line matches.
//
// NOTE(review): per proc(5) the fourth mountinfo field is the root of the
// mount within its filesystem, while the fifth is the mount point. Confirm
// which of the two this check is meant to inspect.
absl::StatusOr<std::string> GetRootMountId(const std::string& proc_id) {
  const std::string mountinfo_path =
      absl::StrCat("/proc/", proc_id, "/mountinfo");
  std::ifstream mountinfo(mountinfo_path);
  if (!mountinfo.good()) {
    return absl::InternalError("Failed to open mountinfo");
  }
  for (std::string entry; std::getline(mountinfo, entry);) {
    // At most four splits: fields 0..3 are isolated, the rest of the line
    // stays together in a trailing element.
    const std::vector<absl::string_view> fields =
        absl::StrSplit(entry, absl::MaxSplits(' ', 4));
    if (fields.size() < 4 || fields[3] != "/") {
      continue;
    }
    return std::string(fields[0]);
  }
  return absl::NotFoundError("Root entry not found in mountinfo");
}

bool IsLikelyChrooted() {
  absl::StatusOr<std::string> self_root_id = GetRootMountId("self");
  if (!self_root_id.ok()) {
    return absl::IsNotFound(self_root_id.status());
  }
  absl::StatusOr<std::string> init_root_id = GetRootMountId("1");
  if (!init_root_id.ok()) {
    return false;
  }
  return *self_root_id != *init_root_id;
}

}  // namespace

// Appends the argument and environment vectors for the upcoming execve to
// `args` and `envp`: everything from `request`, plus the fork-server disable
// variable and, if set and non-empty in our own environment, SAPI_VLOG_LEVEL.
void ForkServer::PrepareExecveArgs(const ForkRequest& request,
                                   std::vector<std::string>* args,
                                   std::vector<std::string>* envp) {
  // Arguments and environment variables are taken over from the request
  // unchanged and in order.
  args->insert(args->end(), request.args().begin(), request.args().end());
  envp->insert(envp->end(), request.envs().begin(), request.envs().end());

  // The child process should not start any fork-servers.
  envp->push_back(absl::StrCat(kForkServerDisableEnv, "=1"));

  // Forward our own verbosity setting to the child.
  constexpr char kSapiVlogLevel[] = "SAPI_VLOG_LEVEL";
  if (const char* vlog_level = getenv(kSapiVlogLevel);
      vlog_level != nullptr && vlog_level[0] != '\0') {
    envp->push_back(absl::StrCat(kSapiVlogLevel, "=", vlog_level));
  }

  SAPI_RAW_VLOG(1, "Will execute args:['%s'], environment:['%s']",
                absl::StrJoin(*args, "', '").c_str(),
                absl::StrJoin(*envp, "', '").c_str());
}

// Runs in the freshly forked child and turns it into the sandboxee.
//
// Steps, in order: sanitize FDs, initialize namespaces, drop capabilities,
// optionally fork off the sandboxee from a custom init process (new PID
// namespace only), prepare argv/envp, optionally set up and enable the
// sandbox, and finally exec the target if `execve_fd` is valid.
//
// Parameters:
//   request          - the fork request received from the client.
//   execve_fd        - FD of the binary to execute, or -1 for no execve.
//   uid, gid         - IDs captured by the caller before forking; passed on to
//                      InitializeNamespaces().
//   signaling_fd     - socket used to report the sandboxee PID via SendPid().
//   status_fd        - optional pipe handed to the init process for reporting
//                      the sandboxee's exit status (may hold -1).
//   avoid_pivot_root - forwarded to InitializeNamespaces().
void ForkServer::LaunchChild(const ForkRequest& request, int execve_fd,
                             uid_t uid, gid_t gid, FDCloser signaling_fd,
                             FDCloser status_fd, bool avoid_pivot_root) const {
  SAPI_RAW_CHECK(request.mode() != FORKSERVER_FORK_UNSPECIFIED,
                 "Forkserver mode is unspecified");

  const bool will_execve = execve_fd != -1;
  const bool should_sandbox = request.mode() == FORKSERVER_FORK_EXECVE_SANDBOX;

  // Snapshot the open FDs now; the init process (if any) closes them later
  // without needing /proc. A failure here degrades to an empty set.
  absl::StatusOr<absl::flat_hash_set<int>> open_fds = sanitizer::GetListOfFDs();
  if (!open_fds.ok()) {
    SAPI_RAW_LOG(WARNING, "Could not get list of current open FDs: %s",
                 std::string(open_fds.status().message()).c_str());
    open_fds = absl::flat_hash_set<int>();
  }
  SanitizeEnvironment();

  InitializeNamespaces(request, uid, gid, avoid_pivot_root);

  // cap_init() yields an empty capability set; applying it drops all
  // capabilities of this process.
  auto caps = cap_init();
  SAPI_RAW_CHECK(cap_set_proc(caps) == 0, "while dropping capabilities");
  cap_free(caps);

  // A custom init process is only needed if a new PID NS is created.
  if (request.clone_flags() & CLONE_NEWPID) {
    // Spawn a child process
    pid_t child = util::ForkWithFlags(SIGCHLD);
    if (child < 0) {
      SAPI_RAW_PLOG(FATAL, "Could not spawn init process");
    }
    if (child != 0) {
      // This is the parent of the fork above: it becomes the init process.
      // Keep the status pipe open, as RunInitProcess() writes to it.
      if (status_fd.get() >= 0) {
        open_fds->erase(status_fd.get());
      }
      // Close all open fds (equals to CloseAllFDsExcept but does not require
      // /proc to be available).
      for (const auto& fd : *open_fds) {
        close(fd);
      }
      // Does not return.
      RunInitProcess(child, std::move(status_fd));
    }
    // Only the forked child (the actual sandboxee) gets here.
    // Send sandboxee pid
    auto status = SendPid(signaling_fd.get());
    SAPI_RAW_CHECK(status.ok(),
                   absl::StrCat("sending pid: ", status.message()).c_str());
  }
  // Neither descriptor is needed by the sandboxee from this point on.
  signaling_fd.Close();
  status_fd.Close();

  Client c(comms_);

  // Prepare the arguments before sandboxing (if needed), as doing it after
  // sandboxing can cause syscall violations (e.g. related to memory
  // management).
  std::vector<std::string> args;
  std::vector<std::string> envs;
  if (will_execve) {
    PrepareExecveArgs(request, &args, &envs);
  }

  // Sandboxing can be enabled either here - just before execve, or somewhere
  // inside the executed binary (e.g. after basic structures have been
  // initialized, and resources acquired). In the latter case, it's up to the
  // sandboxed binary to establish proper Comms channel (using
  // Comms::kSandbox2ClientCommsFD) and call sandbox2::Client::SandboxMeHere()
  if (should_sandbox) {
    // The following client calls are basically SandboxMeHere. We split it so
    // that we can set up the envp after we received the file descriptors but
    // before we enable the syscall filter.
    c.PrepareEnvironment(&execve_fd);
    // Tell the sandboxee where to find the comms channel if it does not live
    // at the default FD number.
    if (comms_->GetConnectionFD() != Comms::kSandbox2ClientCommsFD) {
      envs.push_back(absl::StrCat(Comms::kSandbox2CommsFDEnvVar, "=",
                                  comms_->GetConnectionFD()));
    }
    envs.push_back(c.GetFdMapEnvVar());
  }

  // Convert args and envs before enabling sandbox (it'll allocate which might
  // be blocked).
  util::CharPtrArray argv = util::CharPtrArray::FromStringVector(args);
  util::CharPtrArray envp = util::CharPtrArray::FromStringVector(envs);

  if (should_sandbox) {
    c.EnableSandbox();
  }

  if (will_execve) {
    ExecuteProcess(execve_fd, argv.data(), envp.data());
  }
}

// Serves a single fork request read from the fork server's comms channel.
//
// Receives the ForkRequest, the sandboxee's comms FD and (for the execve
// modes) the FD of the binary to execute, then forks the sandboxee.
//
// In the forked child: rearranges the FDs, replaces the comms channel and
// calls LaunchChild(); returns 0 if LaunchChild() returns.
//
// In the fork server itself: sends the init PID (0 if there is no separate
// init process, -1 on failure) and the sandboxee PID (-1 on failure) back to
// the client, followed by the read end of the status pipe if one was created.
// Returns the sandboxee PID, or -1 if the comms channel was terminated before
// a request could be received.
pid_t ForkServer::ServeRequest() {
  ForkRequest fork_request;
  if (!comms_->RecvProtoBuf(&fork_request)) {
    if (comms_->IsTerminated()) {
      return -1;
    }
    SAPI_RAW_LOG(FATAL, "Failed to receive ForkServer request");
  }
  int comms_fd;
  SAPI_RAW_CHECK(comms_->RecvFD(&comms_fd), "Failed to receive Comms FD");

  SAPI_RAW_CHECK(fork_request.mode() != FORKSERVER_FORK_UNSPECIFIED,
                 "Forkserver mode is unspecified");

  // Only the execve modes come with an FD of the binary to execute.
  int exec_fd = -1;
  if (fork_request.mode() == FORKSERVER_FORK_EXECVE ||
      fork_request.mode() == FORKSERVER_FORK_EXECVE_SANDBOX) {
    SAPI_RAW_CHECK(comms_->RecvFD(&exec_fd), "Failed to receive Exec FD");
  }

  // Make the kernel notify us with SIGCHLD when the process terminates.
  // We use sigaction(SIGCHLD, flags=SA_NOCLDWAIT) in combination with
  // this to make sure the zombie process is reaped immediately.
  int clone_flags = fork_request.clone_flags() | SIGCHLD;

  // Store uid and gid since they will change if CLONE_NEWUSER is set.
  uid_t uid = getuid();
  gid_t gid = getgid();

  // Status pipe; only created for the unotify monitor. Both ends stay at -1
  // otherwise.
  FDCloser pipe_fds[2];
  {
    int pfds[2] = {-1, -1};
    if (fork_request.monitor_type() == FORKSERVER_MONITOR_UNOTIFY) {
      SAPI_RAW_PCHECK(pipe(pfds) == 0, "creating status pipe");
    }
    pipe_fds[0] = FDCloser(pfds[0]);
    pipe_fds[1] = FDCloser(pfds[1]);
  }

  // Socket pair used by the child side to report PIDs via SCM_CREDENTIALS;
  // SO_PASSCRED is enabled on both ends.
  int socketpair_fds[2];
  SAPI_RAW_PCHECK(
      socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, socketpair_fds) == 0,
      "creating signaling socketpair");
  for (int i = 0; i < 2; i++) {
    int val = 1;
    SAPI_RAW_PCHECK(setsockopt(socketpair_fds[i], SOL_SOCKET, SO_PASSCRED, &val,
                               sizeof(val)) == 0,
                    "setsockopt failed");
  }

  FDCloser signaling_fds[] = {FDCloser(socketpair_fds[0]),
                              FDCloser(socketpair_fds[1])};

  // Note: init_pid will be overwritten with the actual init pid if the init
  //       process was started or stays at 0 if that is not needed - no pidns.
  pid_t init_pid = 0;
  pid_t sandboxee_pid = -1;
  bool avoid_pivot_root = clone_flags & (CLONE_NEWUSER | CLONE_NEWNS);
  if (avoid_pivot_root) {
    // Create initial namespaces only when they're first needed.
    // This allows sandbox2 to be still used without any namespaces support
    if (initial_mntns_fd_ == -1) {
      CreateInitialNamespaces();
    }
    // We first just fork a child, which will join the initial namespaces
    // Note: Not a regular fork() as one really needs to be single-threaded to
    //       setns and this is not the case with TSAN.
    pid_t pid = util::ForkWithFlags(SIGCHLD);
    SAPI_RAW_PCHECK(pid != -1, "fork failed");
    if (pid == 0) {
      SAPI_RAW_PCHECK(setns(initial_userns_fd_, CLONE_NEWUSER) != -1,
                      "joining initial user namespace");
      SAPI_RAW_PCHECK(setns(initial_mntns_fd_, CLONE_NEWNS) != -1,
                      "joining initial mnt namespace");
      close(initial_userns_fd_);
      close(initial_mntns_fd_);
      // Do not create new userns it will be unshared later
      sandboxee_pid =
          util::ForkWithFlags((clone_flags & ~CLONE_NEWUSER) | CLONE_PARENT);
      if (sandboxee_pid == -1) {
        SAPI_RAW_LOG(ERROR, "util::ForkWithFlags(%x)", clone_flags);
      }
      // The intermediate process has done its job (or failed) and exits; only
      // the newly forked sandboxee continues below.
      if (sandboxee_pid != 0) {
        _exit(0);
      }
      // Send sandboxee pid
      absl::Status status = SendPid(signaling_fds[1].get());
      SAPI_RAW_CHECK(status.ok(),
                     absl::StrCat("sending pid: ", status.message()).c_str());
    }
  } else {
    sandboxee_pid = util::ForkWithFlags(clone_flags);
    if (sandboxee_pid == -1) {
      SAPI_RAW_LOG(ERROR, "util::ForkWithFlags(%x)", clone_flags);
    }
    if (sandboxee_pid == 0) {
      close(initial_userns_fd_);
      close(initial_mntns_fd_);
    }
  }

  // Child.
  if (sandboxee_pid == 0) {
    // Drop the fork server's ends of the signaling socket and status pipe.
    signaling_fds[0].Close();
    pipe_fds[0].Close();
    // Make sure we override the forkserver's comms fd
    comms_->Terminate();
    if (exec_fd != -1) {
      // Move the exec and comms FDs to their well-known numbers while keeping
      // the signaling socket and status pipe alive (they may get renumbered).
      int signaling_fd = signaling_fds[1].Release();
      int pipe_fd = pipe_fds[1].Release();
      MoveFDs({{&exec_fd, Comms::kSandbox2TargetExecFD},
               {&comms_fd, Comms::kSandbox2ClientCommsFD}},
              {&signaling_fd, &pipe_fd});
      signaling_fds[1] = FDCloser(signaling_fd);
      pipe_fds[1] = FDCloser(pipe_fd);
    }
    *comms_ = Comms(comms_fd);
    LaunchChild(fork_request, exec_fd, uid, gid, std::move(signaling_fds[1]),
                std::move(pipe_fds[1]), avoid_pivot_root);
    return sandboxee_pid;
  }

  // Close our copy of the child's end so that ReceivePid() sees EOF instead
  // of blocking if the child side goes away without sending.
  signaling_fds[1].Close();

  if (avoid_pivot_root) {
    // The sandboxee was forked by the intermediate process, so its PID has to
    // be received over the signaling socket.
    if (auto pid = ReceivePid(signaling_fds[0].get()); !pid.ok()) {
      SAPI_RAW_LOG(ERROR, "%s", std::string(pid.status().message()).c_str());
    } else {
      sandboxee_pid = pid.value();
    }
  }

  if (fork_request.clone_flags() & CLONE_NEWPID) {
    // The pid of the init process is equal to the child process that we've
    // previously forked.
    init_pid = sandboxee_pid;
    sandboxee_pid = -1;
    // And the actual sandboxee is forked from the init process, so we need to
    // receive the actual PID.
    if (auto pid_or = ReceivePid(signaling_fds[0].get()); !pid_or.ok()) {
      SAPI_RAW_LOG(ERROR, "%s", std::string(pid_or.status().message()).c_str());
      if (init_pid != -1) {
        kill(init_pid, SIGKILL);
      }
      init_pid = -1;
    } else {
      sandboxee_pid = pid_or.value();
    }
  }

  // Parent.
  // The FDs handed to the child are no longer needed in the fork server.
  pipe_fds[1].Close();
  close(comms_fd);
  if (exec_fd >= 0) {
    close(exec_fd);
  }
  SAPI_RAW_CHECK(comms_->SendInt32(init_pid),
                 absl::StrCat("Failed to send init PID: ", init_pid).c_str());
  SAPI_RAW_CHECK(
      comms_->SendInt32(sandboxee_pid),
      absl::StrCat("Failed to send sandboxee PID: ", sandboxee_pid).c_str());

  // Hand the read end of the status pipe to the client, if one was created.
  if (pipe_fds[0].get() >= 0) {
    SAPI_RAW_CHECK(comms_->SendFD(pipe_fds[0].get()),
                   "Failed to send status pipe");
  }
  return sandboxee_pid;
}

// Reports whether the fork server's comms channel has been terminated.
bool ForkServer::IsTerminated() const {
  const bool comms_terminated = comms_->IsTerminated();
  return comms_terminated;
}

bool ForkServer::Initialize() {
  // For safety drop as many capabilities as possible.
  // Note that cap_t is actually a pointer.
  cap_t have_caps = cap_get_proc();  // caps we currently have
  SAPI_RAW_CHECK(have_caps, "failed to cap_get_proc()");
  cap_t wanted_caps = cap_init();  // starts as empty set, ie. no caps
  SAPI_RAW_CHECK(wanted_caps, "failed to cap_init()");

  for (cap_flag_t flag : {CAP_EFFECTIVE, CAP_PERMITTED}) {
    cap_flag_value_t value;
    int rc = cap_get_flag(have_caps, CAP_SETFCAP, flag, &value);
    SAPI_RAW_CHECK(!rc, "cap_get_flag");
    if (value == CAP_SET) {
      cap_value_t caps_to_set[1] = {
          CAP_SETFCAP,
      };
      rc = cap_set_flag(wanted_caps, flag, 1, caps_to_set, CAP_SET);
      SAPI_RAW_CHECK(!rc, "cap_set_flag");
    }
  }

  SAPI_RAW_CHECK(!cap_set_proc(wanted_caps), "while dropping capabilities");
  SAPI_RAW_CHECK(!cap_free(wanted_caps), "while freeing wanted_caps");
  SAPI_RAW_CHECK(!cap_free(have_caps), "while freeing have_caps");

  // All processes spawned by the fork'd/execute'd process will see this process
  // as /sbin/init. Therefore it will receive (and ignore) their final status
  // (see the next comment as well). PR_SET_CHILD_SUBREAPER is available since
  // kernel version 3.4, so don't panic if it fails.
  if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) == -1) {
    SAPI_RAW_VLOG(3, "prctl(PR_SET_CHILD_SUBREAPER, 1): %s [%d]",
                  StrError(errno).c_str(), errno);
  }

  // Don't convert terminated child processes into zombies. It's up to the
  // sandbox (Monitor) to track them and receive/report their final status.
  struct sigaction sa;
  sa.sa_handler = SIG_DFL;
  sa.sa_flags = SA_NOCLDWAIT;
  sigemptyset(&sa.sa_mask);
  if (sigaction(SIGCHLD, &sa, nullptr) == -1) {
    SAPI_RAW_PLOG(ERROR, "sigaction(SIGCHLD, flags=SA_NOCLDWAIT)");
    return false;
  }
  return true;
}

// Creates the initial user and mount namespaces and stores FDs referring to
// them in initial_userns_fd_ and initial_mntns_fd_.
//
// A helper process is forked into new user and mount namespaces. Two eventfds
// sequence the handshake: the helper signals `create_efd` once the namespaces
// are initialized, the fork server then opens the helper's /proc/<pid>/ns
// entries and signals `open_efd`, after which the helper may exit. All
// failures are fatal.
void ForkServer::CreateInitialNamespaces() {
  // Spawn a new process to create initial user and mount namespaces to be used
  // as a base for each namespaced sandboxee.

  // Store uid and gid to create mappings after CLONE_NEWUSER
  uid_t uid = getuid();
  gid_t gid = getgid();

  // Eventfds to synchronize so that we open ns fds before process dies
  FDCloser create_efd(eventfd(0, EFD_CLOEXEC));
  SAPI_RAW_PCHECK(create_efd.get() != -1, "creating eventfd");
  FDCloser open_efd(eventfd(0, EFD_CLOEXEC));
  SAPI_RAW_PCHECK(open_efd.get() != -1, "creating eventfd");
  pid_t pid = util::ForkWithFlags(CLONE_NEWUSER | CLONE_NEWNS | SIGCHLD);
  // Give a more helpful diagnostic for the EPERM-while-chrooted case before
  // the generic check below fires.
  if (pid == -1 && errno == EPERM && IsLikelyChrooted()) {
    SAPI_RAW_LOG(FATAL,
                 "failed to fork initial namespaces process: parent process is "
                 "likely chrooted");
  }
  SAPI_RAW_PCHECK(pid != -1, "failed to fork initial namespaces process");
  uint64_t value = 1;
  if (pid == 0) {
    // Helper process: set up the namespaces, announce that they are ready,
    // then wait until the fork server has opened them.
    Namespace::InitializeInitialNamespaces(uid, gid);
    SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(write(create_efd.get(), &value,
                                             sizeof(value))) == sizeof(value),
                    "synchronizing initial namespaces creation");
    SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(read(open_efd.get(), &value,
                                            sizeof(value))) == sizeof(value),
                    "synchronizing initial namespaces creation");
    SAPI_RAW_PCHECK(chroot("/realroot") == 0,
                    "chrooting prior to dumping coverage");
    util::DumpCoverageData();
    _exit(0);
  }
  // Fork server: wait for the helper to finish setting up the namespaces.
  SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(read(create_efd.get(), &value,
                                          sizeof(value))) == sizeof(value),
                  "synchronizing initial namespaces creation");
  initial_userns_fd_ = open(absl::StrCat("/proc/", pid, "/ns/user").c_str(),
                            O_RDONLY | O_CLOEXEC);
  SAPI_RAW_PCHECK(initial_userns_fd_ != -1, "getting initial userns fd");
  initial_mntns_fd_ = open(absl::StrCat("/proc/", pid, "/ns/mnt").c_str(),
                           O_RDONLY | O_CLOEXEC);
  SAPI_RAW_PCHECK(initial_mntns_fd_ != -1, "getting initial mntns fd");
  // The namespace FDs are open now, so the helper is released.
  SAPI_RAW_PCHECK(TEMP_FAILURE_RETRY(write(open_efd.get(), &value,
                                           sizeof(value))) == sizeof(value),
                  "synchronizing initial namespaces creation");
}

void ForkServer::SanitizeEnvironment() const {
  // Mark all file descriptors, except the standard ones (needed
  // for proper sandboxed process operations), as close-on-exec.
  absl::Status status = sanitizer::SanitizeCurrentProcess(
      {STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO, comms_->GetConnectionFD()},
      /* close_fds = */ false);
  SAPI_RAW_CHECK(
      status.ok(),
      absl::StrCat("while sanitizing process: ", status.message()).c_str());
}

// Replaces the current process image with the binary referred to by
// `execve_fd`, using `argv` and `envp`. If the exec fails, the error is
// logged (with a hint for ENOSYS and ENOENT) and the process exits with
// EXIT_FAILURE.
void ForkServer::ExecuteProcess(int execve_fd, const char* const* argv,
                                const char* const* envp) {
  // Nothing may precede the exec call below, as any such code would already
  // be subject to seccomp policies. The 4th, 5th and 6th syscall arguments
  // carry magic values that mark this as a special execve().
  util::Execveat(execve_fd, "", argv, envp, AT_EMPTY_PATH,
                 internal::kExecveMagic);

  // Still running, so the exec failed. Capture errno before logging.
  const int exec_errno = errno;
  SAPI_RAW_PLOG(ERROR, "execveat failed");
  if (argv[0] != nullptr) {
    SAPI_RAW_LOG(ERROR, "argv[0]=%s", argv[0]);
  }

  switch (exec_errno) {
    case ENOSYS:
      SAPI_RAW_LOG(
          ERROR,
          "This is likely caused by running on a kernel that is too old.");
      break;
    case ENOENT:
      if (execve_fd >= 0) {
        // The file itself is known to exist, so what is missing must be the
        // ELF interpreter of a dynamically linked binary.
        SAPI_RAW_LOG(ERROR,
                     "This is likely caused by running dynamically-linked "
                     "sandboxee without "
                     "calling .AddLibrariesForBinary() on the policy builder.");
      }
      break;
    default:
      break;
  }

  util::Syscall(__NR_exit_group, EXIT_FAILURE);
  abort();
}

// Sets up namespaces for the sandboxee as described by `request`. Requests
// that carry no mount tree are left untouched.
void ForkServer::InitializeNamespaces(const ForkRequest& request, uid_t uid,
                                      gid_t gid, bool avoid_pivot_root) {
  if (request.has_mount_tree()) {
    Namespace::InitializeNamespaces(
        uid, gid, request.clone_flags(), Mounts(request.mount_tree()),
        request.hostname(), avoid_pivot_root,
        request.allow_mount_propagation());
  }
}

}  // namespace sandbox2