diff options
author | Ryan Savitski <rsavitski@google.com> | 2023-03-21 15:27:52 +0000 |
---|---|---|
committer | Ryan Savitski <rsavitski@google.com> | 2023-03-22 14:03:07 +0000 |
commit | 47141cd40d57f90b42ed0548ff007a11d542c97c (patch) | |
tree | 77b0fc161864f1cce3ad4d4cab128111c30b3485 | |
parent | 638a4b25725353062fb3769f962175a4d3d87abc (diff) | |
download | perfetto-47141cd40d57f90b42ed0548ff007a11d542c97c.tar.gz |
service: producer port: reduce blocking socket send timeout 10s -> 10ms
On Android, with more apps acting as perfetto producers and use of
freezer, the service is more prone to be connected to unresponsive
producers. This is a problem if we ever saturate the kernel send buffer
on a given socket since it causes the tracing service to block for 10s.
As an immediate workaround, drastically lower the producer port timeouts
to 50ms as we're not supposed to be hitting that limit under normal
operation in the first place.
Longer term, we need to reduce our reliance on blocking socket
operations, which were always known to be sub-optimal, but also worked
well enough in practice while being less complex than nonblocking
approaches.
Tested manually, strace as proof (note that we're using a MSG_DONTWAIT sendmsg + manual
poll fallback due to b/193234818):
fcntl(8, F_GETFL) = 0x802 (flags O_RDWR|O_NONBLOCK)
fcntl(8, F_SETFL, O_RDWR) = 0
fcntl(8, F_GETFL) = 0x2 (flags O_RDWR)
sendmsg(8, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="\24\0\0\0\20\0032\215\200\200\0\10\1\20\1\32\7:\202\200\200\0\10\3", iov_len=24}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, MSG_DONTWAIT|MSG_NOSIGNAL) = -1 EAGAIN (Resource temporarily unavailable)
poll([{fd=8, events=POLLOUT}], 1, 10) = 0 (Timeout)
fcntl(8, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(8, F_SETFL, O_RDWR|O_NONBLOCK) = 0
shutdown(8, SHUT_RDWR) = 0
close(8) = 0
Bug: 236813972
Merged-In: I75489478cae5a50809ce0b2a0cabce8424b46820
Change-Id: I75489478cae5a50809ce0b2a0cabce8424b46820
(cherry picked from commit 8cfee469c0b3aa5e70576c4d2888e07e72651f35)
-rw-r--r-- | include/perfetto/ext/ipc/host.h | 3 | ||||
-rw-r--r-- | src/ipc/host_impl.cc | 9 | ||||
-rw-r--r-- | src/ipc/host_impl.h | 4 | ||||
-rw-r--r-- | src/tracing/ipc/service/service_ipc_host_impl.cc | 14 |
4 files changed, 28 insertions, 2 deletions
diff --git a/include/perfetto/ext/ipc/host.h b/include/perfetto/ext/ipc/host.h index a9b5824b9..0f8533ee3 100644 --- a/include/perfetto/ext/ipc/host.h +++ b/include/perfetto/ext/ipc/host.h @@ -59,6 +59,9 @@ class Host { // case of errors (e.g., another service with the same name is already // registered). virtual bool ExposeService(std::unique_ptr<Service>) = 0; + + // Overrides the default send timeout for the per-connection sockets. + virtual void SetSocketSendTimeoutMs(uint32_t timeout_ms) = 0; }; } // namespace ipc diff --git a/src/ipc/host_impl.cc b/src/ipc/host_impl.cc index c182cd17e..1bf234835 100644 --- a/src/ipc/host_impl.cc +++ b/src/ipc/host_impl.cc @@ -116,6 +116,12 @@ bool HostImpl::ExposeService(std::unique_ptr<Service> service) { return true; } +void HostImpl::SetSocketSendTimeoutMs(uint32_t timeout_ms) { + PERFETTO_DCHECK_THREAD(thread_checker_); + // Should be less than the watchdog period (30s). + socket_tx_timeout_ms_ = timeout_ms; +} + void HostImpl::OnNewIncomingConnection( base::UnixSocket*, std::unique_ptr<base::UnixSocket> new_conn) { @@ -125,8 +131,7 @@ void HostImpl::OnNewIncomingConnection( clients_by_socket_[new_conn.get()] = client.get(); client->id = client_id; client->sock = std::move(new_conn); - // Watchdog is 30 seconds, so set the socket timeout to 10 seconds. - client->sock->SetTxTimeout(10000); + client->sock->SetTxTimeout(socket_tx_timeout_ms_); clients_[client_id] = std::move(client); } diff --git a/src/ipc/host_impl.h b/src/ipc/host_impl.h index ac0a59319..de7d07e29 100644 --- a/src/ipc/host_impl.h +++ b/src/ipc/host_impl.h @@ -32,6 +32,8 @@ namespace perfetto { namespace ipc { +constexpr uint32_t kDefaultIpcTxTimeoutMs = 10000; + class HostImpl : public Host, public base::UnixSocket::EventListener { public: HostImpl(const char* socket_name, base::TaskRunner*); @@ -40,6 +42,7 @@ class HostImpl : public Host, public base::UnixSocket::EventListener { // Host implementation. bool ExposeService(std::unique_ptr<Service>) override; + void SetSocketSendTimeoutMs(uint32_t timeout_ms) override; // base::UnixSocket::EventListener implementation. void OnNewIncomingConnection(base::UnixSocket*, @@ -88,6 +91,7 @@ class HostImpl : public Host, public base::UnixSocket::EventListener { std::map<base::UnixSocket*, ClientConnection*> clients_by_socket_; ServiceID last_service_id_ = 0; ClientID last_client_id_ = 0; + uint32_t socket_tx_timeout_ms_ = kDefaultIpcTxTimeoutMs; PERFETTO_THREAD_CHECKER(thread_checker_) base::WeakPtrFactory<HostImpl> weak_ptr_factory_; // Keep last. }; diff --git a/src/tracing/ipc/service/service_ipc_host_impl.cc b/src/tracing/ipc/service/service_ipc_host_impl.cc index 5618b3cb3..3598bf59e 100644 --- a/src/tracing/ipc/service/service_ipc_host_impl.cc +++ b/src/tracing/ipc/service/service_ipc_host_impl.cc @@ -31,6 +31,10 @@ namespace perfetto { +namespace { +constexpr uint32_t kProducerSocketTxTimeoutMs = 10; +} + // TODO(fmayer): implement per-uid connection limit (b/69093705). // Implements the publicly exposed factory method declared in @@ -85,6 +89,16 @@ bool ServiceIPCHostImpl::DoStart() { return false; } + // Lower the timeout for blocking socket sends to producers as we shouldn't + // normally exhaust the kernel send buffer unless the producer is + // unresponsive. We'll drop the connection if the timeout is hit (see + // UnixSocket::Send). Context in b/236813972, b/193234818. + // Consumer port continues using the default timeout (10s) as there are + // generally fewer consumer processes, and they're better behaved. Also the + // consumer port ipcs might exhaust the send buffer under normal operation + // due to large messages such as ReadBuffersResponse. + producer_ipc_port_->SetSocketSendTimeoutMs(kProducerSocketTxTimeoutMs); + // TODO(fmayer): add a test that destroyes the ServiceIPCHostImpl soon after // Start() and checks that no spurious callbacks are issued. bool producer_service_exposed = producer_ipc_port_->ExposeService( |