aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRyan Savitski <rsavitski@google.com>2023-03-21 15:27:52 +0000
committerRyan Savitski <rsavitski@google.com>2023-03-22 14:03:07 +0000
commit47141cd40d57f90b42ed0548ff007a11d542c97c (patch)
tree77b0fc161864f1cce3ad4d4cab128111c30b3485
parent638a4b25725353062fb3769f962175a4d3d87abc (diff)
downloadperfetto-47141cd40d57f90b42ed0548ff007a11d542c97c.tar.gz
service: producer port: reduce blocking socket send timeout 10s -> 10ms
On Android, with more apps acting as perfetto producers and use of freezer, the service is more prone to be connected to unresponsive producers. This is a problem if we ever saturate the kernel send buffer on a given socket since it causes the tracing service to block for 10s. As an immediate workaround, drastically lower the producer port timeouts to 50ms as we're not supposed to be hitting that limit under normal operation in the first place. Longer term, we need to reduce our reliance on blocking socket operations, which were always known to be sub-optimal, but also worked well enough in practice while being less complex than nonblocking approaches. Tested manually, strace as proof (note that we're using a MSG_DONTWAIT sendmsg + manual poll fallback due to b/193234818): fcntl(8, F_GETFL) = 0x802 (flags O_RDWR|O_NONBLOCK) fcntl(8, F_SETFL, O_RDWR) = 0 fcntl(8, F_GETFL) = 0x2 (flags O_RDWR) sendmsg(8, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="\24\0\0\0\20\0032\215\200\200\0\10\1\20\1\32\7:\202\200\200\0\10\3", iov_len=24}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, MSG_DONTWAIT|MSG_NOSIGNAL) = -1 EAGAIN (Resource temporarily unavailable) poll([{fd=8, events=POLLOUT}], 1, 10) = 0 (Timeout) fcntl(8, F_GETFL) = 0x2 (flags O_RDWR) fcntl(8, F_SETFL, O_RDWR|O_NONBLOCK) = 0 shutdown(8, SHUT_RDWR) = 0 close(8) = 0 Bug: 236813972 Merged-In: I75489478cae5a50809ce0b2a0cabce8424b46820 Change-Id: I75489478cae5a50809ce0b2a0cabce8424b46820 (cherry picked from commit 8cfee469c0b3aa5e70576c4d2888e07e72651f35)
-rw-r--r--include/perfetto/ext/ipc/host.h3
-rw-r--r--src/ipc/host_impl.cc9
-rw-r--r--src/ipc/host_impl.h4
-rw-r--r--src/tracing/ipc/service/service_ipc_host_impl.cc14
4 files changed, 28 insertions, 2 deletions
diff --git a/include/perfetto/ext/ipc/host.h b/include/perfetto/ext/ipc/host.h
index a9b5824b9..0f8533ee3 100644
--- a/include/perfetto/ext/ipc/host.h
+++ b/include/perfetto/ext/ipc/host.h
@@ -59,6 +59,9 @@ class Host {
// case of errors (e.g., another service with the same name is already
// registered).
virtual bool ExposeService(std::unique_ptr<Service>) = 0;
+
+ // Overrides the default send timeout for the per-connection sockets.
+ virtual void SetSocketSendTimeoutMs(uint32_t timeout_ms) = 0;
};
} // namespace ipc
diff --git a/src/ipc/host_impl.cc b/src/ipc/host_impl.cc
index c182cd17e..1bf234835 100644
--- a/src/ipc/host_impl.cc
+++ b/src/ipc/host_impl.cc
@@ -116,6 +116,12 @@ bool HostImpl::ExposeService(std::unique_ptr<Service> service) {
return true;
}
+void HostImpl::SetSocketSendTimeoutMs(uint32_t timeout_ms) {
+ PERFETTO_DCHECK_THREAD(thread_checker_);
+ // Should be less than the watchdog period (30s).
+ socket_tx_timeout_ms_ = timeout_ms;
+}
+
void HostImpl::OnNewIncomingConnection(
base::UnixSocket*,
std::unique_ptr<base::UnixSocket> new_conn) {
@@ -125,8 +131,7 @@ void HostImpl::OnNewIncomingConnection(
clients_by_socket_[new_conn.get()] = client.get();
client->id = client_id;
client->sock = std::move(new_conn);
- // Watchdog is 30 seconds, so set the socket timeout to 10 seconds.
- client->sock->SetTxTimeout(10000);
+ client->sock->SetTxTimeout(socket_tx_timeout_ms_);
clients_[client_id] = std::move(client);
}
diff --git a/src/ipc/host_impl.h b/src/ipc/host_impl.h
index ac0a59319..de7d07e29 100644
--- a/src/ipc/host_impl.h
+++ b/src/ipc/host_impl.h
@@ -32,6 +32,8 @@
namespace perfetto {
namespace ipc {
+constexpr uint32_t kDefaultIpcTxTimeoutMs = 10000;
+
class HostImpl : public Host, public base::UnixSocket::EventListener {
public:
HostImpl(const char* socket_name, base::TaskRunner*);
@@ -40,6 +42,7 @@ class HostImpl : public Host, public base::UnixSocket::EventListener {
// Host implementation.
bool ExposeService(std::unique_ptr<Service>) override;
+ void SetSocketSendTimeoutMs(uint32_t timeout_ms) override;
// base::UnixSocket::EventListener implementation.
void OnNewIncomingConnection(base::UnixSocket*,
@@ -88,6 +91,7 @@ class HostImpl : public Host, public base::UnixSocket::EventListener {
std::map<base::UnixSocket*, ClientConnection*> clients_by_socket_;
ServiceID last_service_id_ = 0;
ClientID last_client_id_ = 0;
+ uint32_t socket_tx_timeout_ms_ = kDefaultIpcTxTimeoutMs;
PERFETTO_THREAD_CHECKER(thread_checker_)
base::WeakPtrFactory<HostImpl> weak_ptr_factory_; // Keep last.
};
diff --git a/src/tracing/ipc/service/service_ipc_host_impl.cc b/src/tracing/ipc/service/service_ipc_host_impl.cc
index 5618b3cb3..3598bf59e 100644
--- a/src/tracing/ipc/service/service_ipc_host_impl.cc
+++ b/src/tracing/ipc/service/service_ipc_host_impl.cc
@@ -31,6 +31,10 @@
namespace perfetto {
+namespace {
+constexpr uint32_t kProducerSocketTxTimeoutMs = 10;
+}
+
// TODO(fmayer): implement per-uid connection limit (b/69093705).
// Implements the publicly exposed factory method declared in
@@ -85,6 +89,16 @@ bool ServiceIPCHostImpl::DoStart() {
return false;
}
+ // Lower the timeout for blocking socket sends to producers as we shouldn't
+ // normally exhaust the kernel send buffer unless the producer is
+ // unresponsive. We'll drop the connection if the timeout is hit (see
+ // UnixSocket::Send). Context in b/236813972, b/193234818.
+ // Consumer port continues using the default timeout (10s) as there are
+ // generally fewer consumer processes, and they're better behaved. Also the
+ // consumer port ipcs might exhaust the send buffer under normal operation
+ // due to large messages such as ReadBuffersResponse.
+ producer_ipc_port_->SetSocketSendTimeoutMs(kProducerSocketTxTimeoutMs);
+
// TODO(fmayer): add a test that destroyes the ServiceIPCHostImpl soon after
// Start() and checks that no spurious callbacks are issued.
bool producer_service_exposed = producer_ipc_port_->ExposeService(