hyperactor/
metrics.rs

1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
9//! Hyperactor metrics.
10//!
11//! This module contains metrics definitions for various components of hyperactor.
12
13use hyperactor_telemetry::declare_static_counter;
14use hyperactor_telemetry::declare_static_histogram;
15use hyperactor_telemetry::declare_static_timer;
16use hyperactor_telemetry::declare_static_up_down_counter;
17
/// Error types for channel-related errors. Only used for telemetry.
///
/// This is a plain, field-less tag. Besides being `Copy`, it can be compared
/// and hashed, so it works directly in assertions and as a map or set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ChannelErrorType {
    /// Error occurred while sending a message.
    SendError,
    /// Error occurred while connecting to a channel.
    ConnectionError,
    /// Error occurred while deframing a message.
    DeframeError,
    /// Error occurred while deserializing a message.
    DeserializeError,
}
30
31impl ChannelErrorType {
32    /// Returns the string representation of the error type.
33    pub fn as_str(&self) -> &'static str {
34        match self {
35            ChannelErrorType::SendError => "send_error",
36            ChannelErrorType::ConnectionError => "connection_error",
37            ChannelErrorType::DeframeError => "deframe_error",
38            ChannelErrorType::DeserializeError => "deserialize_error",
39        }
40    }
41}
42
43// MAILBOX
44// Tracks messages that couldn't be delivered to their destination and were returned as undeliverable
45declare_static_counter!(
46    MAILBOX_UNDELIVERABLE_MESSAGES,
47    "mailbox.undeliverable_messages"
48);
49// Tracks the number of messages that were posted.
50hyperactor_telemetry::declare_static_counter!(MAILBOX_POSTS, "mailbox.posts");
51
52// ACTOR
53// Tracks the current size of the message queue for actors (increases when messages are queued, decreases when processed)
54declare_static_up_down_counter!(ACTOR_MESSAGE_QUEUE_SIZE, "actor.message_queue_size");
55// Tracks the total number of messages sent by actors
56declare_static_counter!(ACTOR_MESSAGES_SENT, "actor.messages_sent");
57// Tracks the total number of messages received by actors
58declare_static_counter!(ACTOR_MESSAGES_RECEIVED, "actor.messages_received");
59// Tracks errors that occur when receiving messages
60declare_static_counter!(ACTOR_MESSAGE_RECEIVE_ERRORS, "actor.message_receive_errors");
61// Measures the time taken to handle messages by actors
62declare_static_timer!(
63    ACTOR_MESSAGE_HANDLER_DURATION,
64    "actor.message_handler_duration",
65    hyperactor_telemetry::TimeUnit::Nanos
66);
67
68// CHANNEL
69declare_static_histogram!(REMOTE_MESSAGE_SEND_SIZE, "channel.remote_message_send_size");
70// Tracks the number of new channel connections established (client and server)
71declare_static_counter!(CHANNEL_CONNECTIONS, "channel.connections");
72// Tracks the number of channel reconnection attempts
73declare_static_counter!(CHANNEL_RECONNECTIONS, "channel.reconnections");
74// Tracks errors for each channel pair
75declare_static_counter!(CHANNEL_ERRORS, "channel.errors");
76// Tracks the number of NetRx encountering full buffer, i.e. its mpsc channel.
77
78// This metric counts how often the NetRx→client mpsc channel remains full,
79// incrementing once per CHANNEL_NET_RX_BUFFER_FULL_CHECK_INTERVAL while blocked.
80declare_static_counter!(CHANNEL_NET_RX_BUFFER_FULL, "channel.net_rx_buffer_full");
81
82// Tracks throughput (bytes sent)
83declare_static_counter!(CHANNEL_THROUGHPUT_BYTES, "channel.throughput.bytes");
84// Tracks throughput (message count)
85declare_static_counter!(CHANNEL_THROUGHPUT_MESSAGES, "channel.throughput.messages");
86// Tracks message latency for each channel pair in microseconds
87declare_static_histogram!(CHANNEL_LATENCY_MICROS, "channel.latency.us");
88
89// PROC MESH
90// Tracks the number of active processes in the process mesh
91declare_static_counter!(PROC_MESH_ALLOCATION, "proc_mesh.active_procs");
92// Tracks the number of process failures in the process mesh
93declare_static_counter!(PROC_MESH_PROC_STOPPED, "proc_mesh.proc_failures");
94// Tracks the number of actor failures within the process mesh
95declare_static_counter!(PROC_MESH_ACTOR_FAILURES, "proc_mesh.actor_failures");
96
97// MESSAGE LATENCY
98// Tracks end-to-end message latency in microseconds (sampled at 1% by default)
99declare_static_histogram!(MESSAGE_LATENCY_MICROS, "message.e2e_latency.us");