hyperactor/
panic_handler.rs

1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
//! Used to capture the backtrace from a panic and store it in a task_local, so
//! that it can be retrieved later when the panic is caught.
11
12use std::backtrace::Backtrace;
13use std::cell::RefCell;
14use std::future::Future;
15use std::panic;
16
17/// A struct to store the message and backtrace from a panic.
18pub(crate) struct PanicInfo {
19    /// The message from the panic.
20    message: String,
21    /// The location where the panic occurred.
22    location: Option<PanicLocation>,
23    /// The backtrace from the panic.
24    backtrace: Backtrace,
25}
26
27impl std::fmt::Display for PanicInfo {
28    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
29        write!(f, "panic at ")?;
30        match &self.location {
31            Some(loc) => write!(f, "{}", loc)?,
32            None => write!(f, "unavailable")?,
33        }
34        write!(f, ": {}\n{}", self.message, self.backtrace)
35    }
36}
37
/// Owned copy of a panic's source location (file, line and column), so it can
/// be kept after the borrowed location it was built from is gone.
#[derive(Debug, Clone)]
struct PanicLocation {
    /// Source file in which the panic was raised.
    file: String,
    /// Line number within `file`.
    line: u32,
    /// Column number within `line`.
    column: u32,
}
45
46impl From<&panic::Location<'_>> for PanicLocation {
47    fn from(loc: &panic::Location<'_>) -> Self {
48        Self {
49            file: loc.file().to_string(),
50            line: loc.line(),
51            column: loc.column(),
52        }
53    }
54}
55
56impl std::fmt::Display for PanicLocation {
57    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
58        write!(f, "{}:{}:{}", self.file, self.line, self.column)
59    }
60}
61
62tokio::task_local! {
63    /// A task_local variable to store the backtrace from a panic, so it can be
64    /// retrieved later.
65    static BACKTRACE: RefCell<Option<PanicInfo>>;
66}
67
68/// Call this from the main method of your application, and use it in conjunction
69/// with [[with_backtrace_tracking]] and [[take_panic_info]], in order to
70/// capture the backtrace from a panic.
71pub fn set_panic_hook() {
72    panic::update_hook(move |prev, info| {
73        let backtrace = Backtrace::force_capture();
74
75        // Extract the panic message from the payload
76        let panic_msg = if let Some(s) = info.payload_as_str() {
77            s.to_string()
78        } else {
79            "panic message was not a string".to_string()
80        };
81
82        let location = info.location().map(PanicLocation::from);
83        let loc_str = location
84            .as_ref()
85            .map_or_else(|| "unavailable".to_owned(), |l| l.to_string());
86        tracing::error!("stacktrace"=%backtrace, "panic at {loc_str}: {panic_msg}");
87
88        let _result = BACKTRACE.try_with(|entry| match entry.try_borrow_mut() {
89            Ok(mut entry_ref) => {
90                *entry_ref = Some(PanicInfo {
91                    message: panic_msg,
92                    location,
93                    backtrace,
94                });
95            }
96            Err(borrow_mut_error) => {
97                eprintln!(
98                    "failed to store backtrace to task_local: {:?}",
99                    borrow_mut_error
100                );
101            }
102        });
103
104        // Execute the default hood to preserve the default behavior.
105        prev(info);
106    });
107}
108
109/// Set a task_local variable for this future f, so any panic occurred in f can
110/// be stored and retrieved later.
111pub(crate) async fn with_backtrace_tracking<F>(f: F) -> F::Output
112where
113    F: Future,
114{
115    BACKTRACE.scope(RefCell::new(None), f).await
116}
117
118/// Take the backtrace from the task_local variable, and reset the task_local to
119/// None. Return error if the backtrace is not stored, or cannot be retrieved.
120pub(crate) fn take_panic_info() -> Result<PanicInfo, anyhow::Error> {
121    BACKTRACE
122        .try_with(|entry| {
123            entry
124                .try_borrow_mut()
125                .map_err(|e| anyhow::anyhow!("failed to borrow task_local: {:?}", e))
126                .and_then(|mut entry_ref| {
127                    // Use take because we want to clear the task_local after
128                    // the panic info has been retrieve.
129                    entry_ref
130                        .take()
131                        .ok_or_else(|| anyhow::anyhow!("nothing is stored in task_local"))
132                })
133        })
134        .map_err(|e| anyhow::anyhow!("failed to access task_local: {:?}", e))?
135}
136
#[cfg(test)]
mod tests {
    use futures::FutureExt;

    use super::*;

    /// Panics with "boom!" inside a future and catches the unwind, so the
    /// caller keeps running after the panic.
    async fn execute_panic() {
        let outcome = async {
            panic!("boom!");
        }
        .catch_unwind()
        .await;
        assert!(outcome.is_err());
    }

    #[tokio::test]
    async fn test_with_tracking() {
        set_panic_hook();
        let tracked = with_backtrace_tracking(async {
            execute_panic().await;
            // The first take succeeds because the panic was recorded.
            assert!(take_panic_info().is_ok());
            // The second take fails: the first one reset the task_local.
            assert!(take_panic_info().is_err());
        });
        tracked.await;

        // Outside of the tracked scope there is no task_local to read from.
        assert!(take_panic_info().is_err());
    }

    #[tokio::test]
    async fn test_without_tracking() {
        set_panic_hook();
        let untracked = async {
            execute_panic().await;
            // No task_local was set up, so nothing can be retrieved.
            assert!(take_panic_info().is_err());
        };
        untracked.await;
    }

    // NOTE(review): the panic hook is process-global, so this expectation
    // only holds when no other test in the same process has already called
    // `set_panic_hook()` — confirm that the test runner isolates test cases.
    #[tokio::test]
    async fn test_without_init() {
        // set_panic_hook() is intentionally not called here.
        let tracked = with_backtrace_tracking(async {
            execute_panic().await;
            // Without the custom panic hook nothing is written to the
            // task_local.
            assert!(take_panic_info().is_err());
        });
        tracked.await;
    }

    #[tokio::test]
    async fn test_nested_tasks() {
        /// Panics with "wow!", catches the unwind, then checks whether panic
        /// info is available according to `backtrace_captured`.
        async fn verify_inner_panic(backtrace_captured: bool) {
            let outcome = async {
                panic!("wow!");
            }
            .catch_unwind()
            .await;
            assert!(outcome.is_err());

            let taken = take_panic_info();
            if !backtrace_captured {
                assert!(taken.is_err());
                return;
            }
            let info = taken.unwrap();
            assert_eq!(info.message, "wow!");
            assert!(info.backtrace.to_string().contains("verify_inner_panic"));
        }

        set_panic_hook();
        let outer = with_backtrace_tracking(async {
            execute_panic().await;

            // A spawned task without tracking cannot see any panic info.
            let untracked = tokio::task::spawn(async {
                verify_inner_panic(false).await;
            });
            assert!(untracked.await.is_ok());

            // A spawned task with its own tracking scope sees its own panic.
            let tracked = tokio::task::spawn(with_backtrace_tracking(verify_inner_panic(true)));
            assert!(tracked.await.is_ok());

            // The outer task still holds the info of its own panic.
            let info = take_panic_info().unwrap();
            assert_eq!(info.message, "boom!");
            assert!(info.backtrace.to_string().contains("test_nested_tasks"));
        });
        outer.await;
    }
}