hyperactor_mesh/
logging.rs

1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
9use std::collections::HashMap;
10use std::collections::VecDeque;
11use std::fmt;
12use std::path::Path;
13use std::path::PathBuf;
14use std::sync::Arc;
15use std::time::Duration;
16use std::time::SystemTime;
17
18use anyhow::Result;
19use async_trait::async_trait;
20use chrono::DateTime;
21use chrono::Local;
22use hostname;
23use hyperactor::Actor;
24use hyperactor::ActorRef;
25use hyperactor::Context;
26use hyperactor::Endpoint as _;
27use hyperactor::HandleClient;
28use hyperactor::Handler;
29use hyperactor::Instance;
30use hyperactor::OncePortRef;
31use hyperactor::ProcAddr;
32use hyperactor::RefClient;
33use hyperactor::channel;
34use hyperactor::channel::ChannelAddr;
35use hyperactor::channel::ChannelRx;
36use hyperactor::channel::ChannelTransport;
37use hyperactor::channel::ChannelTx;
38use hyperactor::channel::Rx;
39use hyperactor::channel::Tx;
40use hyperactor::channel::TxStatus;
41use hyperactor_config::CONFIG;
42use hyperactor_config::ConfigAttr;
43use hyperactor_config::Flattrs;
44use hyperactor_config::attrs::declare_attrs;
45use hyperactor_telemetry::env;
46use hyperactor_telemetry::log_file_path;
47use serde::Deserialize;
48use serde::Serialize;
49use tokio::io;
50use tokio::io::AsyncRead;
51use tokio::io::AsyncReadExt;
52use tokio::io::AsyncWriteExt;
53use tokio::sync::Mutex;
54use tokio::sync::Notify;
55use tokio::sync::RwLock;
56use tokio::sync::watch::Receiver;
57use tokio::task::JoinHandle;
58use tracing::Level;
59use typeuri::Named;
60
61use crate::bootstrap::BOOTSTRAP_LOG_CHANNEL;
62use crate::shortuuid::ShortUuid;
63
64mod line_prefixing_writer;
65
66pub(crate) const DEFAULT_AGGREGATE_WINDOW_SEC: u64 = 5;
67const MAX_LINE_SIZE: usize = 4 * 1024;
68
declare_attrs! {
    /// Maximum number of lines to batch before flushing to the client.
    /// This means that the stdout/stderr reader will be paused after reading
    /// `HYPERACTOR_READ_LOG_BUFFER` lines. After the pause, the lines are
    /// flushed and reading resumes. Defaults to 100.
    @meta(CONFIG = ConfigAttr::new(
        Some("HYPERACTOR_READ_LOG_BUFFER".to_string()),
        Some("read_log_buffer".to_string()),
    ))
    pub attr READ_LOG_BUFFER: usize = 100;

    /// If enabled, local logs are also written to a file and aggregated.
    /// Consulted by `get_unique_local_log_destination` to force file creation
    /// when running in the `Local` environment. Defaults to `false`.
    @meta(CONFIG = ConfigAttr::new(
        Some("HYPERACTOR_FORCE_FILE_LOG".to_string()),
        Some("force_file_log".to_string()),
    ))
    pub attr FORCE_FILE_LOG: bool = false;

    /// Prefixes each forwarded log line with the local rank (see
    /// `StreamFwder::start`). Defaults to `true`.
    @meta(CONFIG = ConfigAttr::new(
        Some("HYPERACTOR_PREFIX_WITH_RANK".to_string()),
        Some("prefix_with_rank".to_string()),
    ))
    pub attr PREFIX_WITH_RANK: bool = true;
}
93
/// Calculate the Levenshtein distance between two strings.
///
/// The distance is measured in Unicode scalar values (`char`s), not bytes:
/// it is the minimum number of single-character insertions, deletions, or
/// substitutions needed to turn `left` into `right`.
///
/// Only two rows of the dynamic-programming table are kept alive at any time,
/// so memory use is proportional to the length of `right` rather than to the
/// product of both lengths. This matters because the inputs are log lines
/// that can be several KiB long.
fn levenshtein_distance(left: &str, right: &str) -> usize {
    let left_chars: Vec<char> = left.chars().collect();
    let right_chars: Vec<char> = right.chars().collect();

    // Handle edge cases: turning an empty string into the other one takes
    // exactly as many insertions as the other one has characters.
    if left_chars.is_empty() {
        return right_chars.len();
    }
    if right_chars.is_empty() {
        return left_chars.len();
    }

    // `prev[j]` is the distance between the first `i` chars of `left` and the
    // first `j` chars of `right`; `curr` is the row for `i + 1` being built.
    let mut prev: Vec<usize> = (0..=right_chars.len()).collect();
    let mut curr: Vec<usize> = vec![0; right_chars.len() + 1];

    for (i, &left_char) in left_chars.iter().enumerate() {
        // Deleting the first `i + 1` chars of `left` yields the empty prefix.
        curr[0] = i + 1;

        for (j, &right_char) in right_chars.iter().enumerate() {
            let substitution = prev[j] + usize::from(left_char != right_char);
            let deletion = prev[j + 1] + 1;
            let insertion = curr[j] + 1;
            curr[j + 1] = substitution.min(deletion).min(insertion);
        }

        std::mem::swap(&mut prev, &mut curr);
    }

    // After the final swap `prev` holds the last completed row.
    prev[right_chars.len()]
}
143
144/// Calculate the normalized edit distance between two strings (0.0 to 1.0)
145fn normalized_edit_distance(left: &str, right: &str) -> f64 {
146    let distance = levenshtein_distance(left, right) as f64;
147    let max_len = std::cmp::max(left.len(), right.len()) as f64;
148
149    if max_len == 0.0 {
150        0.0 // Both strings are empty, so they're identical
151    } else {
152        distance / max_len
153    }
154}
155
/// A single aggregated log entry: the text of the first line seen plus the
/// number of lines that were folded into it.
#[derive(Debug, Clone)]
struct LogLine {
    /// Text of the representative line.
    content: String,
    /// How many lines have been counted under this entry (starts at 1).
    pub count: u64,
}

impl LogLine {
    /// Builds an entry for a line that has been seen exactly once.
    fn new(content: String) -> Self {
        let count = 1;
        LogLine { content, count }
    }
}

impl fmt::Display for LogLine {
    /// Renders the entry as a yellow `[N similar log lines]` tag followed by
    /// the representative content.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let LogLine { content, count } = self;
        write!(f, "\x1b[33m[{} similar log lines]\x1b[0m {}", count, content)
    }
}
178
179#[derive(Debug, Clone)]
180/// Aggregator is a struct that holds a list of LogLines and a start time.
181/// It can aggregate new log lines to existing ones if they are "similar" based on edit distance.
182struct Aggregator {
183    lines: Vec<LogLine>,
184    start_time: SystemTime,
185    similarity_threshold: f64, // Threshold for considering two strings similar (0.0 to 1.0)
186}
187
188impl Aggregator {
189    fn new() -> Self {
190        // Default threshold: strings with normalized edit distance < 0.15 are considered similar
191        Self::new_with_threshold(0.15)
192    }
193
194    fn new_with_threshold(threshold: f64) -> Self {
195        Aggregator {
196            lines: vec![],
197            start_time: std::time::SystemTime::now(),
198            similarity_threshold: threshold,
199        }
200    }
201
202    fn reset(&mut self) {
203        self.lines.clear();
204        self.start_time = std::time::SystemTime::now();
205    }
206
207    fn add_line(&mut self, line: &str) -> anyhow::Result<()> {
208        // Find the most similar existing line
209        let mut best_match_idx = None;
210        let mut best_similarity = f64::MAX;
211
212        for (idx, existing_line) in self.lines.iter().enumerate() {
213            let distance = normalized_edit_distance(&existing_line.content, line);
214
215            // If this line is more similar than our current best match
216            if distance < best_similarity && distance < self.similarity_threshold {
217                best_match_idx = Some(idx);
218                best_similarity = distance;
219            }
220        }
221
222        // If we found a similar enough line, increment its count
223        if let Some(idx) = best_match_idx {
224            self.lines[idx].count += 1;
225        } else {
226            // Otherwise, add a new line
227            self.lines.push(LogLine::new(line.to_string()));
228        }
229
230        Ok(())
231    }
232
233    fn is_empty(&self) -> bool {
234        self.lines.is_empty()
235    }
236}
237
238// Helper function to format SystemTime
239fn format_system_time(time: SystemTime) -> String {
240    let datetime: DateTime<Local> = time.into();
241    datetime.format("%Y-%m-%d %H:%M:%S").to_string()
242}
243
244impl fmt::Display for Aggregator {
245    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
246        // Format the start time
247        let start_time_str = format_system_time(self.start_time);
248
249        // Get and format the current time
250        let current_time = std::time::SystemTime::now();
251        let end_time_str = format_system_time(current_time);
252
253        // Write the header with formatted time window
254        writeln!(
255            f,
256            "\x1b[36m>>> Aggregated Logs ({}) >>>\x1b[0m",
257            start_time_str
258        )?;
259
260        // Write each log line
261        for line in self.lines.iter() {
262            writeln!(f, "{}", line)?;
263        }
264        writeln!(
265            f,
266            "\x1b[36m<<< Aggregated Logs ({}) <<<\x1b[0m",
267            end_time_str
268        )?;
269        Ok(())
270    }
271}
272
/// Messages that can be sent to the LogClientActor remotely.
///
/// Within this file, `LocalLogSender` is the producer: it posts `Log` for
/// each batch of lines and `Flush { sync_version: None }` when flushing.
#[derive(
    Debug,
    Clone,
    Serialize,
    Deserialize,
    Named,
    Handler,
    HandleClient,
    RefClient
)]
pub enum LogMessage {
    /// Log details
    Log {
        /// The hostname of the process that generated the log
        hostname: String,
        /// String representation of the ProcAddr that generated the log
        proc_id: String,
        /// The target output stream (stdout or stderr)
        output_target: OutputTarget,
        /// The log payload. `LocalLogSender::send` serializes a
        /// `Vec<Vec<u8>>` here, one inner vector of bytes per log line.
        payload: wirevalue::Any,
    },

    /// Flush the log
    Flush {
        /// Indicate if the current flush is synced or non-synced.
        /// If synced, a version number is available. Otherwise, none.
        sync_version: Option<u64>,
    },
}
304
/// Messages that can be sent to the LogClient locally.
#[derive(
    Debug,
    Clone,
    Serialize,
    Deserialize,
    Named,
    Handler,
    HandleClient,
    RefClient
)]
#[expect(
    clippy::large_enum_variant,
    reason = "actor message enum with Handler/HandleClient/RefClient derives; boxing fields ripples into client/handler call sites and may require derive-macro changes — separate diff"
)]
pub enum LogClientMessage {
    /// Configure log aggregation: set the aggregation window or disable
    /// aggregation altogether.
    SetAggregate {
        /// The time window in seconds to aggregate logs. If None, aggregation is disabled.
        aggregate_window_sec: Option<u64>,
    },

    /// Synchronously flush all the logs from all the procs. This is for client to call.
    StartSyncFlush {
        /// Expect these many procs to ack the flush message.
        expected_procs: usize,
        /// Return once we have received the acks from all the procs
        reply: OncePortRef<()>,
        /// Return to the caller the current flush version
        version: OncePortRef<u64>,
    },
}
336
/// Trait for sending logs
///
/// Both methods are synchronous; `tee` calls them from its read loop without
/// awaiting.
#[async_trait]
pub trait LogSender: Send + Sync {
    /// Send a log payload in bytes
    ///
    /// `payload` holds one `Vec<u8>` per log line; `target` says which output
    /// stream the lines came from.
    fn send(&mut self, target: OutputTarget, payload: Vec<Vec<u8>>) -> anyhow::Result<()>;

    /// Flush the log channel, ensuring all messages are delivered
    /// Returns when the flush message has been acknowledged
    ///
    /// NOTE(review): `LocalLogSender::flush` only posts a `Flush` message and
    /// returns immediately; whether that implies acknowledgement depends on
    /// `ChannelTx::post` — confirm.
    fn flush(&mut self) -> anyhow::Result<()>;
}
347
/// Represents the target output stream (stdout or stderr)
///
/// Used to pick the console writer (`std_writer`), the log file suffix
/// (`create_unique_file_writer`) and the `FileAppender` channel
/// (`FileAppender::addr_for`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Hash)]
pub enum OutputTarget {
    /// Standard output stream
    Stdout,
    /// Standard error stream
    Stderr,
}
356
/// Identifies an output stream of a child process (stdout or stderr).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Hash)]
pub enum Stream {
    /// Standard output stream
    ChildStdout,
    /// Standard error stream
    ChildStderr,
}
364
365/// Write the log to a local unix channel so some actors can listen to it and stream the log back.
366pub struct LocalLogSender {
367    hostname: String,
368    proc_id: String,
369    tx: ChannelTx<LogMessage>,
370    status: Receiver<TxStatus>,
371}
372
373impl LocalLogSender {
374    fn new(log_channel: ChannelAddr, proc_id: &ProcAddr) -> Result<Self, anyhow::Error> {
375        let tx = channel::dial::<LogMessage>(log_channel)?;
376        let status = tx.status().clone();
377
378        let hostname = hostname::get()
379            .unwrap_or_else(|_| "unknown_host".into())
380            .into_string()
381            .unwrap_or("unknown_host".to_string());
382        Ok(Self {
383            hostname,
384            proc_id: proc_id.to_string(),
385            tx,
386            status,
387        })
388    }
389}
390
391#[async_trait]
392impl LogSender for LocalLogSender {
393    fn send(&mut self, target: OutputTarget, payload: Vec<Vec<u8>>) -> anyhow::Result<()> {
394        if TxStatus::Active == *self.status.borrow() {
395            self.tx.post(LogMessage::Log {
396                hostname: self.hostname.clone(),
397                proc_id: self.proc_id.clone(),
398                output_target: target,
399                payload: wirevalue::Any::serialize(&payload)?,
400            });
401        }
402
403        Ok(())
404    }
405
406    fn flush(&mut self) -> anyhow::Result<()> {
407        // send will make sure message is delivered
408        if TxStatus::Active == *self.status.borrow() {
409            self.tx.post(LogMessage::Flush { sync_version: None });
410        }
411        Ok(())
412    }
413}
414
/// Message sent to FileMonitor
///
/// Carries a batch of already formatted log lines (without trailing
/// newlines) from `tee` to `file_monitor_task`, which appends each one to
/// the log file followed by `\n`.
#[derive(Debug, Clone, Serialize, Deserialize, Named)]
pub struct FileMonitorMessage {
    lines: Vec<String>,
}
wirevalue::register_type!(FileMonitorMessage);
421
422/// File appender, coordinates write access to a file via a channel.
423pub struct FileAppender {
424    stdout_addr: ChannelAddr,
425    stderr_addr: ChannelAddr,
426    #[allow(dead_code)] // Tasks are self terminating
427    stdout_task: JoinHandle<()>,
428    #[allow(dead_code)]
429    stderr_task: JoinHandle<()>,
430    stop: Arc<Notify>,
431}
432
433impl fmt::Debug for FileAppender {
434    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
435        f.debug_struct("FileMonitor")
436            .field("stdout_addr", &self.stdout_addr)
437            .field("stderr_addr", &self.stderr_addr)
438            .finish()
439    }
440}
441
442impl FileAppender {
443    /// Create a new FileAppender with aggregated log files for stdout and stderr
444    /// Returns None if file creation fails
445    pub fn new() -> Option<Self> {
446        let stop = Arc::new(Notify::new());
447        // TODO make it configurable
448        let file_name_tag = hostname::get()
449            .unwrap_or_else(|_| "unknown_host".into())
450            .into_string()
451            .unwrap_or("unknown_host".to_string());
452
453        // Create stdout file and task
454        let (stdout_path, stdout_writer) =
455            match get_unique_local_log_destination(&file_name_tag, OutputTarget::Stdout) {
456                Some(writer) => writer,
457                None => {
458                    tracing::warn!("failed to create stdout file");
459                    return None;
460                }
461            };
462        let (stdout_addr, stdout_rx) = {
463            let _guard = tracing::span!(Level::INFO, "appender", file = "stdout").entered();
464            match channel::serve(ChannelAddr::any(ChannelTransport::Unix)) {
465                Ok((addr, rx)) => (addr, rx),
466                Err(e) => {
467                    tracing::warn!("failed to serve stdout channel: {}", e);
468                    return None;
469                }
470            }
471        };
472        let stdout_stop = stop.clone();
473        let stdout_task = tokio::spawn(file_monitor_task(
474            stdout_rx,
475            stdout_writer,
476            OutputTarget::Stdout,
477            stdout_stop,
478        ));
479
480        // Create stderr file and task
481        let (stderr_path, stderr_writer) =
482            match get_unique_local_log_destination(&file_name_tag, OutputTarget::Stderr) {
483                Some(writer) => writer,
484                None => {
485                    tracing::warn!("failed to create stderr file");
486                    return None;
487                }
488            };
489        let (stderr_addr, stderr_rx) = {
490            let _guard = tracing::span!(Level::INFO, "appender", file = "stderr").entered();
491            match channel::serve(ChannelAddr::any(ChannelTransport::Unix)) {
492                Ok((addr, rx)) => (addr, rx),
493                Err(e) => {
494                    tracing::warn!("failed to serve stderr channel: {}", e);
495                    return None;
496                }
497            }
498        };
499        let stderr_stop = stop.clone();
500        let stderr_task = tokio::spawn(file_monitor_task(
501            stderr_rx,
502            stderr_writer,
503            OutputTarget::Stderr,
504            stderr_stop,
505        ));
506
507        tracing::debug!(
508            "FileAppender: created for stdout {} stderr {} ",
509            stdout_path.display(),
510            stderr_path.display()
511        );
512
513        Some(Self {
514            stdout_addr,
515            stderr_addr,
516            stdout_task,
517            stderr_task,
518            stop,
519        })
520    }
521
522    /// Get a channel address for the specified output target
523    pub fn addr_for(&self, target: OutputTarget) -> ChannelAddr {
524        match target {
525            OutputTarget::Stdout => self.stdout_addr.clone(),
526            OutputTarget::Stderr => self.stderr_addr.clone(),
527        }
528    }
529}
530
531impl Drop for FileAppender {
532    fn drop(&mut self) {
533        // Trigger stop signal to notify tasks to exit
534        self.stop.notify_waiters();
535        tracing::debug!("FileMonitor: dropping, stop signal sent, tasks will flush and exit");
536    }
537}
538
539/// Task that receives lines from StreamFwds and writes them to the aggregated file
540async fn file_monitor_task(
541    mut rx: ChannelRx<FileMonitorMessage>,
542    mut writer: Box<dyn io::AsyncWrite + Send + Unpin + 'static>,
543    target: OutputTarget,
544    stop: Arc<Notify>,
545) {
546    loop {
547        tokio::select! {
548            msg = rx.recv() => {
549                match msg {
550                    Ok(msg) => {
551                        // Write lines to aggregated file
552                        for line in &msg.lines {
553                            if let Err(e) = writer.write_all(line.as_bytes()).await {
554                                tracing::warn!("FileMonitor: failed to write line to file: {}", e);
555                                continue;
556                            }
557                            if let Err(e) = writer.write_all(b"\n").await {
558                                tracing::warn!("FileMonitor: failed to write newline to file: {}", e);
559                            }
560                        }
561                        if let Err(e) = writer.flush().await {
562                            tracing::warn!("FileMonitor: failed to flush file: {}", e);
563                        }
564                    }
565                    Err(e) => {
566                        // Channel error
567                        tracing::debug!("FileMonitor task for {:?}: channel error: {}", target, e);
568                        break;
569                    }
570                }
571            }
572            _ = stop.notified() => {
573                tracing::debug!("FileMonitor task for {:?}: stop signal received", target);
574                break;
575            }
576        }
577    }
578
579    // Graceful shutdown: flush one last time
580    if let Err(e) = writer.flush().await {
581        tracing::warn!("FileMonitor: failed final flush: {}", e);
582    }
583    tracing::debug!("FileMonitor task for {:?} exiting", target);
584}
585
586fn create_unique_file_writer(
587    file_name_tag: &str,
588    output_target: OutputTarget,
589    env: env::Env,
590) -> Result<(PathBuf, Box<dyn io::AsyncWrite + Send + Unpin + 'static>)> {
591    let suffix = match output_target {
592        OutputTarget::Stderr => "stderr",
593        OutputTarget::Stdout => "stdout",
594    };
595    let (path, filename) = log_file_path(env, None)?;
596    let path = Path::new(&path);
597    let mut full_path = PathBuf::from(path);
598
599    let uuid = ShortUuid::generate();
600
601    full_path.push(format!(
602        "{}_{}_{}.{}",
603        filename, file_name_tag, uuid, suffix
604    ));
605    let file = std::fs::OpenOptions::new()
606        .create(true)
607        .append(true)
608        .open(full_path.clone())?;
609    let tokio_file = tokio::fs::File::from_std(file);
610    // TODO: should we buffer this?
611    Ok((full_path, Box::new(tokio_file)))
612}
613
614fn get_unique_local_log_destination(
615    file_name_tag: &str,
616    output_target: OutputTarget,
617) -> Option<(PathBuf, Box<dyn io::AsyncWrite + Send + Unpin + 'static>)> {
618    let env: env::Env = env::Env::current();
619    if env == env::Env::Local && !hyperactor_config::global::get(FORCE_FILE_LOG) {
620        tracing::debug!("not creating log file because of env type");
621        None
622    } else {
623        match create_unique_file_writer(file_name_tag, output_target, env) {
624            Ok((a, b)) => Some((a, b)),
625            Err(e) => {
626                tracing::warn!("failed to create unique file writer: {}", e);
627                None
628            }
629        }
630    }
631}
632
633/// Create a writer for stdout or stderr
634fn std_writer(target: OutputTarget) -> Box<dyn io::AsyncWrite + Send + Unpin> {
635    // Return the appropriate standard output or error writer
636    match target {
637        OutputTarget::Stdout => Box::new(tokio::io::stdout()),
638        OutputTarget::Stderr => Box::new(tokio::io::stderr()),
639    }
640}
641
642/// Copy bytes from `reader` to `writer`, forward to log_sender, and forward to FileMonitor.
643/// The same formatted lines go to both log_sender and file_monitor.
644async fn tee(
645    mut reader: impl AsyncRead + Unpin + Send + 'static,
646    mut std_writer: Box<dyn io::AsyncWrite + Send + Unpin>,
647    log_sender: Option<Box<dyn LogSender + Send>>,
648    file_monitor_addr: Option<ChannelAddr>,
649    target: OutputTarget,
650    prefix: Option<String>,
651    stop: Arc<Notify>,
652    recent_lines_buf: RotatingLineBuffer,
653) -> Result<(), io::Error> {
654    let mut buf = [0u8; 8192];
655    let mut line_buffer = Vec::with_capacity(MAX_LINE_SIZE);
656    let mut log_sender = log_sender;
657
658    // Dial the file monitor channel if provided
659    let mut file_monitor_tx: Option<ChannelTx<FileMonitorMessage>> =
660        file_monitor_addr.and_then(|addr| match channel::dial(addr.clone()) {
661            Ok(tx) => Some(tx),
662            Err(e) => {
663                tracing::warn!("Failed to dial file monitor channel {}: {}", addr, e);
664                None
665            }
666        });
667
668    loop {
669        tokio::select! {
670            read_result = reader.read(&mut buf) => {
671                match read_result {
672                    Ok(n) => {
673                        if n == 0 {
674                            // EOF reached
675                            tracing::debug!("EOF reached in tee");
676                            break;
677                        }
678
679                        // Write to console
680                        if let Err(e) = std_writer.write_all(&buf[..n]).await {
681                            tracing::warn!("error writing to std: {}", e);
682                        }
683
684                        // Process bytes into lines for log_sender and FileMonitor
685                        let mut completed_lines = Vec::new();
686
687                        for &byte in &buf[..n] {
688                            if byte == b'\n' {
689                                // Complete line found
690                                let mut line = String::from_utf8_lossy(&line_buffer).to_string();
691
692                                // Truncate if too long, respecting UTF-8 boundaries
693                                // (multi-byte chars like emojis can be up to 4 bytes)
694                                if line.len() > MAX_LINE_SIZE {
695                                    let mut truncate_at = MAX_LINE_SIZE;
696                                    while truncate_at > 0 && !line.is_char_boundary(truncate_at) {
697                                        truncate_at -= 1;
698                                    }
699                                    line.truncate(truncate_at);
700                                    line.push_str("... [TRUNCATED]");
701                                }
702
703                                // Prepend with prefix if configured
704                                let final_line = if let Some(ref p) = prefix {
705                                    format!("[{}] {}", p, line)
706                                } else {
707                                    line
708                                };
709
710                                completed_lines.push(final_line);
711                                line_buffer.clear();
712                            } else {
713                                line_buffer.push(byte);
714                            }
715                        }
716
717                        // Send completed lines to both log_sender and FileAppender
718                        if !completed_lines.is_empty() {
719                            if let Some(ref mut sender) = log_sender {
720                                let bytes: Vec<Vec<u8>> = completed_lines.iter()
721                                    .map(|s| s.as_bytes().to_vec())
722                                    .collect();
723                                if let Err(e) = sender.send(target, bytes) {
724                                    tracing::warn!("error sending to log_sender: {}", e);
725                                }
726                            }
727
728                            // Send to FileMonitor via hyperactor channel
729                            if let Some(ref mut tx) = file_monitor_tx {
730                                let msg = FileMonitorMessage {
731                                    lines: completed_lines,
732                                };
733                                // Use post() to avoid blocking
734                                tx.post(msg);
735                            }
736                        }
737
738                        recent_lines_buf.try_add_data(&buf, n);
739                    },
740                    Err(e) => {
741                        tracing::debug!("read error in tee: {}", e);
742                        return Err(e);
743                    }
744                }
745            },
746            _ = stop.notified() => {
747                tracing::debug!("stop signal received in tee");
748                break;
749            }
750        }
751    }
752
753    std_writer.flush().await?;
754
755    // Send any remaining partial line
756    if !line_buffer.is_empty() {
757        let mut line = String::from_utf8_lossy(&line_buffer).to_string();
758        // Truncate if too long, respecting UTF-8 boundaries
759        // (multi-byte chars like emojis can be up to 4 bytes)
760        if line.len() > MAX_LINE_SIZE {
761            let mut truncate_at = MAX_LINE_SIZE;
762            while truncate_at > 0 && !line.is_char_boundary(truncate_at) {
763                truncate_at -= 1;
764            }
765            line.truncate(truncate_at);
766            line.push_str("... [TRUNCATED]");
767        }
768        let final_line = if let Some(ref p) = prefix {
769            format!("[{}] {}", p, line)
770        } else {
771            line
772        };
773
774        let final_lines = vec![final_line];
775
776        // Send to log_sender
777        if let Some(ref mut sender) = log_sender {
778            let bytes: Vec<Vec<u8>> = final_lines.iter().map(|s| s.as_bytes().to_vec()).collect();
779            let _ = sender.send(target, bytes);
780        }
781
782        // Send to FileMonitor
783        if let Some(ref mut tx) = file_monitor_tx {
784            let msg = FileMonitorMessage { lines: final_lines };
785            tx.post(msg);
786        }
787    }
788
789    // Flush log_sender
790    if let Some(ref mut sender) = log_sender {
791        let _ = sender.flush();
792    }
793
794    Ok(())
795}
796
797#[derive(Debug, Clone)]
798struct RotatingLineBuffer {
799    recent_lines: Arc<RwLock<VecDeque<String>>>,
800    max_buffer_size: usize,
801}
802
803impl RotatingLineBuffer {
804    fn try_add_data(&self, buf: &[u8], buf_end: usize) {
805        let data_str = String::from_utf8_lossy(&buf[..buf_end]);
806        let lines: Vec<&str> = data_str.lines().collect();
807
808        if let Ok(mut recent_lines_guard) = self.recent_lines.try_write() {
809            for line in lines {
810                if !line.is_empty() {
811                    recent_lines_guard.push_back(line.to_string());
812                    if recent_lines_guard.len() > self.max_buffer_size {
813                        recent_lines_guard.pop_front();
814                    }
815                }
816            }
817        } else {
818            tracing::debug!("Failed to acquire write lock on recent_lines buffer in tee");
819        }
820    }
821
822    async fn peek(&self) -> Vec<String> {
823        let lines = self.recent_lines.read().await;
824        let start_idx = if lines.len() > self.max_buffer_size {
825            lines.len() - self.max_buffer_size
826        } else {
827            0
828        };
829
830        lines.range(start_idx..).cloned().collect()
831    }
832}
833
/// Given a stream forwards data to the provided channel.
///
/// Owns the background `tee` task that does the forwarding, a handle to the
/// buffer of recently seen lines, and the signal used to stop the task.
pub struct StreamFwder {
    // Handle of the spawned `tee` task; resolves to the task's I/O result
    teer: JoinHandle<Result<(), io::Error>>,
    // Shared buffer for peek functionality
    recent_lines_buf: RotatingLineBuffer,
    // Shutdown signal to stop the monitoring loop
    stop: Arc<Notify>,
}
842
843impl StreamFwder {
844    /// Create a new StreamFwder instance, and start monitoring the provided path.
845    /// Once started Monitor will
846    /// - forward logs to log_sender
847    /// - forward logs to file_monitor (if available)
848    /// - pipe reader to target
849    /// - And capture last `max_buffer_size` which can be used to inspect file contents via `peek`.
850    pub fn start(
851        reader: impl AsyncRead + Unpin + Send + 'static,
852        file_monitor_addr: Option<ChannelAddr>,
853        target: OutputTarget,
854        max_buffer_size: usize,
855        log_channel: Option<ChannelAddr>,
856        proc_id: &ProcAddr,
857        local_rank: usize,
858    ) -> Self {
859        let prefix = match hyperactor_config::global::get(PREFIX_WITH_RANK) {
860            true => Some(local_rank.to_string()),
861            false => None,
862        };
863        let std_writer = std_writer(target);
864
865        Self::start_with_writer(
866            reader,
867            std_writer,
868            file_monitor_addr,
869            target,
870            max_buffer_size,
871            log_channel,
872            proc_id,
873            prefix,
874        )
875    }
876
877    /// Create a new StreamFwder instance with a custom writer (used in tests).
878    fn start_with_writer(
879        reader: impl AsyncRead + Unpin + Send + 'static,
880        std_writer: Box<dyn io::AsyncWrite + Send + Unpin>,
881        file_monitor_addr: Option<ChannelAddr>,
882        target: OutputTarget,
883        max_buffer_size: usize,
884        log_channel: Option<ChannelAddr>,
885        proc_id: &ProcAddr,
886        prefix: Option<String>,
887    ) -> Self {
888        // Sanity: when there is no file sink, no log forwarding, and
889        // `tail_size == 0`, the child should have **inherited** stdio
890        // and no `StreamFwder` should exist. In that case console
891        // mirroring happens via inheritance, not via `StreamFwder`.
892        // If we hit this, we piped unnecessarily.
893        debug_assert!(
894            file_monitor_addr.is_some() || max_buffer_size > 0 || log_channel.is_some(),
895            "StreamFwder started with no sinks and no tail"
896        );
897
898        let stop = Arc::new(Notify::new());
899        let recent_lines_buf = RotatingLineBuffer {
900            recent_lines: Arc::new(RwLock::new(VecDeque::<String>::with_capacity(
901                max_buffer_size,
902            ))),
903            max_buffer_size,
904        };
905
906        let log_sender: Option<Box<dyn LogSender + Send>> = if let Some(addr) = log_channel {
907            match LocalLogSender::new(addr, proc_id) {
908                Ok(s) => Some(Box::new(s) as Box<dyn LogSender + Send>),
909                Err(e) => {
910                    tracing::error!("failed to create log sender: {}", e);
911                    None
912                }
913            }
914        } else {
915            None
916        };
917
918        let teer_stop = stop.clone();
919        let recent_line_buf_clone = recent_lines_buf.clone();
920        let teer = tokio::spawn(async move {
921            tee(
922                reader,
923                std_writer,
924                log_sender,
925                file_monitor_addr,
926                target,
927                prefix,
928                teer_stop,
929                recent_line_buf_clone,
930            )
931            .await
932        });
933
934        StreamFwder {
935            teer,
936            recent_lines_buf,
937            stop,
938        }
939    }
940
941    pub async fn abort(self) -> (Vec<String>, Result<(), anyhow::Error>) {
942        self.stop.notify_waiters();
943
944        let lines = self.peek().await;
945        let teer_result = self.teer.await;
946
947        let result: Result<(), anyhow::Error> = match teer_result {
948            Ok(inner) => inner.map_err(anyhow::Error::from),
949            Err(e) => Err(e.into()),
950        };
951
952        (lines, result)
953    }
954
955    /// Inspect the latest `max_buffer` lines read from the file being monitored
956    /// Returns lines in chronological order (oldest first)
957    pub async fn peek(&self) -> Vec<String> {
958        self.recent_lines_buf.peek().await
959    }
960}
961
/// Messages handled by the [`LogForwardActor`].
#[derive(
    Debug,
    Clone,
    Serialize,
    Deserialize,
    Named,
    Handler,
    HandleClient,
    RefClient
)]
pub enum LogForwardMessage {
    /// Receive the next message from the log channel and, if streaming is
    /// enabled, forward it to the client. The handler re-posts this message
    /// to itself, so a single `Forward` keeps the receive loop running.
    Forward {},

    /// Set whether received logs are streamed back to the client.
    SetMode { stream_to_client: bool },

    /// Request a synchronized flush tagged with the given version number.
    ForceSyncFlush { version: u64 },
}
983
/// A log forwarder that receives logs from its parent process and forwards them back to the client.
#[hyperactor::export(LogForwardMessage)]
#[hyperactor::spawnable]
pub struct LogForwardActor {
    /// Receiving end of the served log channel; one message is consumed per
    /// `Forward` handled.
    rx: ChannelRx<LogMessage>,
    /// Sender dialed to the log channel, used to inject `Flush` messages.
    /// Shared behind `Arc<Mutex<..>>` because the delayed-flush task
    /// spawned by the `Forward` handler also sends through it.
    flush_tx: Arc<Mutex<ChannelTx<LogMessage>>>,
    /// A new periodic flush is only scheduled once the current time has
    /// reached this point; initialized to the construction time.
    next_flush_deadline: SystemTime,
    /// The client actor that receives forwarded log lines and flush calls.
    logging_client_ref: ActorRef<LogClientActor>,
    /// Whether received `Log` messages are forwarded to the client.
    /// Starts as `true` and is changed via `SetMode`.
    stream_to_client: bool,
}
994
995#[async_trait]
996impl Actor for LogForwardActor {
997    async fn init(&mut self, this: &Instance<Self>) -> Result<(), anyhow::Error> {
998        this.set_system();
999        this.post_after(this, LogForwardMessage::Forward {}, Duration::from_secs(0));
1000
1001        // Make sure we start the flush loop periodically so the log channel will not deadlock.
1002        self.flush_tx
1003            .lock()
1004            .await
1005            .send(LogMessage::Flush { sync_version: None })
1006            .await?;
1007        Ok(())
1008    }
1009}
1010
1011#[async_trait]
1012impl hyperactor::RemoteSpawn for LogForwardActor {
1013    type Params = ActorRef<LogClientActor>;
1014
1015    async fn new(logging_client_ref: Self::Params, _environment: Flattrs) -> Result<Self> {
1016        let log_channel: ChannelAddr = match std::env::var(BOOTSTRAP_LOG_CHANNEL) {
1017            Ok(channel) => channel.parse()?,
1018            Err(err) => {
1019                tracing::debug!(
1020                    "log forwarder actor failed to read env var {}: {}",
1021                    BOOTSTRAP_LOG_CHANNEL,
1022                    err
1023                );
1024                // TODO: an empty channel to serve
1025                ChannelAddr::any(ChannelTransport::Unix)
1026            }
1027        };
1028        tracing::info!(
1029            "log forwarder {} serve at {}",
1030            std::process::id(),
1031            log_channel
1032        );
1033
1034        let rx = match channel::serve(log_channel.clone()) {
1035            Ok((_, rx)) => rx,
1036            Err(err) => {
1037                // This can happen if we are not spanwed on a separate process like local.
1038                // For local mesh, log streaming anyway is not needed.
1039                tracing::error!(
1040                    "log forwarder actor failed to bootstrap on given channel {}: {}",
1041                    log_channel,
1042                    err
1043                );
1044                channel::serve(ChannelAddr::any(ChannelTransport::Unix))?.1
1045            }
1046        };
1047
1048        // Dial the same channel to send flush message to drain the log queue.
1049        let flush_tx = Arc::new(Mutex::new(channel::dial::<LogMessage>(log_channel)?));
1050        let now = std::time::SystemTime::now();
1051
1052        Ok(Self {
1053            rx,
1054            flush_tx,
1055            next_flush_deadline: now,
1056            logging_client_ref,
1057            stream_to_client: true,
1058        })
1059    }
1060}
1061
1062#[async_trait]
1063#[hyperactor::handle(LogForwardMessage)]
1064impl LogForwardMessageHandler for LogForwardActor {
1065    async fn forward(&mut self, ctx: &Context<Self>) -> Result<(), anyhow::Error> {
1066        match self.rx.recv().await {
1067            Ok(LogMessage::Flush { sync_version }) => {
1068                let now = std::time::SystemTime::now();
1069                match sync_version {
1070                    None => {
1071                        // Schedule another flush to keep the log channel from deadlocking.
1072                        let delay = Duration::from_secs(1);
1073                        if now >= self.next_flush_deadline {
1074                            self.next_flush_deadline = now + delay;
1075                            let flush_tx = self.flush_tx.clone();
1076                            tokio::spawn(async move {
1077                                tokio::time::sleep(delay).await;
1078                                if let Err(e) = flush_tx
1079                                    .lock()
1080                                    .await
1081                                    .send(LogMessage::Flush { sync_version: None })
1082                                    .await
1083                                {
1084                                    tracing::error!("failed to send flush message: {}", e);
1085                                }
1086                            });
1087                        }
1088                    }
1089                    version => {
1090                        self.logging_client_ref.flush(ctx, version).await?;
1091                    }
1092                }
1093            }
1094            Ok(LogMessage::Log {
1095                hostname,
1096                proc_id,
1097                output_target,
1098                payload,
1099            }) => {
1100                if self.stream_to_client {
1101                    self.logging_client_ref
1102                        .log(ctx, hostname, proc_id, output_target, payload)
1103                        .await?;
1104                }
1105            }
1106            Err(e) => {
1107                return Err(e.into());
1108            }
1109        }
1110
1111        // This is not ideal as we are using raw tx/rx.
1112        ctx.post_after(ctx, LogForwardMessage::Forward {}, Duration::from_secs(0));
1113
1114        Ok(())
1115    }
1116
1117    async fn set_mode(
1118        &mut self,
1119        _ctx: &Context<Self>,
1120        stream_to_client: bool,
1121    ) -> Result<(), anyhow::Error> {
1122        self.stream_to_client = stream_to_client;
1123        Ok(())
1124    }
1125
1126    async fn force_sync_flush(
1127        &mut self,
1128        _cx: &Context<Self>,
1129        version: u64,
1130    ) -> Result<(), anyhow::Error> {
1131        self.flush_tx
1132            .lock()
1133            .await
1134            .send(LogMessage::Flush {
1135                sync_version: Some(version),
1136            })
1137            .await
1138            .map_err(anyhow::Error::from)
1139    }
1140}
1141
1142/// Deserialize a serialized message and split it into UTF-8 lines
1143fn deserialize_message_lines(serialized_message: &wirevalue::Any) -> Result<Vec<Vec<String>>> {
1144    // Try to deserialize as Vec<Vec<u8>> first (multiple byte arrays)
1145    if let Ok(message_bytes) = serialized_message.deserialized::<Vec<Vec<u8>>>() {
1146        let mut result = Vec::new();
1147        for bytes in message_bytes {
1148            let message_str = String::from_utf8(bytes)?;
1149            let lines: Vec<String> = message_str.lines().map(|s| s.to_string()).collect();
1150            result.push(lines);
1151        }
1152        return Ok(result);
1153    }
1154
1155    // If that fails, try to deserialize as String and wrap in Vec<Vec<String>>
1156    if let Ok(message) = serialized_message.deserialized::<String>() {
1157        let lines: Vec<String> = message.lines().map(|s| s.to_string()).collect();
1158        return Ok(vec![lines]);
1159    }
1160
1161    // If both fail, return an error
1162    anyhow::bail!("failed to deserialize message as either Vec<Vec<u8>> or String")
1163}
1164
/// A client that receives logs from remote processes and prints them,
/// optionally aggregating lines over a time window first.
#[derive(Debug)]
#[hyperactor::export(LogMessage, LogClientMessage)]
#[hyperactor::spawnable]
pub struct LogClientActor {
    /// Aggregation window in seconds. `None` disables aggregation, in which
    /// case every line is printed as soon as it is received.
    aggregate_window_sec: Option<u64>,
    /// One aggregator per output target, holding lines not yet printed.
    aggregators: HashMap<OutputTarget, Aggregator>,
    /// Time of the most recent flush (or of the most recent direct print
    /// when aggregation is disabled).
    last_flush_time: SystemTime,
    /// Deadline of the timer-driven flush currently scheduled, if any.
    next_flush_deadline: Option<SystemTime>,

    // For flush sync barrier
    /// Version of the most recently started sync flush.
    current_flush_version: u64,
    /// Port to reply on once the ongoing sync flush completes.
    current_flush_port: Option<OncePortRef<()>>,
    /// Number of acks still outstanding for the ongoing sync flush.
    current_unflushed_procs: usize,
}
1180
1181impl Default for LogClientActor {
1182    fn default() -> Self {
1183        // Initialize aggregators
1184        let mut aggregators = HashMap::new();
1185        aggregators.insert(OutputTarget::Stderr, Aggregator::new());
1186        aggregators.insert(OutputTarget::Stdout, Aggregator::new());
1187
1188        Self {
1189            aggregate_window_sec: Some(DEFAULT_AGGREGATE_WINDOW_SEC),
1190            aggregators,
1191            last_flush_time: std::time::SystemTime::now(),
1192            next_flush_deadline: None,
1193            current_flush_version: 0,
1194            current_flush_port: None,
1195            current_unflushed_procs: 0,
1196        }
1197    }
1198}
1199
#[async_trait]
impl Actor for LogClientActor {
    /// Initialize the client actor. The only work done here is flagging the
    /// instance via `set_system`.
    async fn init(&mut self, this: &Instance<Self>) -> Result<(), anyhow::Error> {
        // NOTE(review): presumably marks this as a system actor — confirm
        // against `Instance::set_system`.
        this.set_system();
        Ok(())
    }
}
1207
1208impl LogClientActor {
1209    fn print_aggregators(&mut self) {
1210        for (output_target, aggregator) in self.aggregators.iter_mut() {
1211            if aggregator.is_empty() {
1212                continue;
1213            }
1214            match output_target {
1215                OutputTarget::Stdout => {
1216                    println!("{}", aggregator);
1217                }
1218                OutputTarget::Stderr => {
1219                    eprintln!("{}", aggregator);
1220                }
1221            }
1222
1223            // Reset the aggregator
1224            aggregator.reset();
1225        }
1226    }
1227
1228    fn print_log_line(hostname: &str, proc_id: &str, output_target: OutputTarget, line: String) {
1229        let message = format!("[{} {}] {}", hostname, proc_id, line);
1230
1231        #[cfg(test)]
1232        crate::logging::test_tap::push(&message);
1233
1234        match output_target {
1235            OutputTarget::Stdout => println!("{}", message),
1236            OutputTarget::Stderr => eprintln!("{}", message),
1237        }
1238    }
1239
1240    fn flush_internal(&mut self) {
1241        self.print_aggregators();
1242        self.last_flush_time = std::time::SystemTime::now();
1243        self.next_flush_deadline = None;
1244    }
1245}
1246
impl Drop for LogClientActor {
    /// Print whatever is still buffered in the aggregators so that lines
    /// received since the last flush are not lost when the actor goes away.
    fn drop(&mut self) {
        // Flush the remaining logs before shutting down
        self.print_aggregators();
    }
}
1253
1254#[async_trait]
1255#[hyperactor::handle(LogMessage)]
1256impl LogMessageHandler for LogClientActor {
1257    async fn log(
1258        &mut self,
1259        cx: &Context<Self>,
1260        hostname: String,
1261        proc_id: String,
1262        output_target: OutputTarget,
1263        payload: wirevalue::Any,
1264    ) -> Result<(), anyhow::Error> {
1265        // Deserialize the message and process line by line with UTF-8
1266        let message_line_groups = deserialize_message_lines(&payload)?;
1267        let hostname = hostname.as_str();
1268
1269        let message_lines: Vec<String> = message_line_groups.into_iter().flatten().collect();
1270        match self.aggregate_window_sec {
1271            None => {
1272                for line in message_lines {
1273                    Self::print_log_line(hostname, &proc_id, output_target, line);
1274                }
1275                self.last_flush_time = std::time::SystemTime::now();
1276            }
1277            Some(window) => {
1278                for line in message_lines {
1279                    if let Some(aggregator) = self.aggregators.get_mut(&output_target) {
1280                        if let Err(e) = aggregator.add_line(&line) {
1281                            tracing::error!("error adding log line: {}", e);
1282                            // For the sake of completeness, flush the log lines.
1283                            Self::print_log_line(hostname, &proc_id, output_target, line);
1284                        }
1285                    } else {
1286                        tracing::error!("unknown output target: {:?}", output_target);
1287                        // For the sake of completeness, flush the log lines.
1288                        Self::print_log_line(hostname, &proc_id, output_target, line);
1289                    }
1290                }
1291
1292                let new_deadline = self.last_flush_time + Duration::from_secs(window);
1293                let now = std::time::SystemTime::now();
1294                if new_deadline <= now {
1295                    self.flush_internal();
1296                } else {
1297                    let delay = new_deadline.duration_since(now)?;
1298                    match self.next_flush_deadline {
1299                        None => {
1300                            self.next_flush_deadline = Some(new_deadline);
1301                            cx.post_after(cx, LogMessage::Flush { sync_version: None }, delay);
1302                        }
1303                        Some(deadline) => {
1304                            // Some early log lines have alrady triggered the flush.
1305                            if new_deadline < deadline {
1306                                // This can happen if the user has adjusted the aggregation window.
1307                                self.next_flush_deadline = Some(new_deadline);
1308                                cx.post_after(cx, LogMessage::Flush { sync_version: None }, delay);
1309                            }
1310                        }
1311                    }
1312                }
1313            }
1314        }
1315
1316        Ok(())
1317    }
1318
1319    async fn flush(
1320        &mut self,
1321        cx: &Context<Self>,
1322        sync_version: Option<u64>,
1323    ) -> Result<(), anyhow::Error> {
1324        match sync_version {
1325            None => {
1326                self.flush_internal();
1327            }
1328            Some(version) => {
1329                if version != self.current_flush_version {
1330                    tracing::error!(
1331                        "found mismatched flush versions: got {}, expect {}; this can happen if some previous flush didn't finish fully",
1332                        version,
1333                        self.current_flush_version
1334                    );
1335                    return Ok(());
1336                }
1337
1338                if self.current_unflushed_procs == 0 || self.current_flush_port.is_none() {
1339                    // This is a serious issue; it's better to error out.
1340                    anyhow::bail!("found no ongoing flush request");
1341                }
1342                self.current_unflushed_procs -= 1;
1343
1344                tracing::debug!(
1345                    "ack sync flush: version {}; remaining procs: {}",
1346                    self.current_flush_version,
1347                    self.current_unflushed_procs
1348                );
1349
1350                if self.current_unflushed_procs == 0 {
1351                    self.flush_internal();
1352                    let reply = self.current_flush_port.take().unwrap();
1353                    self.current_flush_port = None;
1354                    reply.post(cx, ());
1355                }
1356            }
1357        }
1358
1359        Ok(())
1360    }
1361}
1362
1363#[async_trait]
1364#[hyperactor::handle(LogClientMessage)]
1365impl LogClientMessageHandler for LogClientActor {
1366    async fn set_aggregate(
1367        &mut self,
1368        _cx: &Context<Self>,
1369        aggregate_window_sec: Option<u64>,
1370    ) -> Result<(), anyhow::Error> {
1371        if self.aggregate_window_sec.is_some() && aggregate_window_sec.is_none() {
1372            // Make sure we flush whatever in the aggregators before disabling aggregation.
1373            self.print_aggregators();
1374        }
1375        self.aggregate_window_sec = aggregate_window_sec;
1376        Ok(())
1377    }
1378
1379    async fn start_sync_flush(
1380        &mut self,
1381        cx: &Context<Self>,
1382        expected_procs_flushed: usize,
1383        reply: OncePortRef<()>,
1384        version: OncePortRef<u64>,
1385    ) -> Result<(), anyhow::Error> {
1386        if self.current_unflushed_procs > 0 || self.current_flush_port.is_some() {
1387            tracing::warn!(
1388                "found unfinished ongoing flush: version {}; {} unflushed procs",
1389                self.current_flush_version,
1390                self.current_unflushed_procs,
1391            );
1392        }
1393
1394        self.current_flush_version += 1;
1395        tracing::debug!(
1396            "start sync flush with version {}",
1397            self.current_flush_version
1398        );
1399        self.current_flush_port = Some(reply.clone());
1400        self.current_unflushed_procs = expected_procs_flushed;
1401        version.post(cx, self.current_flush_version);
1402        Ok(())
1403    }
1404}
1405
/// Test-only tap that records every line printed through
/// `LogClientActor::print_log_line`, so tests can inspect client output.
#[cfg(test)]
pub mod test_tap {
    use std::sync::Mutex;
    use std::sync::OnceLock;

    use tokio::sync::mpsc::UnboundedReceiver;
    use tokio::sync::mpsc::UnboundedSender;

    /// Sending half; set at most once per process.
    static TAP: OnceLock<UnboundedSender<String>> = OnceLock::new();
    /// Receiving half; set at most once per process.
    static RX: OnceLock<Mutex<UnboundedReceiver<String>>> = OnceLock::new();

    /// Install the sender. Later calls are ignored once one is installed.
    pub fn install(tx: UnboundedSender<String>) {
        let _ = TAP.set(tx);
    }

    /// Register the receiver so it can be drained later. Later calls are
    /// ignored once one is registered.
    pub fn set_receiver(rx: UnboundedReceiver<String>) {
        let _ = RX.set(Mutex::new(rx));
    }

    /// Record a line. A no-op when no sender is installed; send failures
    /// are ignored.
    pub fn push(s: &str) {
        let Some(tx) = TAP.get() else {
            return;
        };
        let _ = tx.send(s.to_string());
    }

    /// Collect everything observed so far without waiting for more.
    /// Returns an empty vector when no receiver is registered.
    pub fn drain() -> Vec<String> {
        let Some(shared_rx) = RX.get() else {
            return Vec::new();
        };
        let mut receiver = shared_rx.lock().unwrap();
        // Stops at the first failed `try_recv`.
        std::iter::from_fn(|| receiver.try_recv().ok()).collect()
    }
}
1446
1447#[cfg(test)]
1448mod tests {
1449
1450    use std::sync::Arc;
1451    use std::sync::Mutex;
1452
1453    use hyperactor::ProcAddr;
1454    use hyperactor::RemoteSpawn;
1455    use hyperactor::channel;
1456    use hyperactor::channel::ChannelAddr;
1457    use hyperactor::channel::ChannelTx;
1458    use hyperactor::channel::Tx;
1459    use hyperactor::mailbox::BoxedMailboxSender;
1460    use hyperactor::mailbox::DialMailboxRouter;
1461    use hyperactor::mailbox::MailboxServer;
1462    use hyperactor::proc::Proc;
1463    use hyperactor::testing::ids::test_proc_id;
1464    use tokio::io::AsyncSeek;
1465    use tokio::io::AsyncSeekExt;
1466    use tokio::io::AsyncWriteExt;
1467    use tokio::io::SeekFrom;
1468    use tokio::sync::mpsc;
1469
1470    use super::*;
1471
    /// Outcome of one `process_file_content` pass over a file.
    #[derive(Debug)]
    struct FileProcessingResult {
        /// Complete (newline-terminated) non-empty lines found during this
        /// pass, without their trailing newline.
        lines: Vec<Vec<u8>>,
        /// File offset from which the next pass should resume reading.
        new_position: u64,
        /// Bytes of a trailing line that has not been terminated yet,
        /// buffered for subsequent reads.
        incomplete_line_buffer: Vec<u8>,
    }
1482
1483    /// Process new file content from a given position, extracting complete lines
1484    /// This function is extracted to enable easier unit testing without file system dependencies
1485    async fn process_file_content<R: AsyncRead + AsyncSeek + Unpin>(
1486        reader: &mut R,
1487        current_position: u64,
1488        file_size: u64,
1489        existing_line_buffer: Vec<u8>,
1490        max_buffer_size: usize,
1491    ) -> Result<FileProcessingResult> {
1492        // If position equals file size, we're at the end
1493        if current_position == file_size {
1494            return Ok(FileProcessingResult {
1495                lines: Vec::new(),
1496                new_position: current_position,
1497                incomplete_line_buffer: existing_line_buffer,
1498            });
1499        }
1500
1501        // Handle potential file truncation/rotation
1502        let actual_position = if current_position > file_size {
1503            tracing::warn!(
1504                "File appears to have been truncated (position {} > file size {}), resetting to start",
1505                current_position,
1506                file_size
1507            );
1508            reader.seek(SeekFrom::Start(0)).await?;
1509            0
1510        } else {
1511            // current_position < file_size
1512            reader.seek(SeekFrom::Start(current_position)).await?;
1513            current_position
1514        };
1515
1516        let mut buf = vec![0u8; 128 * 1024];
1517        let mut line_buffer = existing_line_buffer;
1518        let mut lines = Vec::with_capacity(max_buffer_size);
1519        let mut processed_bytes = 0u64;
1520
1521        loop {
1522            let bytes_read = reader.read(&mut buf).await?;
1523            if bytes_read == 0 {
1524                break;
1525            }
1526
1527            let chunk = &buf[..bytes_read];
1528
1529            let mut start = 0;
1530            while let Some(newline_pos) = chunk[start..].iter().position(|&b| b == b'\n') {
1531                let absolute_pos = start + newline_pos;
1532
1533                line_buffer.extend_from_slice(&chunk[start..absolute_pos]);
1534
1535                if !line_buffer.is_empty() {
1536                    if line_buffer.len() > MAX_LINE_SIZE {
1537                        line_buffer.truncate(MAX_LINE_SIZE);
1538                        line_buffer.extend_from_slice(b"... [TRUNCATED]");
1539                    }
1540
1541                    let line_data = std::mem::replace(&mut line_buffer, Vec::with_capacity(2048));
1542                    lines.push(line_data);
1543                }
1544
1545                start = absolute_pos + 1;
1546
1547                // Check if we've reached the max buffer size after adding each line
1548                if lines.len() >= max_buffer_size {
1549                    // We've processed up to and including the current newline
1550                    // The new position is where we should start reading next time
1551                    let new_position = actual_position + processed_bytes + start as u64;
1552
1553                    // Don't save remaining data - we'll re-read it from the new position
1554                    return Ok(FileProcessingResult {
1555                        lines,
1556                        new_position,
1557                        incomplete_line_buffer: Vec::new(),
1558                    });
1559                }
1560            }
1561
1562            // Only add bytes to processed_bytes if we've fully processed this chunk
1563            processed_bytes += bytes_read as u64;
1564
1565            if start < chunk.len() {
1566                line_buffer.extend_from_slice(&chunk[start..]);
1567            }
1568        }
1569
1570        let new_position = actual_position + processed_bytes;
1571
1572        Ok(FileProcessingResult {
1573            lines,
1574            new_position,
1575            incomplete_line_buffer: line_buffer,
1576        })
1577    }
1578
    /// Smoke test: spawn a `LogClientActor` and a `LogForwardActor` on one
    /// proc, post two log messages on the bootstrap log channel, and check
    /// that nothing panics. No output is asserted.
    #[tokio::test]
    async fn test_forwarding_log_to_client() {
        // Set up a proc reachable through a dial router, plus a client handle.
        let router = DialMailboxRouter::new();
        let (proc_addr, client_rx) =
            channel::serve(ChannelAddr::any(ChannelTransport::Unix)).unwrap();
        let proc = Proc::configured(
            test_proc_id("client_0"),
            BoxedMailboxSender::new(router.clone()),
        );
        proc.clone().serve(client_rx);
        let proc_ref: ProcAddr = test_proc_id("client_0");
        router.bind(proc_ref, proc_addr.clone());
        let client = proc.client("client");

        // Spin up both the forwarder and the client. The forwarder reads the
        // channel address from the environment, so publish it there first.
        let log_channel = ChannelAddr::any(ChannelTransport::Unix);
        // SAFETY: Unit test
        unsafe {
            std::env::set_var(BOOTSTRAP_LOG_CHANNEL, log_channel.to_string());
        }
        let log_client_actor = LogClientActor::new((), Flattrs::default()).await.unwrap();
        let log_client: ActorRef<LogClientActor> = proc.spawn(log_client_actor).bind();
        let log_forwarder_actor = LogForwardActor::new(log_client.clone(), Flattrs::default())
            .await
            .unwrap();
        let log_forwarder: ActorRef<LogForwardActor> = proc.spawn(log_forwarder_actor).bind();

        // Post a log before the mode is set explicitly.
        // NOTE(review): the forwarder is constructed with
        // `stream_to_client = true`, so despite the payload text this message
        // is eligible for forwarding as well — confirm the intended setup.
        let tx: ChannelTx<LogMessage> = channel::dial(log_channel).unwrap();
        tx.post(LogMessage::Log {
            hostname: "my_host".into(),
            proc_id: "test_proc".into(),
            output_target: OutputTarget::Stderr,
            payload: wirevalue::Any::serialize(&"will not stream".to_string()).unwrap(),
        });

        // Explicitly set streaming to on, then post another log.
        log_forwarder.set_mode(&client, true).await.unwrap();
        tx.post(LogMessage::Log {
            hostname: "my_host".into(),
            proc_id: "test_proc".into(),
            output_target: OutputTarget::Stderr,
            payload: wirevalue::Any::serialize(&"will stream".to_string()).unwrap(),
        });

        // TODO: it is hard to test out anything meaningful here as the client flushes to stdout.
    }
1627
    /// Exercises `deserialize_message_lines` on both accepted encodings
    /// (`String` and `Vec<Vec<u8>>`), on single-line and blank-line input,
    /// and on invalid UTF-8 bytes.
    #[test]
    fn test_deserialize_message_lines_string() {
        // A String with several lines becomes a single group of lines.
        let message = "Line 1\nLine 2\nLine 3".to_string();
        let serialized = wirevalue::Any::serialize(&message).unwrap();

        let result = deserialize_message_lines(&serialized).unwrap();
        assert_eq!(result, vec![vec!["Line 1", "Line 2", "Line 3"]]);

        // A Vec<Vec<u8>> yields one group per byte array; non-ASCII UTF-8
        // content must survive the round trip.
        let message_bytes = vec![
            "Hello\nWorld".as_bytes().to_vec(),
            "UTF-8 \u{1F980}\nTest".as_bytes().to_vec(),
        ];
        let serialized = wirevalue::Any::serialize(&message_bytes).unwrap();

        let result = deserialize_message_lines(&serialized).unwrap();
        assert_eq!(
            result,
            vec![vec!["Hello", "World"], vec!["UTF-8 \u{1F980}", "Test"]]
        );

        // A message without any newline is a single line.
        let message = "Single line message".to_string();
        let serialized = wirevalue::Any::serialize(&message).unwrap();

        let result = deserialize_message_lines(&serialized).unwrap();

        assert_eq!(result, vec![vec!["Single line message"]]);

        // Two newlines produce two empty lines (no trailing third line).
        let message = "\n\n".to_string();
        let serialized = wirevalue::Any::serialize(&message).unwrap();

        let result = deserialize_message_lines(&serialized).unwrap();

        assert_eq!(result, vec![vec!["", ""]]);

        // Invalid UTF-8 inside a Vec<Vec<u8>> payload must be rejected.
        let invalid_utf8_bytes = vec![vec![0xFF, 0xFE, 0xFD]]; // Invalid UTF-8 sequence in Vec<Vec<u8>>
        let serialized = wirevalue::Any::serialize(&invalid_utf8_bytes).unwrap();

        let result = deserialize_message_lines(&serialized);

        // The function should fail when trying to convert invalid UTF-8 bytes to String
        assert!(
            result.is_err(),
            "Expected deserialization to fail with invalid UTF-8 bytes"
        );
    }
    /// Test double for `LogSender` that republishes every sent line on an
    /// in-memory channel and remembers whether `flush` was called.
    // NOTE(review): `dead_code` is allowed presumably because not every
    // test build uses this mock — confirm it is still needed.
    #[allow(dead_code)]
    struct MockLogSender {
        log_sender: mpsc::UnboundedSender<(OutputTarget, String)>, // receives (output_target, content) per line
        flush_called: Arc<Mutex<bool>>,                            // set to true once flush has been called
    }
1683
    impl MockLogSender {
        /// Create a mock that publishes sent lines on `log_sender`; the
        /// flush flag starts out as `false`.
        #[allow(dead_code)]
        fn new(log_sender: mpsc::UnboundedSender<(OutputTarget, String)>) -> Self {
            Self {
                log_sender,
                flush_called: Arc::new(Mutex::new(false)),
            }
        }
    }
1693
1694    #[async_trait]
1695    impl LogSender for MockLogSender {
1696        fn send(
1697            &mut self,
1698            output_target: OutputTarget,
1699            payload: Vec<Vec<u8>>,
1700        ) -> anyhow::Result<()> {
1701            // For testing purposes, convert to string if it's valid UTF-8
1702            let lines: Vec<String> = payload
1703                .iter()
1704                .map(|b| String::from_utf8_lossy(b).trim_end_matches('\n').to_owned())
1705                .collect();
1706
1707            for line in lines {
1708                self.log_sender
1709                    .send((output_target, line))
1710                    .map_err(|e| anyhow::anyhow!("Failed to send log in test: {}", e))?;
1711            }
1712            Ok(())
1713        }
1714
1715        fn flush(&mut self) -> anyhow::Result<()> {
1716            // Mark that flush was called
1717            let mut flush_called = self.flush_called.lock().unwrap();
1718            *flush_called = true;
1719
1720            // For testing purposes, just return Ok
1721            // In a real implementation, this would wait for all messages to be delivered
1722            Ok(())
1723        }
1724    }
1725
1726    #[test]
1727    fn test_string_similarity() {
1728        // Test exact match
1729        assert_eq!(normalized_edit_distance("hello", "hello"), 0.0);
1730
1731        // Test completely different strings
1732        assert_eq!(normalized_edit_distance("hello", "i'mdiff"), 1.0);
1733
1734        // Test similar strings
1735        assert!(normalized_edit_distance("hello", "helo") < 0.5);
1736        assert!(normalized_edit_distance("hello", "hello!") < 0.5);
1737
1738        // Test empty strings
1739        assert_eq!(normalized_edit_distance("", ""), 0.0);
1740        assert_eq!(normalized_edit_distance("hello", ""), 1.0);
1741    }
1742
1743    #[test]
1744    fn test_add_line_to_empty_aggregator() {
1745        let mut aggregator = Aggregator::new();
1746        let result = aggregator.add_line("ERROR 404 not found");
1747
1748        assert!(result.is_ok());
1749        assert_eq!(aggregator.lines.len(), 1);
1750        assert_eq!(aggregator.lines[0].content, "ERROR 404 not found");
1751        assert_eq!(aggregator.lines[0].count, 1);
1752    }
1753
1754    #[test]
1755    fn test_add_line_merges_with_similar_line() {
1756        let mut aggregator = Aggregator::new_with_threshold(0.2);
1757
1758        // Add first line
1759        aggregator.add_line("ERROR 404 timeout").unwrap();
1760        assert_eq!(aggregator.lines.len(), 1);
1761
1762        // Add second line that should merge (similar enough)
1763        aggregator.add_line("ERROR 500 timeout").unwrap();
1764        assert_eq!(aggregator.lines.len(), 1); // Should still be 1 line after merge
1765        assert_eq!(aggregator.lines[0].count, 2);
1766
1767        // Add third line that's too different
1768        aggregator
1769            .add_line("WARNING database connection failed")
1770            .unwrap();
1771        assert_eq!(aggregator.lines.len(), 2); // Should be 2 lines now
1772
1773        // Add fourth line similar to third
1774        aggregator
1775            .add_line("WARNING database connection timed out")
1776            .unwrap();
1777        assert_eq!(aggregator.lines.len(), 2); // Should still be 2 lines
1778        assert_eq!(aggregator.lines[1].count, 2); // Second group has 2 lines
1779    }
1780
1781    #[test]
1782    fn test_aggregation_of_similar_log_lines() {
1783        let mut aggregator = Aggregator::new_with_threshold(0.2);
1784
1785        // Add the provided log lines with small differences
1786        aggregator.add_line("[1 similar log lines] WARNING <<2025, 2025>> -07-30 <<0, 0>> :41:45,366 conda-unpack-fb:292] Found invalid offsets for share/terminfo/i/ims-ansi, falling back to search/replace to update prefixes for this file.").unwrap();
1787        aggregator.add_line("[1 similar log lines] WARNING <<2025, 2025>> -07-30 <<0, 0>> :41:45,351 conda-unpack-fb:292] Found invalid offsets for lib/pkgconfig/ncursesw.pc, falling back to search/replace to update prefixes for this file.").unwrap();
1788        aggregator.add_line("[1 similar log lines] WARNING <<2025, 2025>> -07-30 <<0, 0>> :41:45,366 conda-unpack-fb:292] Found invalid offsets for share/terminfo/k/kt7, falling back to search/replace to update prefixes for this file.").unwrap();
1789
1790        // Check that we have only one aggregated line due to similarity
1791        assert_eq!(aggregator.lines.len(), 1);
1792
1793        // Check that the count is 3
1794        assert_eq!(aggregator.lines[0].count, 3);
1795    }
1796
    /// End-to-end check of `StreamFwder::start_with_writer`: bytes written to
    /// one end of an in-memory duplex pipe must (a) be copied to the supplied
    /// file writer, (b) produce at least one message on the log channel, and
    /// (c) show up in the recent-lines buffer returned by `abort`.
    #[tokio::test]
    async fn test_stream_fwd_creation() {
        hyperactor_telemetry::initialize_logging_for_test();

        // `writer` plays the role of the child's output; `reader` is handed to the forwarder.
        let (mut writer, reader) = tokio::io::duplex(1024);
        let (log_channel, mut rx) =
            channel::serve::<LogMessage>(ChannelAddr::any(ChannelTransport::Unix)).unwrap();

        // Create a temporary file for testing the writer
        let temp_file = tempfile::NamedTempFile::new().unwrap();
        let temp_path = temp_file.path().to_path_buf();

        // Create file writer that writes to the temp file (using tokio for async compatibility)
        let file_writer = tokio::fs::OpenOptions::new()
            .create(true)
            .write(true)
            .append(true)
            .open(&temp_path)
            .await
            .unwrap();

        // Create a FileAppender and, if one was produced, get its address for stdout
        let file_monitor = FileAppender::new();
        let file_monitor_addr = file_monitor
            .as_ref()
            .map(|fm| fm.addr_for(OutputTarget::Stdout));

        let the_test_proc_id = test_proc_id("testproc_0");
        let monitor = StreamFwder::start_with_writer(
            reader,
            Box::new(file_writer),
            file_monitor_addr,
            OutputTarget::Stdout,
            3, // max_buffer_size
            Some(log_channel),
            &the_test_proc_id,
            None, // no prefix
        );

        // Wait a bit for set up to be done
        tokio::time::sleep(Duration::from_millis(500)).await;

        // Write initial content through the input writer
        writer.write_all(b"Initial log line\n").await.unwrap();
        writer.flush().await.unwrap();

        // Write more content
        for i in 1..=5 {
            writer
                .write_all(format!("New log line {}\n", i).as_bytes())
                .await
                .unwrap();
        }
        writer.flush().await.unwrap();

        // Wait a bit for the file to be written and the watcher to detect changes
        tokio::time::sleep(Duration::from_millis(500)).await;

        // Wait until log sender gets message
        let timeout = Duration::from_secs(1);
        let _ = tokio::time::timeout(timeout, rx.recv())
            .await
            .unwrap_or_else(|_| panic!("Did not get log message within {:?}", timeout));

        // Wait a bit more for all lines to be processed
        tokio::time::sleep(Duration::from_millis(200)).await;

        // Stop the forwarder and collect the lines it kept in its buffer.
        let (recent_lines, _result) = monitor.abort().await;

        // Six lines were written with a buffer size of 3, so at least 3 must remain.
        assert!(
            recent_lines.len() >= 3,
            "Expected buffer with at least 3 lines, got {} lines: {:?}",
            recent_lines.len(),
            recent_lines
        );

        // The file writer must have received the first, an intermediate and the last line.
        let file_contents = std::fs::read_to_string(&temp_path).unwrap();
        assert!(
            file_contents.contains("Initial log line"),
            "Expected temp file to contain 'Initial log line', got: {:?}",
            file_contents
        );
        assert!(
            file_contents.contains("New log line 1"),
            "Expected temp file to contain 'New log line 1', got: {:?}",
            file_contents
        );
        assert!(
            file_contents.contains("New log line 5"),
            "Expected temp file to contain 'New log line 5', got: {:?}",
            file_contents
        );
    }
1890
1891    #[test]
1892    fn test_aggregator_custom_threshold() {
1893        // Test with very strict threshold (0.05)
1894        let mut strict_aggregator = Aggregator::new_with_threshold(0.05);
1895        strict_aggregator.add_line("ERROR 404").unwrap();
1896        strict_aggregator.add_line("ERROR 500").unwrap(); // Should not merge due to strict threshold
1897        assert_eq!(strict_aggregator.lines.len(), 2);
1898
1899        // Test with very lenient threshold (0.8)
1900        let mut lenient_aggregator = Aggregator::new_with_threshold(0.8);
1901        lenient_aggregator.add_line("ERROR 404").unwrap();
1902        lenient_aggregator.add_line("WARNING 200").unwrap(); // Should merge due to lenient threshold
1903        assert_eq!(lenient_aggregator.lines.len(), 1);
1904        assert_eq!(lenient_aggregator.lines[0].count, 2);
1905    }
1906
1907    #[test]
1908    fn test_format_system_time() {
1909        let test_time = SystemTime::UNIX_EPOCH + Duration::from_secs(1609459200); // 2021-01-01 00:00:00 UTC
1910        let formatted = format_system_time(test_time);
1911
1912        // Just verify it's a reasonable format (contains date and time components)
1913        assert!(formatted.contains("-"));
1914        assert!(formatted.contains(":"));
1915        assert!(formatted.len() > 10); // Should be reasonable length
1916    }
1917
1918    #[test]
1919    fn test_aggregator_display_formatting() {
1920        let mut aggregator = Aggregator::new();
1921        aggregator.add_line("Test error message").unwrap();
1922        aggregator.add_line("Test error message").unwrap(); // Should merge
1923
1924        let display_string = format!("{}", aggregator);
1925
1926        // Verify the output contains expected elements
1927        assert!(display_string.contains("Aggregated Logs"));
1928        assert!(display_string.contains("[2 similar log lines]"));
1929        assert!(display_string.contains("Test error message"));
1930        assert!(display_string.contains(">>>") && display_string.contains("<<<"));
1931    }
1932
1933    #[tokio::test]
1934    async fn test_local_log_sender_inactive_status() {
1935        let (log_channel, _) =
1936            channel::serve::<LogMessage>(ChannelAddr::any(ChannelTransport::Unix)).unwrap();
1937        let test_proc_id = test_proc_id("testproc_0");
1938        let mut sender = LocalLogSender::new(log_channel, &test_proc_id).unwrap();
1939
1940        // This test verifies that the sender handles inactive status gracefully
1941        // In a real scenario, the channel would be closed, but for testing we just
1942        // verify the send/flush methods don't panic
1943        let result = sender.send(OutputTarget::Stdout, vec![b"test".to_vec()]);
1944        assert!(result.is_ok());
1945
1946        let result = sender.flush();
1947        assert!(result.is_ok());
1948    }
1949
1950    #[test]
1951    fn test_levenshtein_distance_edge_cases() {
1952        // Test with empty strings
1953        assert_eq!(levenshtein_distance("", ""), 0);
1954        assert_eq!(levenshtein_distance("", "hello"), 5);
1955        assert_eq!(levenshtein_distance("hello", ""), 5);
1956
1957        // Test with identical strings
1958        assert_eq!(levenshtein_distance("hello", "hello"), 0);
1959
1960        // Test with single character differences
1961        assert_eq!(levenshtein_distance("hello", "helo"), 1); // deletion
1962        assert_eq!(levenshtein_distance("helo", "hello"), 1); // insertion
1963        assert_eq!(levenshtein_distance("hello", "hallo"), 1); // substitution
1964
1965        // Test with unicode characters
1966        assert_eq!(levenshtein_distance("café", "cafe"), 1);
1967    }
1968
1969    #[test]
1970    fn test_normalized_edit_distance_edge_cases() {
1971        // Test with empty strings
1972        assert_eq!(normalized_edit_distance("", ""), 0.0);
1973
1974        // Test normalization
1975        assert_eq!(normalized_edit_distance("hello", ""), 1.0);
1976        assert_eq!(normalized_edit_distance("", "hello"), 1.0);
1977
1978        // Test that result is always between 0.0 and 1.0
1979        let distance = normalized_edit_distance("completely", "different");
1980        assert!((0.0..=1.0).contains(&distance));
1981    }
1982
1983    #[tokio::test]
1984    async fn test_deserialize_message_lines_edge_cases() {
1985        // Test with empty string
1986        let empty_message = "".to_string();
1987        let serialized = wirevalue::Any::serialize(&empty_message).unwrap();
1988        let result = deserialize_message_lines(&serialized).unwrap();
1989        assert_eq!(result, vec![vec![] as Vec<String>]);
1990
1991        // Test with trailing newline
1992        let trailing_newline = "line1\nline2\n".to_string();
1993        let serialized = wirevalue::Any::serialize(&trailing_newline).unwrap();
1994        let result = deserialize_message_lines(&serialized).unwrap();
1995        assert_eq!(result, vec![vec!["line1", "line2"]]);
1996    }
1997
1998    #[test]
1999    fn test_output_target_serialization() {
2000        // Test that OutputTarget can be serialized and deserialized
2001        let stdout_serialized = serde_json::to_string(&OutputTarget::Stdout).unwrap();
2002        let stderr_serialized = serde_json::to_string(&OutputTarget::Stderr).unwrap();
2003
2004        let stdout_deserialized: OutputTarget = serde_json::from_str(&stdout_serialized).unwrap();
2005        let stderr_deserialized: OutputTarget = serde_json::from_str(&stderr_serialized).unwrap();
2006
2007        assert_eq!(stdout_deserialized, OutputTarget::Stdout);
2008        assert_eq!(stderr_deserialized, OutputTarget::Stderr);
2009    }
2010
2011    #[test]
2012    fn test_log_line_display_formatting() {
2013        let log_line = LogLine::new("Test message".to_string());
2014        let display_string = format!("{}", log_line);
2015
2016        assert!(display_string.contains("[1 similar log lines]"));
2017        assert!(display_string.contains("Test message"));
2018
2019        // Test with higher count
2020        let mut log_line_multi = LogLine::new("Test message".to_string());
2021        log_line_multi.count = 5;
2022        let display_string_multi = format!("{}", log_line_multi);
2023
2024        assert!(display_string_multi.contains("[5 similar log lines]"));
2025        assert!(display_string_multi.contains("Test message"));
2026    }
2027
2028    // Mock reader for testing process_file_content using std::io::Cursor
2029    fn create_mock_reader(data: Vec<u8>) -> std::io::Cursor<Vec<u8>> {
2030        std::io::Cursor::new(data)
2031    }
2032
2033    #[tokio::test]
2034    async fn test_process_file_content_basic() {
2035        let data = b"line1\nline2\nline3\n".to_vec();
2036        let mut reader = create_mock_reader(data.clone());
2037        let max_buf_size = 10;
2038
2039        let result =
2040            process_file_content(&mut reader, 0, data.len() as u64, Vec::new(), max_buf_size)
2041                .await
2042                .unwrap();
2043
2044        assert_eq!(result.lines.len(), 3);
2045        assert_eq!(result.lines[0], b"line1");
2046        assert_eq!(result.lines[1], b"line2");
2047        assert_eq!(result.lines[2], b"line3");
2048        assert_eq!(result.new_position, data.len() as u64);
2049        assert!(result.incomplete_line_buffer.is_empty());
2050    }
2051
2052    #[tokio::test]
2053    async fn test_process_file_content_incomplete_line() {
2054        let data = b"line1\nline2\npartial".to_vec();
2055        let mut reader = create_mock_reader(data.clone());
2056        let max_buf_size = 10;
2057
2058        let result =
2059            process_file_content(&mut reader, 0, data.len() as u64, Vec::new(), max_buf_size)
2060                .await
2061                .unwrap();
2062
2063        assert_eq!(result.lines.len(), 2);
2064        assert_eq!(result.lines[0], b"line1");
2065        assert_eq!(result.lines[1], b"line2");
2066        assert_eq!(result.new_position, data.len() as u64);
2067        assert_eq!(result.incomplete_line_buffer, b"partial");
2068    }
2069
2070    #[tokio::test]
2071    async fn test_process_file_content_with_existing_buffer() {
2072        let data = b"omplete\nline2\nline3\n".to_vec();
2073        let mut reader = create_mock_reader(data.clone());
2074        let existing_buffer = b"inc".to_vec();
2075        let max_buf_size = 10;
2076
2077        let result = process_file_content(
2078            &mut reader,
2079            0,
2080            data.len() as u64,
2081            existing_buffer,
2082            max_buf_size,
2083        )
2084        .await
2085        .unwrap();
2086
2087        assert_eq!(result.lines.len(), 3);
2088        assert_eq!(result.lines[0], b"incomplete");
2089        assert_eq!(result.lines[1], b"line2");
2090        assert_eq!(result.lines[2], b"line3");
2091        assert_eq!(result.new_position, data.len() as u64);
2092        assert!(result.incomplete_line_buffer.is_empty());
2093    }
2094
2095    #[tokio::test]
2096    async fn test_process_file_content_empty_file() {
2097        let data = Vec::new();
2098        let mut reader = create_mock_reader(data.clone());
2099        let max_buf_size = 10;
2100
2101        let result = process_file_content(&mut reader, 0, 0, Vec::new(), max_buf_size)
2102            .await
2103            .unwrap();
2104
2105        assert!(result.lines.is_empty());
2106        assert_eq!(result.new_position, 0);
2107        assert!(result.incomplete_line_buffer.is_empty());
2108    }
2109
2110    #[tokio::test]
2111    async fn test_process_file_content_only_newlines() {
2112        let data = b"\n\n\n".to_vec();
2113        let mut reader = create_mock_reader(data.clone());
2114        let max_buf_size = 10;
2115
2116        let result =
2117            process_file_content(&mut reader, 0, data.len() as u64, Vec::new(), max_buf_size)
2118                .await
2119                .unwrap();
2120
2121        // Empty lines should not be added (the function skips empty line_buffer)
2122        assert!(result.lines.is_empty());
2123        assert_eq!(result.new_position, data.len() as u64);
2124        assert!(result.incomplete_line_buffer.is_empty());
2125    }
2126
2127    #[tokio::test]
2128    async fn test_process_file_content_no_newlines() {
2129        let data = b"no newlines here".to_vec();
2130        let mut reader = create_mock_reader(data.clone());
2131        let max_buf_size = 10;
2132
2133        let result =
2134            process_file_content(&mut reader, 0, data.len() as u64, Vec::new(), max_buf_size)
2135                .await
2136                .unwrap();
2137
2138        assert!(result.lines.is_empty());
2139        assert_eq!(result.new_position, data.len() as u64);
2140        assert_eq!(result.incomplete_line_buffer, b"no newlines here");
2141    }
2142
2143    #[tokio::test]
2144    async fn test_process_file_content_file_truncation() {
2145        let data = b"line1\nline2\n".to_vec();
2146        let mut reader = create_mock_reader(data.clone());
2147
2148        // Simulate current position being beyond file size (file was truncated)
2149        let result = process_file_content(
2150            &mut reader,
2151            100, // position beyond file size
2152            data.len() as u64,
2153            Vec::new(),
2154            10, // max_buf_size
2155        )
2156        .await
2157        .unwrap();
2158
2159        // Should reset to beginning and read all lines
2160        assert_eq!(result.lines.len(), 2);
2161        assert_eq!(result.lines[0], b"line1");
2162        assert_eq!(result.lines[1], b"line2");
2163        assert_eq!(result.new_position, data.len() as u64);
2164        assert!(result.incomplete_line_buffer.is_empty());
2165    }
2166
2167    #[tokio::test]
2168    async fn test_process_file_content_seek_to_position() {
2169        let data = b"line1\nline2\nline3\n".to_vec();
2170        let mut reader = create_mock_reader(data.clone());
2171
2172        // Start reading from position 6 (after "line1\n")
2173        let result = process_file_content(&mut reader, 6, data.len() as u64, Vec::new(), 10)
2174            .await
2175            .unwrap();
2176
2177        assert_eq!(result.lines.len(), 2);
2178        assert_eq!(result.lines[0], b"line2");
2179        assert_eq!(result.lines[1], b"line3");
2180        assert_eq!(result.new_position, data.len() as u64);
2181        assert!(result.incomplete_line_buffer.is_empty());
2182    }
2183
2184    #[tokio::test]
2185    async fn test_process_file_content_position_equals_file_size() {
2186        let data = b"line1\nline2\n".to_vec();
2187        let mut reader = create_mock_reader(data.clone());
2188
2189        // Start reading from end of file
2190        let result = process_file_content(
2191            &mut reader,
2192            data.len() as u64,
2193            data.len() as u64,
2194            Vec::new(),
2195            10,
2196        )
2197        .await
2198        .unwrap();
2199
2200        // Should not read anything new
2201        assert!(
2202            result.lines.is_empty(),
2203            "Expected empty line got {:?}",
2204            result.lines
2205        );
2206        assert_eq!(result.new_position, data.len() as u64);
2207        assert!(result.incomplete_line_buffer.is_empty());
2208    }
2209
2210    #[tokio::test]
2211    async fn test_process_file_content_large_line_truncation() {
2212        // Create a line longer than MAX_LINE_SIZE
2213        let large_line = "x".repeat(MAX_LINE_SIZE + 1000);
2214        let data = format!("{}\nline2\n", large_line).into_bytes();
2215        let mut reader = create_mock_reader(data.clone());
2216
2217        let result = process_file_content(&mut reader, 0, data.len() as u64, Vec::new(), 10)
2218            .await
2219            .unwrap();
2220
2221        assert_eq!(result.lines.len(), 2);
2222
2223        // First line should be truncated
2224        assert_eq!(
2225            result.lines[0].len(),
2226            MAX_LINE_SIZE + b"... [TRUNCATED]".len()
2227        );
2228        assert!(result.lines[0].ends_with(b"... [TRUNCATED]"));
2229
2230        // Second line should be normal
2231        assert_eq!(result.lines[1], b"line2");
2232
2233        assert_eq!(result.new_position, data.len() as u64);
2234        assert!(result.incomplete_line_buffer.is_empty());
2235    }
2236
2237    #[tokio::test]
2238    async fn test_process_file_content_mixed_line_endings() {
2239        let data = b"line1\nline2\r\nline3\n".to_vec();
2240        let mut reader = create_mock_reader(data.clone());
2241
2242        let result = process_file_content(&mut reader, 0, data.len() as u64, Vec::new(), 10)
2243            .await
2244            .unwrap();
2245
2246        assert_eq!(result.lines.len(), 3);
2247        assert_eq!(result.lines[0], b"line1");
2248        assert_eq!(result.lines[1], b"line2\r"); // \r is preserved
2249        assert_eq!(result.lines[2], b"line3");
2250        assert_eq!(result.new_position, data.len() as u64);
2251        assert!(result.incomplete_line_buffer.is_empty());
2252    }
2253
2254    #[tokio::test]
2255    async fn test_process_file_content_existing_buffer_with_truncation() {
2256        // Create a scenario where existing buffer + new data creates a line that needs truncation
2257        let existing_buffer = "x".repeat(MAX_LINE_SIZE - 100);
2258        let data = format!("{}\nline2\n", "y".repeat(200)).into_bytes();
2259        let mut reader = create_mock_reader(data.clone());
2260
2261        let result = process_file_content(
2262            &mut reader,
2263            0,
2264            data.len() as u64,
2265            existing_buffer.into_bytes(),
2266            10,
2267        )
2268        .await
2269        .unwrap();
2270
2271        assert_eq!(result.lines.len(), 2);
2272
2273        // First line should be truncated (existing buffer + new data)
2274        assert_eq!(
2275            result.lines[0].len(),
2276            MAX_LINE_SIZE + b"... [TRUNCATED]".len()
2277        );
2278        assert!(result.lines[0].ends_with(b"... [TRUNCATED]"));
2279
2280        // Second line should be normal
2281        assert_eq!(result.lines[1], b"line2");
2282
2283        assert_eq!(result.new_position, data.len() as u64);
2284        assert!(result.incomplete_line_buffer.is_empty());
2285    }
2286
2287    #[tokio::test]
2288    async fn test_process_file_content_single_character_lines() {
2289        let data = b"a\nb\nc\n".to_vec();
2290        let mut reader = create_mock_reader(data.clone());
2291
2292        let result = process_file_content(&mut reader, 0, data.len() as u64, Vec::new(), 10)
2293            .await
2294            .unwrap();
2295
2296        assert_eq!(result.lines.len(), 3);
2297        assert_eq!(result.lines[0], b"a");
2298        assert_eq!(result.lines[1], b"b");
2299        assert_eq!(result.lines[2], b"c");
2300        assert_eq!(result.new_position, data.len() as u64);
2301        assert!(result.incomplete_line_buffer.is_empty());
2302    }
2303
2304    #[tokio::test]
2305    async fn test_process_file_content_binary_data() {
2306        let data = vec![0x00, 0x01, 0x02, b'\n', 0xFF, 0xFE, b'\n'];
2307        let mut reader = create_mock_reader(data.clone());
2308
2309        let result = process_file_content(&mut reader, 0, data.len() as u64, Vec::new(), 10)
2310            .await
2311            .unwrap();
2312
2313        assert_eq!(result.lines.len(), 2);
2314        assert_eq!(result.lines[0], vec![0x00, 0x01, 0x02]);
2315        assert_eq!(result.lines[1], vec![0xFF, 0xFE]);
2316        assert_eq!(result.new_position, data.len() as u64);
2317        assert!(result.incomplete_line_buffer.is_empty());
2318    }
2319
2320    #[tokio::test]
2321    async fn test_process_file_content_resume_after_max_buffer_size() {
2322        // Test data: 3 lines as specified in the example
2323        let data = b"line 1\nline 2\nline 3\n".to_vec();
2324        let mut reader = create_mock_reader(data.clone());
2325        let max_buffer_size = 2; // Limit to 2 lines per call
2326
2327        // First call: should return first 2 lines
2328        let result1 = process_file_content(
2329            &mut reader,
2330            0, // start from beginning
2331            data.len() as u64,
2332            Vec::new(), // no existing buffer
2333            max_buffer_size,
2334        )
2335        .await
2336        .unwrap();
2337
2338        // Verify first call results
2339        assert_eq!(result1.lines.len(), 2, "First call should return 2 lines");
2340        assert_eq!(result1.lines[0], b"line 1");
2341        assert_eq!(result1.lines[1], b"line 2");
2342        assert!(result1.incomplete_line_buffer.is_empty());
2343
2344        // The position should be after "line 1\nline 2\n" (14 bytes)
2345        let expected_position_after_first_call = b"line 1\nline 2\n".len() as u64;
2346        assert_eq!(result1.new_position, expected_position_after_first_call);
2347
2348        // Second call: resume from where first call left off
2349        let mut reader2 = create_mock_reader(data.clone());
2350        let result2 = process_file_content(
2351            &mut reader2,
2352            result1.new_position, // resume from previous position
2353            data.len() as u64,
2354            result1.incomplete_line_buffer, // pass any incomplete buffer (should be empty)
2355            max_buffer_size,
2356        )
2357        .await
2358        .unwrap();
2359
2360        // Verify second call results
2361        assert_eq!(result2.lines.len(), 1, "Second call should return 1 line");
2362        assert_eq!(result2.lines[0], b"line 3");
2363        assert!(result2.incomplete_line_buffer.is_empty());
2364        assert_eq!(result2.new_position, data.len() as u64);
2365    }
2366
    /// Regression test: feeding `StreamFwder` a line whose `MAX_LINE_SIZE`
    /// byte boundary falls inside a multi-byte character must not panic.
    #[tokio::test]
    async fn test_utf8_truncation() {
        // Test that StreamFwder doesn't panic when truncating lines
        // with multi-byte chars.

        hyperactor_telemetry::initialize_logging_for_test();

        // Create a line longer than MAX_LINE_SIZE with an emoji at the boundary:
        // MAX_LINE_SIZE - 1 ASCII bytes followed by a 4-byte character, so byte
        // offset MAX_LINE_SIZE sits after the emoji's first byte.
        let mut long_line = "x".repeat(MAX_LINE_SIZE - 1);
        long_line.push('🦀'); // 4-byte emoji - truncation will land in the middle
        long_line.push('\n');

        // Create IO streams (8192-byte pipe, larger than the line being written)
        let (mut writer, reader) = tokio::io::duplex(8192);

        // Start StreamFwder
        let test_proc_id = test_proc_id("testproc_0");
        let monitor = StreamFwder::start_with_writer(
            reader,
            Box::new(tokio::io::sink()), // discard output
            None,                        // no file monitor needed
            OutputTarget::Stdout,
            1,    // tail buffer of 1 (need at least one sink)
            None, // no log channel
            &test_proc_id,
            None, // no prefix
        );

        // Write the problematic line
        writer.write_all(long_line.as_bytes()).await.unwrap();
        drop(writer); // Close to signal EOF

        // Wait for completion - should NOT panic
        // The returned lines are ignored; only the task result matters here.
        let (_lines, result) = monitor.abort().await;
        result.expect("Should complete without panic despite UTF-8 truncation");
    }
2403}
hyperactor_mesh/logging.rs

hyperactor_mesh/
logging.rs