hyperactor/
data.rs

1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
9//! This module contains core traits and implementation to manage remote data
10//! types in Hyperactor.
11
12use std::any::TypeId;
13use std::collections::HashMap;
14use std::fmt;
15use std::io::Cursor;
16use std::sync::LazyLock;
17
18use enum_as_inner::EnumAsInner;
19use serde::Deserialize;
20use serde::Serialize;
21use serde::de::DeserializeOwned;
22
23use crate as hyperactor;
24use crate::config;
25
26/// A [`Named`] type is a type that has a globally unique name.
27pub trait Named: Sized + 'static {
28    /// The globally unique type name for the type.
29    /// This should typically be the fully qualified Rust name of the type.
30    fn typename() -> &'static str;
31
32    /// A globally unique hash for this type.
33    /// TODO: actually enforce perfect hashing
34    fn typehash() -> u64 {
35        // The `Named` macro overrides this implementation with one that
36        // memoizes the hash.
37        cityhasher::hash(Self::typename())
38    }
39
40    /// The TypeId for this type. TypeIds are unique only within a binary,
41    /// and should not be used for global identification.
42    fn typeid() -> TypeId {
43        TypeId::of::<Self>()
44    }
45
46    /// The globally unique port for this type. Typed ports are in the range
47    /// of 1<<63..1<<64-1.
48    fn port() -> u64 {
49        Self::typehash() | (1 << 63)
50    }
51
52    /// If the named type is an enum, this returns the name of the arm
53    /// of the value self.
54    fn arm(&self) -> Option<&'static str> {
55        None
56    }
57
58    /// An unsafe version of 'arm', accepting a pointer to the value,
59    /// for use in type-erased settings.
60    unsafe fn arm_unchecked(self_: *const ()) -> Option<&'static str> {
61        // SAFETY: This isn't safe. We're passing it on.
62        unsafe { &*(self_ as *const Self) }.arm()
63    }
64}
65
66#[doc(hidden)]
67/// Dump trait for Named types that are also serializable/deserializable.
68/// This is a utility used by [`Serialized::dump`], and is not intended
69/// for direct use.
70pub trait NamedDumpable: Named + Serialize + for<'de> Deserialize<'de> {
71    /// Dump the data in Serialized to a JSON value.
72    fn dump(data: Serialized) -> Result<serde_json::Value, anyhow::Error>;
73}
74
75impl<T: Named + Serialize + for<'de> Deserialize<'de>> NamedDumpable for T {
76    fn dump(data: Serialized) -> Result<serde_json::Value, anyhow::Error> {
77        let value = data.deserialized::<Self>()?;
78        Ok(serde_json::to_value(value)?)
79    }
80}
81
82macro_rules! impl_basic {
83    ($t:ty) => {
84        impl Named for $t {
85            fn typename() -> &'static str {
86                stringify!($t)
87            }
88        }
89    };
90}
91
92impl_basic!(());
93impl_basic!(bool);
94impl_basic!(i8);
95impl_basic!(u8);
96impl_basic!(i16);
97impl_basic!(u16);
98impl_basic!(i32);
99impl_basic!(u32);
100impl_basic!(i64);
101impl_basic!(u64);
102impl_basic!(i128);
103impl_basic!(u128);
104impl_basic!(isize);
105impl_basic!(usize);
106impl_basic!(f32);
107impl_basic!(f64);
108impl_basic!(String);
109impl_basic!(std::net::IpAddr);
110impl_basic!(std::net::Ipv4Addr);
111impl_basic!(std::net::Ipv6Addr);
112impl_basic!(std::time::Duration);
113impl_basic!(std::time::SystemTime);
114impl_basic!(bytes::Bytes);
115// This is somewhat unfortunate. We should separate this module out into
116// its own crate, and just derive(Named) in `ndslice`. As it is, this would
117// create a circular (and heavy!) dependency for `ndslice`.
118impl_basic!(ndslice::Point);
119
120impl Named for &'static str {
121    fn typename() -> &'static str {
122        "&str"
123    }
124}
125
// A macro that implements type-keyed interning of typenames. This is useful
// for implementing [`Named`] for generic types.
//
// The macro expands to an expression of type `&'static str`. Names are
// formatted once per key type, leaked, and cached in a `static` map keyed by
// the `TypeId` of `$key`.
//
// The name is deliberately formatted *outside* of any map guard: the
// `typename()` calls of the arguments may re-enter this very expansion (and
// therefore this very map, since the `static` is shared by every
// instantiation of a generic impl, e.g. `Vec<Vec<T>>`). Holding a map entry
// while doing so can deadlock when both keys fall into the same shard.
// Formatting a name twice under a race is harmless: typenames are
// deterministic, and the first inserted string stays the canonical one.
#[doc(hidden)] // not part of the public API
#[macro_export]
macro_rules! intern_typename {
    ($key:ty, $format_string:expr, $($args:ty),+) => {
        {
            static CACHE: std::sync::LazyLock<$crate::dashmap::DashMap<std::any::TypeId, &'static str>> =
              std::sync::LazyLock::new($crate::dashmap::DashMap::new);

            let key = std::any::TypeId::of::<$key>();
            // Copy the cached name out; the read guard is released at the
            // end of this statement.
            let cached: Option<&'static str> = CACHE.get(&key).map(|entry| *entry.value());
            match cached {
                Some(typename) => typename,
                None => {
                    // No guard is held here, so nested typename() calls may
                    // freely use the cache.
                    let typename: &'static str =
                        format!($format_string, $(<$args>::typename()),+).leak();
                    // Keep whichever name was inserted first.
                    let interned: &'static str = *CACHE.entry(key).or_insert(typename);
                    interned
                }
            }
        }
    };
}
147pub use intern_typename;
148
// Expands to a format string literal with one `{}` placeholder per
// identifier, separated by ", ". Each identifier must be followed by a comma.
macro_rules! tuple_format_string {
    ($last:ident,) => {
        "{}"
    };
    ($head:ident, $($tail:ident,)+) => {
        concat!("{}, ", tuple_format_string!($($tail,)+))
    };
}
153
// Drops the first identifier and invokes `impl_tuple!` on the remainder, so
// that every shorter tuple arity gets an implementation too.
macro_rules! impl_tuple_peel {
    ($first:ident, $($remaining:ident,)*) => {
        impl_tuple! { $($remaining,)* }
    };
}
157
158macro_rules! impl_tuple {
159    () => ();
160    ( $($name:ident,)+ ) => (
161        impl<$($name:Named + 'static),+> Named for ($($name,)+) {
162            fn typename() -> &'static str {
163                intern_typename!(Self, concat!("(", tuple_format_string!($($name,)+), ")"), $($name),+)
164            }
165        }
166        impl_tuple_peel! { $($name,)+ }
167    )
168}
169
170impl_tuple! { E, D, C, B, A, Z, Y, X, W, V, U, T, }
171
172impl<T: Named + 'static> Named for Option<T> {
173    fn typename() -> &'static str {
174        intern_typename!(Self, "Option<{}>", T)
175    }
176}
177
178impl<T: Named + 'static> Named for Vec<T> {
179    fn typename() -> &'static str {
180        intern_typename!(Self, "Vec<{}>", T)
181    }
182}
183
184impl<K: Named + 'static, V: Named + 'static> Named for HashMap<K, V> {
185    fn typename() -> &'static str {
186        intern_typename!(Self, "HashMap<{}, {}>", K, V)
187    }
188}
189
190impl<T: Named + 'static, E: Named + 'static> Named for Result<T, E> {
191    fn typename() -> &'static str {
192        intern_typename!(Self, "Result<{}, {}>", T, E)
193    }
194}
195
196impl<T: Named + 'static> Named for std::ops::Range<T> {
197    fn typename() -> &'static str {
198        intern_typename!(Self, "std::ops::Range<{}>", T)
199    }
200}
201
202static SHAPE_CACHED_TYPEHASH: LazyLock<u64> =
203    LazyLock::new(|| cityhasher::hash(<ndslice::shape::Shape as Named>::typename()));
204
205impl Named for ndslice::shape::Shape {
206    fn typename() -> &'static str {
207        "ndslice::shape::Shape"
208    }
209
210    fn typehash() -> u64 {
211        *SHAPE_CACHED_TYPEHASH
212    }
213}
214
215/// Really internal, but needs to be exposed for macro.
216#[doc(hidden)]
217#[derive(Debug)]
218pub struct TypeInfo {
219    /// Named::typename()
220    pub typename: fn() -> &'static str,
221    /// Named::typehash()
222    pub typehash: fn() -> u64,
223    /// Named::typeid()
224    pub typeid: fn() -> TypeId,
225    /// Named::typehash()
226    pub port: fn() -> u64,
227    /// A function that can transcode a serialized value to JSON.
228    pub dump: Option<fn(Serialized) -> Result<serde_json::Value, anyhow::Error>>,
229    /// Return the arm for this type, if available.
230    pub arm_unchecked: unsafe fn(*const ()) -> Option<&'static str>,
231}
232
233#[allow(dead_code)]
234impl TypeInfo {
235    /// Get the typeinfo for the provided type hash.
236    pub(crate) fn get(typehash: u64) -> Option<&'static TypeInfo> {
237        TYPE_INFO.get(&typehash).map(|v| &**v)
238    }
239
240    /// Get the typeinfo for the provided type id.
241    pub(crate) fn get_by_typeid(typeid: TypeId) -> Option<&'static TypeInfo> {
242        TYPE_INFO_BY_TYPE_ID.get(&typeid).map(|v| &**v)
243    }
244
245    /// Get the typeinfo for the provided type.
246    pub(crate) fn of<T: ?Sized + 'static>() -> Option<&'static TypeInfo> {
247        Self::get_by_typeid(TypeId::of::<T>())
248    }
249
250    pub(crate) fn typename(&self) -> &'static str {
251        (self.typename)()
252    }
253    pub(crate) fn typehash(&self) -> u64 {
254        (self.typehash)()
255    }
256    pub(crate) fn typeid(&self) -> TypeId {
257        (self.typeid)()
258    }
259    pub(crate) fn port(&self) -> u64 {
260        (self.port)()
261    }
262    pub(crate) fn dump(&self, data: Serialized) -> Result<serde_json::Value, anyhow::Error> {
263        if let Some(dump) = self.dump {
264            (dump)(data)
265        } else {
266            anyhow::bail!("binary does not have dumper for {}", self.typehash())
267        }
268    }
269    pub(crate) unsafe fn arm_unchecked(&self, value: *const ()) -> Option<&'static str> {
270        // SAFETY: This isn't safe, we're passing it on.
271        unsafe { (self.arm_unchecked)(value) }
272    }
273}
274
275inventory::collect!(TypeInfo);
276
277/// Type infos for all types that have been linked into the binary, keyed by typehash.
278static TYPE_INFO: LazyLock<HashMap<u64, &'static TypeInfo>> = LazyLock::new(|| {
279    inventory::iter::<TypeInfo>()
280        .map(|entry| (entry.typehash(), entry))
281        .collect()
282});
283
284/// Type infos for all types that have been linked into the binary, keyed by typeid.
285static TYPE_INFO_BY_TYPE_ID: LazyLock<HashMap<std::any::TypeId, &'static TypeInfo>> =
286    LazyLock::new(|| {
287        TYPE_INFO
288            .values()
289            .map(|info| (info.typeid(), &**info))
290            .collect()
291    });
292
/// Register a (concrete) type so that it may be looked up by name or hash. Type registration
/// is required only to improve diagnostics, as it allows a binary to introspect serialized
/// payloads under type erasure.
///
/// The provided type must implement [`hyperactor::data::Named`], and must be concrete.
/// Because a dumper is always registered, the type must also satisfy the bounds of
/// `hyperactor::data::NamedDumpable` (serde serialization and deserialization).
///
/// The expansion refers to paths under `hyperactor::`, so that name must resolve
/// at the invocation site.
#[macro_export]
macro_rules! register_type {
    ($ty:ty) => {
        hyperactor::submit! {
            hyperactor::data::TypeInfo {
                typename: <$ty as hyperactor::data::Named>::typename,
                typehash: <$ty as hyperactor::data::Named>::typehash,
                typeid: <$ty as hyperactor::data::Named>::typeid,
                port: <$ty as hyperactor::data::Named>::port,
                dump: Some(<$ty as hyperactor::data::NamedDumpable>::dump),
                arm_unchecked: <$ty as hyperactor::data::Named>::arm_unchecked,
            }
        }
    };
}
313
314/// An enumeration containing the supported encodings of Serialized
315/// values.
316#[derive(
317    Debug,
318    Clone,
319    Copy,
320    Serialize,
321    Deserialize,
322    PartialEq,
323    Eq,
324    crate::AttrValue,
325    crate::Named,
326    strum::EnumIter,
327    strum::Display,
328    strum::EnumString
329)]
330pub enum Encoding {
331    /// Serde bincode encoding.
332    #[strum(to_string = "bincode")]
333    Bincode,
334    /// Serde JSON encoding.
335    #[strum(to_string = "serde_json")]
336    Json,
337    /// Serde multipart encoding.
338    #[strum(to_string = "serde_multipart")]
339    Multipart,
340}
341
342/// The encoding used for a serialized value.
343#[derive(Clone, Serialize, Deserialize, PartialEq, EnumAsInner)]
344enum Encoded {
345    Bincode(bytes::Bytes),
346    Json(bytes::Bytes),
347    Multipart(serde_multipart::Message),
348}
349
350impl Encoded {
351    /// The length of the underlying serialized message
352    pub fn len(&self) -> usize {
353        match &self {
354            Encoded::Bincode(data) => data.len(),
355            Encoded::Json(data) => data.len(),
356            Encoded::Multipart(message) => message.len(),
357        }
358    }
359
360    /// Is the message empty. This should always return false.
361    pub fn is_empty(&self) -> bool {
362        match &self {
363            Encoded::Bincode(data) => data.is_empty(),
364            Encoded::Json(data) => data.is_empty(),
365            Encoded::Multipart(message) => message.is_empty(),
366        }
367    }
368
369    /// Returns the encoding of this serialized value.
370    pub fn encoding(&self) -> Encoding {
371        match &self {
372            Encoded::Bincode(_) => Encoding::Bincode,
373            Encoded::Json(_) => Encoding::Json,
374            Encoded::Multipart(_) => Encoding::Multipart,
375        }
376    }
377
378    /// Computes the 32bit crc of the encoded data
379    pub fn crc(&self) -> u32 {
380        match &self {
381            Encoded::Bincode(data) => crc32fast::hash(data),
382            Encoded::Json(data) => crc32fast::hash(data),
383            Encoded::Multipart(message) => {
384                let mut hasher = crc32fast::Hasher::new();
385                hasher.update(message.body().as_ref());
386                for part in message.parts() {
387                    hasher.update(part.as_ref());
388                }
389                hasher.finalize()
390            }
391        }
392    }
393}
394
395impl std::fmt::Debug for Encoded {
396    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
397        match self {
398            Encoded::Bincode(data) => write!(f, "Encoded::Bincode({})", HexFmt(data)),
399            Encoded::Json(data) => write!(f, "Encoded::Json({})", HexFmt(data)),
400            Encoded::Multipart(message) => {
401                write!(
402                    f,
403                    "Encoded::Multipart(illegal?={} body={}",
404                    message.is_illegal(),
405                    HexFmt(message.body())
406                )?;
407                for (index, part) in message.parts().iter().enumerate() {
408                    write!(f, ", part[{}]={}", index, HexFmt(part))?;
409                }
410                write!(f, ")")
411            }
412        }
413    }
414}
415
416/// The type of error returned by operations on [`Serialized`].
417#[derive(Debug, thiserror::Error)]
418pub enum Error {
419    /// Errors returned from serde bincode.
420    #[error(transparent)]
421    Bincode(#[from] bincode::Error),
422
423    /// Errors returned from serde JSON.
424    #[error(transparent)]
425    Json(#[from] serde_json::Error),
426
427    /// The encoding was not recognized.
428    #[error("unknown encoding: {0}")]
429    InvalidEncoding(String),
430}
431
432/// Represents a serialized value, wrapping the underlying serialization
433/// and deserialization details, while ensuring that we pass correctly-serialized
434/// message throughout the system.
435///
436/// Currently, Serialized passes through to bincode, but in the future we may include
437/// content-encoding information to allow for other codecs as well.
438#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
439pub struct Serialized {
440    /// The encoded data
441    encoded: Encoded,
442    /// The typehash of the serialized value. This is used to provide
443    /// typed introspection of the value.
444    typehash: u64,
445}
446
447impl std::fmt::Display for Serialized {
448    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
449        match self.dump() {
450            Ok(value) => {
451                // unwrap okay, self.dump() would return Err otherwise.
452                let typename = self.typename().unwrap();
453                // take the basename of the type (e.g. "foo::bar::baz" -> "baz")
454                let basename = typename.split("::").last().unwrap_or(typename);
455                write!(f, "{}{}", basename, JsonFmt(&value))
456            }
457            Err(_) => write!(f, "{:?}", self.encoded),
458        }
459    }
460}
461
462impl Serialized {
463    /// Construct a new serialized value by serializing the provided T-typed value.
464    /// Serialize uses the default encoding defined by the configuration key
465    /// [`config::DEFAULT_ENCODING`] in the global configuration; use [`serialize_with_encoding`]
466    /// to serialize values with a specific encoding.
467    pub fn serialize<T: Serialize + Named>(value: &T) -> Result<Self, Error> {
468        Self::serialize_with_encoding(config::global::get(config::DEFAULT_ENCODING), value)
469    }
470
471    /// Serialize U-typed value as a T-typed value. This should be used with care
472    /// (typically only in testing), as the value's representation may be illegally
473    /// coerced.
474    pub fn serialize_as<T: Named, U: Serialize>(value: &U) -> Result<Self, Error> {
475        Self::serialize_with_encoding_as::<T, U>(
476            config::global::get(config::DEFAULT_ENCODING),
477            value,
478        )
479    }
480
481    /// Serialize the value with the using the provided encoding.
482    pub fn serialize_with_encoding<T: Serialize + Named>(
483        encoding: Encoding,
484        value: &T,
485    ) -> Result<Self, Error> {
486        Self::serialize_with_encoding_as::<T, T>(encoding, value)
487    }
488
489    /// Serialize U-typed value as a T-typed value. This should be used with care
490    /// (typically only in testing), as the value's representation may be illegally
491    /// coerced.
492    pub fn serialize_with_encoding_as<T: Named, U: Serialize>(
493        encoding: Encoding,
494        value: &U,
495    ) -> Result<Self, Error> {
496        Ok(Self {
497            encoded: match encoding {
498                Encoding::Bincode => Encoded::Bincode(bincode::serialize(value)?.into()),
499                Encoding::Json => Encoded::Json(serde_json::to_vec(value)?.into()),
500                Encoding::Multipart => {
501                    Encoded::Multipart(serde_multipart::serialize_bincode(value)?)
502                }
503            },
504            typehash: T::typehash(),
505        })
506    }
507
508    /// Deserialize a value to the provided type T.
509    pub fn deserialized<T: DeserializeOwned + Named>(&self) -> Result<T, anyhow::Error> {
510        anyhow::ensure!(
511            self.is::<T>(),
512            "attempted to serialize {}-typed serialized into type {}",
513            self.typename().unwrap_or("unknown"),
514            T::typename()
515        );
516        self.deserialized_unchecked()
517    }
518
519    /// Deserialize a value to the provided type T, without checking for type conformance.
520    /// This should be used carefully, only when you know that the dynamic type check is
521    /// not needed.
522    pub fn deserialized_unchecked<T: DeserializeOwned>(&self) -> Result<T, anyhow::Error> {
523        match &self.encoded {
524            Encoded::Bincode(data) => bincode::deserialize(data).map_err(anyhow::Error::from),
525            Encoded::Json(data) => serde_json::from_slice(data).map_err(anyhow::Error::from),
526            Encoded::Multipart(message) => {
527                serde_multipart::deserialize_bincode(message.clone()).map_err(anyhow::Error::from)
528            }
529        }
530    }
531
532    /// Transcode the serialized value to JSON. This operation will succeed if the type hash
533    /// is embedded in the value, and the corresponding type is available in this binary.
534    pub fn transcode_to_json(self) -> Result<Self, Self> {
535        match self.encoded {
536            Encoded::Bincode(_) | Encoded::Multipart(_) => {
537                let json_value = match self.dump() {
538                    Ok(json_value) => json_value,
539                    Err(_) => return Err(self),
540                };
541                let json_data = match serde_json::to_vec(&json_value) {
542                    Ok(json_data) => json_data,
543                    Err(_) => return Err(self),
544                };
545                Ok(Self {
546                    encoded: Encoded::Json(json_data.into()),
547                    typehash: self.typehash,
548                })
549            }
550            Encoded::Json(_) => Ok(self),
551        }
552    }
553
554    /// Dump the Serialized message into a JSON value. This will succeed if: 1) the typehash is embedded
555    /// in the serialized value; 2) the named type is linked into the binary.
556    pub fn dump(&self) -> Result<serde_json::Value, anyhow::Error> {
557        match &self.encoded {
558            Encoded::Bincode(_) | Encoded::Multipart(_) => {
559                let Some(typeinfo) = TYPE_INFO.get(&self.typehash) else {
560                    anyhow::bail!("binary does not have typeinfo for {}", self.typehash);
561                };
562                typeinfo.dump(self.clone())
563            }
564            Encoded::Json(data) => serde_json::from_slice(data).map_err(anyhow::Error::from),
565        }
566    }
567
568    /// The encoding used by this serialized value.
569    pub fn encoding(&self) -> Encoding {
570        self.encoded.encoding()
571    }
572
573    /// The typehash of the serialized value.
574    pub fn typehash(&self) -> u64 {
575        self.typehash
576    }
577
578    /// The typename of the serialized value, if available.
579    pub fn typename(&self) -> Option<&'static str> {
580        TYPE_INFO
581            .get(&self.typehash)
582            .map(|typeinfo| typeinfo.typename())
583    }
584
585    /// Deserialize a prefix of the value. This is currently only supported
586    /// for bincode-serialized values.
587    // TODO: we should support this by formalizing the notion of a 'prefix'
588    // serialization, and generalize it to other codecs as well.
589    pub fn prefix<T: DeserializeOwned>(&self) -> Result<T, anyhow::Error> {
590        match &self.encoded {
591            Encoded::Bincode(data) => bincode::deserialize(data).map_err(anyhow::Error::from),
592            _ => anyhow::bail!("only bincode supports prefix emplacement"),
593        }
594    }
595
596    /// Emplace a new prefix to this value. This is currently only supported
597    /// for bincode-serialized values.
598    pub fn emplace_prefix<T: Serialize + DeserializeOwned>(
599        &mut self,
600        prefix: T,
601    ) -> Result<(), anyhow::Error> {
602        let data = match &self.encoded {
603            Encoded::Bincode(data) => data,
604            _ => anyhow::bail!("only bincode supports prefix emplacement"),
605        };
606
607        // This is a bit ugly, but: we first deserialize out the old prefix,
608        // then serialize the new prefix, then splice the two together.
609        // This is safe because we know that the prefix is the first thing
610        // in the serialized value, and that the serialization format is stable.
611        let mut cursor = Cursor::new(data.clone());
612        let _prefix: T = bincode::deserialize_from(&mut cursor).unwrap();
613        let position = cursor.position() as usize;
614        let suffix = &cursor.into_inner()[position..];
615        let mut data = bincode::serialize(&prefix)?;
616        data.extend_from_slice(suffix);
617        self.encoded = Encoded::Bincode(data.into());
618
619        Ok(())
620    }
621
622    /// The length of the underlying serialized message
623    pub fn len(&self) -> usize {
624        self.encoded.len()
625    }
626
627    /// Is the message empty. This should always return false.
628    pub fn is_empty(&self) -> bool {
629        self.encoded.is_empty()
630    }
631
632    /// Returns the 32bit crc of the serialized data
633    pub fn crc(&self) -> u32 {
634        self.encoded.crc()
635    }
636
637    /// Returns whether this value contains a serialized M-typed value. Returns None
638    /// when type information is unavailable.
639    pub fn is<M: Named>(&self) -> bool {
640        self.typehash == M::typehash()
641    }
642}
643
644const MAX_BYTE_PREVIEW_LENGTH: usize = 8;
645
646fn display_bytes_as_hash(f: &mut impl std::fmt::Write, bytes: &[u8]) -> std::fmt::Result {
647    let hash = crc32fast::hash(bytes);
648    write!(f, "CRC:{:x}", hash)?;
649    // Implementing in this way lets us print without allocating a new intermediate string.
650    for &byte in bytes.iter().take(MAX_BYTE_PREVIEW_LENGTH) {
651        write!(f, " {:x}", byte)?;
652    }
653    if bytes.len() > MAX_BYTE_PREVIEW_LENGTH {
654        write!(f, " [...{} bytes]", bytes.len() - MAX_BYTE_PREVIEW_LENGTH)?;
655    }
656    Ok(())
657}
658
659/// Formats a binary slice as hex when its display function is called.
660pub struct HexFmt<'a>(pub &'a [u8]);
661
662impl<'a> std::fmt::Display for HexFmt<'a> {
663    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
664        // calculate a 2 byte checksum to prepend to the message
665        display_bytes_as_hash(f, self.0)
666    }
667}
668
669/// Formats a JSON value for display, printing all keys but
670/// truncating and displaying a hash if the content is too long.
671pub struct JsonFmt<'a>(pub &'a serde_json::Value);
672
673const MAX_JSON_VALUE_DISPLAY_LENGTH: usize = 8;
674
675impl<'a> std::fmt::Display for JsonFmt<'a> {
676    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
677        /// Truncate the input string to MAX_JSON_VALUE_DISPLAY_LENGTH and append
678        /// the truncated hash of the full value for easy comparison.
679        fn truncate_and_hash(value_str: &str) -> String {
680            let truncate_at = MAX_JSON_VALUE_DISPLAY_LENGTH.min(value_str.len());
681
682            // Respect UTF-8 boundaries (multi-byte chars like emojis can be up to 4 bytes)
683            let mut safe_truncate_at = truncate_at;
684            while safe_truncate_at > 0 && !value_str.is_char_boundary(safe_truncate_at) {
685                safe_truncate_at -= 1;
686            }
687
688            let truncated_str = &value_str[..safe_truncate_at];
689            let mut result = truncated_str.to_string();
690            result.push_str(&format!("[...{} chars] ", value_str.len()));
691            display_bytes_as_hash(&mut result, value_str.as_bytes()).unwrap();
692            result
693        }
694
695        /// Recursively truncate a serde_json::Value object.
696        fn truncate_json_values(value: &serde_json::Value) -> serde_json::Value {
697            match value {
698                serde_json::Value::String(s) => {
699                    if s.len() > MAX_JSON_VALUE_DISPLAY_LENGTH {
700                        serde_json::Value::String(truncate_and_hash(s))
701                    } else {
702                        value.clone()
703                    }
704                }
705                serde_json::Value::Array(arr) => {
706                    let array_str = serde_json::to_string(arr).unwrap();
707                    if array_str.len() > MAX_JSON_VALUE_DISPLAY_LENGTH {
708                        serde_json::Value::String(truncate_and_hash(&array_str))
709                    } else {
710                        value.clone()
711                    }
712                }
713                serde_json::Value::Object(obj) => {
714                    let truncated_obj: serde_json::Map<_, _> = obj
715                        .iter()
716                        .map(|(k, v)| (k.clone(), truncate_json_values(v)))
717                        .collect();
718                    serde_json::Value::Object(truncated_obj)
719                }
720                _ => value.clone(),
721            }
722        }
723
724        let truncated = truncate_json_values(self.0);
725        write!(f, "{}", truncated)
726    }
727}
728
729#[cfg(test)]
730mod tests {
731
732    use serde::Deserialize;
733    use serde::Serialize;
734    use serde_multipart::Part;
735    use strum::IntoEnumIterator;
736
737    use super::*;
738    use crate as hyperactor; // for macros
739    use crate::Named;
740
741    #[derive(Named, Serialize, Deserialize)]
742    struct TestStruct;
743
744    #[test]
745    fn test_names() {
746        assert_eq!(String::typename(), "String");
747        assert_eq!(Option::<String>::typename(), "Option<String>");
748        assert_eq!(Vec::<String>::typename(), "Vec<String>");
749        assert_eq!(Vec::<Vec::<String>>::typename(), "Vec<Vec<String>>");
750        assert_eq!(
751            Vec::<Vec::<Vec::<String>>>::typename(),
752            "Vec<Vec<Vec<String>>>"
753        );
754        assert_eq!(
755            <(u64, String, Option::<isize>)>::typename(),
756            "(u64, String, Option<isize>)"
757        );
758        assert_eq!(
759            TestStruct::typename(),
760            "hyperactor::data::tests::TestStruct"
761        );
762        assert_eq!(
763            Vec::<TestStruct>::typename(),
764            "Vec<hyperactor::data::tests::TestStruct>"
765        );
766    }
767
768    #[test]
769    fn test_ports() {
770        assert_eq!(String::typehash(), 3947244799002047352u64);
771        assert_eq!(String::port(), 13170616835856823160u64);
772        assert_ne!(
773            Vec::<Vec::<Vec::<String>>>::typehash(),
774            Vec::<Vec::<Vec::<Vec::<String>>>>::typehash(),
775        );
776    }
777
778    #[derive(Named, Serialize, Deserialize, PartialEq, Eq, Debug)]
779    struct TestDumpStruct {
780        a: String,
781        b: u64,
782        c: Option<i32>,
783        d: Option<Part>,
784    }
785    crate::register_type!(TestDumpStruct);
786
787    #[test]
788    fn test_dump_struct() {
789        let data = TestDumpStruct {
790            a: "hello".to_string(),
791            b: 1234,
792            c: Some(5678),
793            d: None,
794        };
795        let serialized = Serialized::serialize(&data).unwrap();
796        let serialized_json = serialized.clone().transcode_to_json().unwrap();
797
798        assert!(serialized.encoded.is_multipart());
799        assert!(serialized_json.encoded.is_json());
800
801        let json_string =
802            String::from_utf8(serialized_json.encoded.as_json().unwrap().to_vec().clone()).unwrap();
803        // The serialized data for JSON is just the (compact) JSON string.
804        assert_eq!(
805            json_string,
806            "{\"a\":\"hello\",\"b\":1234,\"c\":5678,\"d\":null}"
807        );
808
809        for serialized in [serialized, serialized_json] {
810            // Note, at this point, serialized has no knowledge other than its embedded typehash.
811
812            assert_eq!(
813                serialized.typename(),
814                Some("hyperactor::data::tests::TestDumpStruct")
815            );
816
817            let json = serialized.dump().unwrap();
818            assert_eq!(
819                json,
820                serde_json::json!({
821                    "a": "hello",
822                    "b": 1234,
823                    "c": 5678,
824                    "d": null,
825                })
826            );
827
828            assert_eq!(
829                format!("{}", serialized),
830                "TestDumpStruct{\"a\":\"hello\",\"b\":1234,\"c\":5678,\"d\":null}",
831            );
832        }
833    }
834
835    #[test]
836    fn test_emplace_prefix() {
837        let config = config::global::lock();
838        let _guard = config.override_key(config::DEFAULT_ENCODING, Encoding::Bincode);
839        let data = TestDumpStruct {
840            a: "hello".to_string(),
841            b: 1234,
842            c: Some(5678),
843            d: None,
844        };
845
846        let mut ser = Serialized::serialize(&data).unwrap();
847        assert_eq!(ser.prefix::<String>().unwrap(), "hello".to_string());
848
849        ser.emplace_prefix("hello, world, 123!".to_string())
850            .unwrap();
851
852        assert_eq!(
853            ser.deserialized::<TestDumpStruct>().unwrap(),
854            TestDumpStruct {
855                a: "hello, world, 123!".to_string(),
856                b: 1234,
857                c: Some(5678),
858                d: None,
859            }
860        );
861    }
862
863    #[test]
864    fn test_arms() {
865        #[derive(Named, Serialize, Deserialize)]
866        enum TestArm {
867            #[allow(dead_code)]
868            A(u32),
869            B,
870            C(),
871            D {
872                #[allow(dead_code)]
873                a: u32,
874                #[allow(dead_code)]
875                b: String,
876            },
877        }
878
879        assert_eq!(TestArm::A(1234).arm(), Some("A"));
880        assert_eq!(TestArm::B.arm(), Some("B"));
881        assert_eq!(TestArm::C().arm(), Some("C"));
882        assert_eq!(
883            TestArm::D {
884                a: 1234,
885                b: "hello".to_string()
886            }
887            .arm(),
888            Some("D")
889        );
890    }
891
892    #[test]
893    fn display_hex() {
894        assert_eq!(
895            format!("{}", HexFmt("hello world".as_bytes())),
896            "CRC:d4a1185 68 65 6c 6c 6f 20 77 6f [...3 bytes]"
897        );
898        assert_eq!(format!("{}", HexFmt("".as_bytes())), "CRC:0");
899        assert_eq!(
900            format!("{}", HexFmt("a very long string that is long".as_bytes())),
901            "CRC:c7e24f62 61 20 76 65 72 79 20 6c [...23 bytes]"
902        );
903    }
904
905    #[test]
906    fn test_json_fmt() {
907        let json_value = serde_json::json!({
908            "name": "test",
909            "number": 42,
910            "nested": {
911                "key": "value"
912            }
913        });
914        // JSON values with short values should print normally
915        assert_eq!(
916            format!("{}", JsonFmt(&json_value)),
917            "{\"name\":\"test\",\"nested\":{\"key\":\"value\"},\"number\":42}",
918        );
919
920        let empty_json = serde_json::json!({});
921        assert_eq!(format!("{}", JsonFmt(&empty_json)), "{}");
922
923        let simple_array = serde_json::json!([1, 2, 3]);
924        assert_eq!(format!("{}", JsonFmt(&simple_array)), "[1,2,3]");
925
926        // JSON values with very long strings should be truncated
927        let long_string_json = serde_json::json!({
928            "long_string": "a".repeat(MAX_JSON_VALUE_DISPLAY_LENGTH * 5)
929        });
930        assert_eq!(
931            format!("{}", JsonFmt(&long_string_json)),
932            "{\"long_string\":\"aaaaaaaa[...40 chars] CRC:c95b8a25 61 61 61 61 61 61 61 61 [...32 bytes]\"}"
933        );
934
935        // JSON values with very long arrays should be truncated
936        let long_array_json =
937            serde_json::json!((1..=(MAX_JSON_VALUE_DISPLAY_LENGTH + 4)).collect::<Vec<_>>());
938        assert_eq!(
939            format!("{}", JsonFmt(&long_array_json)),
940            "\"[1,2,3,4[...28 chars] CRC:e5c881af 5b 31 2c 32 2c 33 2c 34 [...20 bytes]\""
941        );
942
943        // Test for truncation within nested blocks
944        let nested_json = serde_json::json!({
945            "simple_number": 42,
946            "simple_bool": true,
947            "outer": {
948                "long_string": "a".repeat(MAX_JSON_VALUE_DISPLAY_LENGTH + 10),
949                "long_array": (1..=(MAX_JSON_VALUE_DISPLAY_LENGTH + 4)).collect::<Vec<_>>(),
950                "inner": {
951                    "simple_value": "short",
952                }
953            }
954        });
955        println!("{}", JsonFmt(&nested_json));
956        assert_eq!(
957            format!("{}", JsonFmt(&nested_json)),
958            "{\"outer\":{\"inner\":{\"simple_value\":\"short\"},\"long_array\":\"[1,2,3,4[...28 chars] CRC:e5c881af 5b 31 2c 32 2c 33 2c 34 [...20 bytes]\",\"long_string\":\"aaaaaaaa[...18 chars] CRC:b8ac0e31 61 61 61 61 61 61 61 61 [...10 bytes]\"},\"simple_bool\":true,\"simple_number\":42}",
959        );
960    }
961
962    #[test]
963    fn test_json_fmt_utf8_truncation() {
964        // Test that UTF-8 character boundaries are respected during truncation
965        // Create a string with multi-byte characters that would be truncated
966
967        // String with 7 ASCII chars + 4-byte emoji (total 11 bytes, truncates at 8)
968        let utf8_json = serde_json::json!({
969            "emoji": "1234567🦀"  // 7 + 4 = 11 bytes, MAX is 8
970        });
971
972        // Should truncate at byte 7 (before the emoji) to respect UTF-8 boundary
973        let result = format!("{}", JsonFmt(&utf8_json));
974
975        // Verify it doesn't panic and produces valid output
976        assert!(result.contains("1234567"));
977        assert!(!result.contains("🦀")); // Emoji should be truncated away
978
979        // Test with all multi-byte characters
980        let all_multibyte = serde_json::json!({
981            "chinese": "你好世界"  // Each char is 3 bytes = 12 bytes total
982        });
983        let result3 = format!("{}", JsonFmt(&all_multibyte));
984        assert!(!result3.is_empty());
985    }
986
987    #[test]
988    fn test_encodings() {
989        let value = TestDumpStruct {
990            a: "hello, world".to_string(),
991            b: 123,
992            c: Some(321),
993            d: Some(Part::from("hello, world, again")),
994        };
995        for enc in Encoding::iter() {
996            let ser = Serialized::serialize_with_encoding(enc, &value).unwrap();
997            assert_eq!(ser.encoding(), enc);
998            assert_eq!(ser.deserialized::<TestDumpStruct>().unwrap(), value);
999        }
1000    }
1001}