monarch_conda/
hash_utils.rs

1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
9use std::path::Path;
10
11use anyhow::Result;
12use anyhow::bail;
13use digest::Digest;
14use tokio::fs;
15use walkdir::WalkDir;
16
17/// Compute a hash of a directory tree using the provided hasher.
18///
19/// This function traverses the directory tree deterministically (sorted by file name)
20/// and includes both file paths and file contents in the hash computation.
21///
22/// # Arguments
23/// * `dir` - The directory to hash
24/// * `hasher` - A hasher implementing the Digest trait (e.g., Sha256::new())
25///
26/// # Returns
27/// () - The hasher is updated with the directory tree data
28pub async fn hash_directory_tree<D: Digest>(dir: &Path, hasher: &mut D) -> Result<()> {
29    // Iterate entries with deterministic ordering
30    for entry in WalkDir::new(dir).sort_by_file_name().into_iter() {
31        let entry = entry?;
32        let path = entry.path();
33        let relative_path = path.strip_prefix(dir)?;
34
35        // Hash the relative path (normalized to use forward slashes)
36        let path_str = relative_path.to_string_lossy().replace('\\', "/");
37        hasher.update(path_str.as_bytes());
38        hasher.update(b"\0"); // null separator
39
40        if entry.file_type().is_file() {
41            // Hash file type marker, size, and contents
42            hasher.update(b"FILE:");
43            let contents = fs::read(path).await?;
44            hasher.update(contents.len().to_le_bytes());
45            hasher.update(&contents);
46        } else if entry.file_type().is_dir() {
47            // For directories, hash a type marker
48            hasher.update(b"DIR:");
49        } else if entry.file_type().is_symlink() {
50            // For symlinks, hash type marker, target size, and target
51            hasher.update(b"SYMLINK:");
52            let target = fs::read_link(path).await?;
53            let target_string = target.to_string_lossy().into_owned();
54            let target_bytes = target_string.as_bytes();
55            hasher.update(target_bytes.len().to_le_bytes());
56            hasher.update(target_bytes);
57        } else {
58            // Unexpected file type
59            bail!("Unexpected file type for path: {}", path.display());
60        }
61
62        hasher.update(b"\n"); // entry separator
63    }
64
65    Ok(())
66}
67
#[cfg(test)]
mod tests {
    use sha2::Sha256;
    use tempfile::TempDir;
    use tokio::fs;

    use super::*;

    /// Runs `hash_directory_tree` over `root` with a fresh SHA-256 hasher and returns the
    /// raw digest bytes.
    async fn sha256_of_tree(root: &std::path::Path) -> Result<Vec<u8>> {
        let mut hasher = Sha256::new();
        hash_directory_tree(root, &mut hasher).await?;
        Ok(hasher.finalize().to_vec())
    }

    /// Hashing the same, unchanged tree twice must yield the same 32-byte digest.
    #[tokio::test]
    async fn test_hash_directory_tree() -> Result<()> {
        // Scratch tree: two top-level files plus one file inside a subdirectory.
        let scratch = TempDir::new()?;
        let root = scratch.path();
        let nested = root.join("subdir");

        fs::write(root.join("file1.txt"), "Hello, world!").await?;
        fs::write(root.join("file2.txt"), "Another file").await?;
        fs::create_dir(&nested).await?;
        fs::write(nested.join("file3.txt"), "Nested file").await?;

        let first = sha256_of_tree(root).await?;
        let second = sha256_of_tree(root).await?;

        // Deterministic, and the size of a raw SHA-256 digest.
        assert_eq!(first, second);
        assert_eq!(first.len(), 32);

        Ok(())
    }

    /// A file whose contents spell out the directory marker must not hash like an empty
    /// directory of the same name.
    #[tokio::test]
    async fn test_no_hash_collision_between_file_and_dir() -> Result<()> {
        let with_file = TempDir::new()?;
        let with_dir = TempDir::new()?;

        // Tree A: regular file "test" containing the literal text "DIR:".
        fs::write(with_file.path().join("test"), "DIR:").await?;

        // Tree B: empty directory called "test".
        fs::create_dir(with_dir.path().join("test")).await?;

        let file_digest = sha256_of_tree(with_file.path()).await?;
        let dir_digest = sha256_of_tree(with_dir.path()).await?;

        // The per-entry type markers keep the two apart.
        assert_ne!(file_digest, dir_digest);

        Ok(())
    }

    /// A file name that embeds the hasher's own structural markers must not hash like a
    /// plainly named file with the same contents.
    #[tokio::test]
    async fn test_no_structural_marker_collision() -> Result<()> {
        let plain = TempDir::new()?;
        let tricky = TempDir::new()?;

        // Tree A — path "test1", contents "foo\n".
        // Naive framing (no size prefix) would produce: test1\0FILE:foo\n\n
        fs::write(plain.path().join("test1"), "foo\n").await?;

        // Tree B — path "test1\nFILE:", contents "foo\n".
        // Naive framing would produce: test1\nFILE:\0FILE:foo\n\n
        // The name deliberately imitates an entry separator followed by a type marker.
        fs::write(tricky.path().join("test1\nFILE:"), "foo\n").await?;

        let plain_digest = sha256_of_tree(plain.path()).await?;
        let tricky_digest = sha256_of_tree(tricky.path()).await?;

        // The framing of each record must keep marker-like file names from being confused
        // with real structure.
        assert_ne!(plain_digest, tricky_digest);

        Ok(())
    }
}