From 8f58c4777ed977f24b81c80efca7cfb36081cafb Mon Sep 17 00:00:00 2001 From: Hocuri Date: Wed, 22 Jan 2025 16:44:59 +0100 Subject: [PATCH] feat: Keep file extension on deduplicated files (#6463) fix #6461 --- deltachat-ffi/deltachat.h | 2 +- src/blob.rs | 80 +++++++++++++++++----------- src/chat/chat_tests.rs | 2 +- src/imex/transfer.rs | 2 +- src/message.rs | 23 ++++---- src/mimeparser.rs | 21 ++++---- src/receive_imf/receive_imf_tests.rs | 4 +- 7 files changed, 78 insertions(+), 56 deletions(-) diff --git a/deltachat-ffi/deltachat.h b/deltachat-ffi/deltachat.h index 0a38dff73..5a5e87b3d 100644 --- a/deltachat-ffi/deltachat.h +++ b/deltachat-ffi/deltachat.h @@ -4766,7 +4766,7 @@ void dc_msg_set_file (dc_msg_t* msg, const char* file, * otherwise it will be copied to the blobdir first. * * In order to deduplicate files that contain the same data, - * the file will be named as a hash of the file data. + * the file will be named `.`, e.g. `ce940175885d7b78f7b7e9f1396611f.jpg`. * * NOTE: * - This function will rename the file. To get the new file path, call `get_file()`. diff --git a/src/blob.rs b/src/blob.rs index 2f2c8959e..2ccf9b0af 100644 --- a/src/blob.rs +++ b/src/blob.rs @@ -150,11 +150,16 @@ impl<'a> BlobObject<'a> { /// otherwise it will be copied to the blobdir first. /// /// In order to deduplicate files that contain the same data, - /// the file will be named as the hash of the file data. + /// the file will be named `.`, e.g. `ce940175885d7b78f7b7e9f1396611f.jpg`. + /// The `original_name` param is only used to get the extension. /// /// This is done in a in way which avoids race-conditions when multiple files are /// concurrently created. - pub fn create_and_deduplicate(context: &'a Context, src: &Path) -> Result> { + pub fn create_and_deduplicate( + context: &'a Context, + src: &Path, + original_name: &str, + ) -> Result> { // `create_and_deduplicate{_from_bytes}()` do blocking I/O, but can still be called // from an async context thanks to `block_in_place()`. // Tokio's "async" I/O functions are also just thin wrappers around the blocking I/O syscalls, @@ -180,7 +185,25 @@ impl<'a> BlobObject<'a> { src_in_blobdir = &temp_path; } - let blob = BlobObject::from_hash(blobdir, file_hash(src_in_blobdir)?); + let hash = file_hash(src_in_blobdir)?.to_hex(); + let hash = hash.as_str(); + let hash = hash.get(0..31).unwrap_or(hash); + let new_file = if let Some(extension) = Path::new(original_name) + .extension() + .filter(|e| e.len() <= 32) + { + format!( + "$BLOBDIR/{hash}.{}", + extension.to_string_lossy().to_lowercase() + ) + } else { + format!("$BLOBDIR/{hash}") + }; + + let blob = BlobObject { + blobdir, + name: new_file, + }; let new_path = blob.to_abs_path(); // This will also replace an already-existing file. @@ -194,7 +217,8 @@ impl<'a> BlobObject<'a> { /// Creates a new blob object with the file contents in `data`. /// In order to deduplicate files that contain the same data, - /// the file will be renamed to a hash of the file data. + /// the file will be named `.`, e.g. `ce940175885d7b78f7b7e9f1396611f.jpg`. + /// The `original_name` param is only used to get the extension. /// /// The `data` will be written into the file without race-conditions. /// @@ -203,6 +227,7 @@ impl<'a> BlobObject<'a> { pub fn create_and_deduplicate_from_bytes( context: &'a Context, data: &[u8], + original_name: &str, ) -> Result> { task::block_in_place(|| { let blobdir = context.get_blobdir(); @@ -213,20 +238,10 @@ impl<'a> BlobObject<'a> { std::fs::write(&temp_path, data).context("writing new blobfile failed")?; }; - BlobObject::create_and_deduplicate(context, &temp_path) + BlobObject::create_and_deduplicate(context, &temp_path, original_name) }) } - fn from_hash(blobdir: &Path, hash: blake3::Hash) -> BlobObject<'_> { - let hash = hash.to_hex(); - let hash = hash.as_str(); - let hash = hash.get(0..31).unwrap_or(hash); - BlobObject { - blobdir, - name: format!("$BLOBDIR/{hash}"), - } - } - /// Creates a blob from a file, possibly copying it to the blobdir. /// /// If the source file is not a path to into the blob directory @@ -674,7 +689,7 @@ impl<'a> BlobObject<'a> { encode_img(&img, ofmt, &mut encoded)?; } - self.name = BlobObject::create_and_deduplicate_from_bytes(context, &encoded) + self.name = BlobObject::create_and_deduplicate_from_bytes(context, &encoded, &name) .context("failed to write recoded blob to file")? .name; } @@ -905,8 +920,11 @@ mod tests { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn test_lowercase_ext() { let t = TestContext::new().await; - let blob = BlobObject::create(&t, "foo.TXT", b"hello").await.unwrap(); - assert_eq!(blob.as_name(), "$BLOBDIR/foo.txt"); + let blob = BlobObject::create_and_deduplicate_from_bytes(&t, b"hello", "foo.TXT").unwrap(); + assert!( + blob.as_name().ends_with(".txt"), + "Blob {blob:?} should end with .txt" + ); } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] @@ -980,10 +998,10 @@ mod tests { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn test_create_long_names() { let t = TestContext::new().await; - let s = "1".repeat(150); - let blob = BlobObject::create(&t, &s, b"data").await.unwrap(); + let s = format!("file.{}", "a".repeat(100)); + let blob = BlobObject::create_and_deduplicate_from_bytes(&t, b"data", &s).unwrap(); let blobname = blob.as_name().split('/').last().unwrap(); - assert!(blobname.len() < 128); + assert!(blobname.len() < 70); } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] @@ -1618,28 +1636,28 @@ mod tests { let path = t.get_blobdir().join("anyfile.dat"); fs::write(&path, b"bla").await?; - let blob = BlobObject::create_and_deduplicate(&t, &path)?; - assert_eq!(blob.name, "$BLOBDIR/ce940175885d7b78f7b7e9f1396611f"); + let blob = BlobObject::create_and_deduplicate(&t, &path, "anyfile.dat")?; + assert_eq!(blob.name, "$BLOBDIR/ce940175885d7b78f7b7e9f1396611f.dat"); assert_eq!(path.exists(), false); assert_eq!(fs::read(&blob.to_abs_path()).await?, b"bla"); fs::write(&path, b"bla").await?; - let blob2 = BlobObject::create_and_deduplicate(&t, &path)?; + let blob2 = BlobObject::create_and_deduplicate(&t, &path, "anyfile.dat")?; assert_eq!(blob2.name, blob.name); let path_outside_blobdir = t.dir.path().join("anyfile.dat"); fs::write(&path_outside_blobdir, b"bla").await?; - let blob3 = BlobObject::create_and_deduplicate(&t, &path_outside_blobdir)?; + let blob3 = BlobObject::create_and_deduplicate(&t, &path_outside_blobdir, "anyfile.dat")?; assert!(path_outside_blobdir.exists()); assert_eq!(blob3.name, blob.name); fs::write(&path, b"blabla").await?; - let blob4 = BlobObject::create_and_deduplicate(&t, &path)?; + let blob4 = BlobObject::create_and_deduplicate(&t, &path, "anyfile.dat")?; assert_ne!(blob4.name, blob.name); fs::remove_dir_all(t.get_blobdir()).await?; - let blob5 = BlobObject::create_and_deduplicate(&t, &path_outside_blobdir)?; + let blob5 = BlobObject::create_and_deduplicate(&t, &path_outside_blobdir, "anyfile.dat")?; assert_eq!(blob5.name, blob.name); Ok(()) @@ -1650,7 +1668,7 @@ mod tests { let t = TestContext::new().await; fs::remove_dir(t.get_blobdir()).await?; - let blob = BlobObject::create_and_deduplicate_from_bytes(&t, b"bla")?; + let blob = BlobObject::create_and_deduplicate_from_bytes(&t, b"bla", "file")?; assert_eq!(blob.name, "$BLOBDIR/ce940175885d7b78f7b7e9f1396611f"); assert_eq!(fs::read(&blob.to_abs_path()).await?, b"bla"); @@ -1662,7 +1680,7 @@ mod tests { // which we can't mock from our code. tokio::time::sleep(Duration::from_millis(1100)).await; - let blob2 = BlobObject::create_and_deduplicate_from_bytes(&t, b"bla")?; + let blob2 = BlobObject::create_and_deduplicate_from_bytes(&t, b"bla", "file")?; assert_eq!(blob2.name, blob.name); let modified2 = blob.to_abs_path().metadata()?.modified()?; @@ -1675,7 +1693,7 @@ mod tests { sql::housekeeping(&t).await?; assert_eq!(blob2.to_abs_path().exists(), false); - let blob3 = BlobObject::create_and_deduplicate_from_bytes(&t, b"blabla")?; + let blob3 = BlobObject::create_and_deduplicate_from_bytes(&t, b"blabla", "file")?; assert_ne!(blob3.name, blob.name); { @@ -1683,7 +1701,7 @@ mod tests { // the correct content should be restored: fs::write(blob3.to_abs_path(), b"bloblo").await?; - let blob4 = BlobObject::create_and_deduplicate_from_bytes(&t, b"blabla")?; + let blob4 = BlobObject::create_and_deduplicate_from_bytes(&t, b"blabla", "file")?; let blob4_content = fs::read(blob4.to_abs_path()).await?; assert_eq!(blob4_content, b"blabla"); } diff --git a/src/chat/chat_tests.rs b/src/chat/chat_tests.rs index fba4be118..ce64df022 100644 --- a/src/chat/chat_tests.rs +++ b/src/chat/chat_tests.rs @@ -2832,7 +2832,7 @@ async fn test_blob_renaming() -> Result<()> { // the file bob receives should not contain BIDI-control characters assert_eq!( - Some("$BLOBDIR/30c0f9c6a167fc2a91285c85be7ea34"), + Some("$BLOBDIR/30c0f9c6a167fc2a91285c85be7ea34.exe"), msg.param.get(Param::File), ); assert_eq!( diff --git a/src/imex/transfer.rs b/src/imex/transfer.rs index 0fe28893b..0496ea151 100644 --- a/src/imex/transfer.rs +++ b/src/imex/transfer.rs @@ -431,7 +431,7 @@ mod tests { let path = msg.get_file(&ctx1).unwrap(); assert_eq!( // That's the hash of the file: - path.with_file_name("ac1d2d284757656a8d41dc40aae4136"), + path.with_file_name("ac1d2d284757656a8d41dc40aae4136.txt"), path ); assert_eq!("hello.txt", msg.get_filename().unwrap()); diff --git a/src/message.rs b/src/message.rs index 0781c8f26..b45177a40 100644 --- a/src/message.rs +++ b/src/message.rs @@ -1,7 +1,6 @@ //! # Messages and their identifiers. use std::collections::BTreeSet; -use std::ffi::OsStr; use std::path::{Path, PathBuf}; use std::str; @@ -1095,7 +1094,7 @@ impl Message { /// otherwise it will be copied to the blobdir first. /// /// In order to deduplicate files that contain the same data, - /// the file will be named as a hash of the file data. + /// the file will be named `.`, e.g. `ce940175885d7b78f7b7e9f1396611f.jpg`. /// /// NOTE: /// - This function will rename the file. To get the new file path, call `get_file()`. @@ -1107,14 +1106,18 @@ impl Message { name: Option<&str>, filemime: Option<&str>, ) -> Result<()> { - let blob = BlobObject::create_and_deduplicate(context, file)?; - if let Some(name) = name { - self.param.set(Param::Filename, name); + let name = if let Some(name) = name { + name.to_string() } else { - let file_name = file.file_name().map(OsStr::to_string_lossy); - self.param.set_optional(Param::Filename, file_name); - } + file.file_name() + .map(|s| s.to_string_lossy().to_string()) + .unwrap_or_else(|| "unknown_file".to_string()) + }; + + let blob = BlobObject::create_and_deduplicate(context, file, &name)?; self.param.set(Param::File, blob.as_name()); + + self.param.set(Param::Filename, name); self.param.set_optional(Param::MimeType, filemime); Ok(()) @@ -1123,7 +1126,7 @@ impl Message { /// Creates a new blob and sets it as a file associated with a message. /// /// In order to deduplicate files that contain the same data, - /// the filename will be a hash of the file data. + /// the file will be named `.`, e.g. `ce940175885d7b78f7b7e9f1396611f.jpg`. /// /// NOTE: The file must not be modified after this function was called. pub fn set_file_from_bytes( @@ -1133,7 +1136,7 @@ impl Message { data: &[u8], filemime: Option<&str>, ) -> Result<()> { - let blob = BlobObject::create_and_deduplicate_from_bytes(context, data)?; + let blob = BlobObject::create_and_deduplicate_from_bytes(context, data, name)?; self.param.set(Param::Filename, name); self.param.set(Param::File, blob.as_name()); self.param.set_optional(Param::MimeType, filemime); diff --git a/src/mimeparser.rs b/src/mimeparser.rs index 1f8ad2b23..821620369 100644 --- a/src/mimeparser.rs +++ b/src/mimeparser.rs @@ -1384,16 +1384,17 @@ impl MimeMessage { /* we have a regular file attachment, write decoded data to new blob object */ - let blob = match BlobObject::create_and_deduplicate_from_bytes(context, decoded_data) { - Ok(blob) => blob, - Err(err) => { - error!( - context, - "Could not add blob for mime part {}, error {:#}", filename, err - ); - return Ok(()); - } - }; + let blob = + match BlobObject::create_and_deduplicate_from_bytes(context, decoded_data, filename) { + Ok(blob) => blob, + Err(err) => { + error!( + context, + "Could not add blob for mime part {}, error {:#}", filename, err + ); + return Ok(()); + } + }; info!(context, "added blobfile: {:?}", blob.as_name()); if mime_type.type_() == mime::IMAGE { diff --git a/src/receive_imf/receive_imf_tests.rs b/src/receive_imf/receive_imf_tests.rs index 4f4e479e8..de14d07a7 100644 --- a/src/receive_imf/receive_imf_tests.rs +++ b/src/receive_imf/receive_imf_tests.rs @@ -1667,7 +1667,7 @@ async fn test_pdf_filename_simple() { assert_eq!( file_path, // That's the blake3 hash of the file content: - "$BLOBDIR/24a6af459cec5d733374aeaa19a6133" + "$BLOBDIR/24a6af459cec5d733374aeaa19a6133.pdf" ); assert_eq!(msg.param.get(Param::Filename).unwrap(), "simple.pdf"); } @@ -3268,7 +3268,7 @@ async fn test_weird_and_duplicated_filenames() -> Result<()> { msg.save_file(t, &path2).await.unwrap(); assert_eq!( path.file_name().unwrap().to_str().unwrap(), - "79402cb76f44c5761888f9036992a76", + "79402cb76f44c5761888f9036992a76.gz", "The hash of the content should always be the same" ); assert_eq!(fs::read_to_string(&path).await.unwrap(), content);