File deduplication (#6332)

When receiving messages, blobs will be deduplicated with the new function `create_and_deduplicate_from_bytes()`. For sending files, this adds a new function `set_file_and_deduplicate()` instead of deduplicating by default. This is for https://github.com/deltachat/deltachat-core-rust/issues/6265; read the issue description there for more details. TODO: - [x] Set files as read-only - [x] Don't do a write when the file is already identical - [x] The first 32 chars or so of the 64-character hash are enough. I calculated that if 10 billion people (i.e. all of humanity) use DC, and each of them has 200k distinct blob files (I have 4k in my day-to-day account), and we used 20 chars, then the expected value for the number of name collisions would be ~0.0002 (and the probability that there is at least one name collision is lower than that) [^1]. I added 12 more characters to be on the super safe side, but this wouldn't be necessary and I could also make it 20 instead of 32. - Not 100% sure whether that's necessary at all - it would mainly be necessary if we might hit a length limit on some file systems (the blobdir is usually something like `accounts/2ff9fc096d2f46b6832b24a1ed99c0d6/dc.db-blobs` (53 chars), plus 64 chars for the filename would be 117). - [x] "touch" the files to prevent them from being deleted - [x] TODOs in the code For later PRs: - Replace `BlobObject::create(…)` with `BlobObject::create_and_deduplicate(…)` in order to deduplicate every time core creates a file - Modify JsonRPC to deduplicate blob files - Possibly rename BlobObject.name to BlobObject.file in order to prevent confusion (because `name` usually means "user-visible-name", not "name of the file on disk").
[^1]: Calculated with both https://printfn.github.io/fend/ and https://www.geogebra.org/calculator, both of which came to the same result ([1](https://github.com/user-attachments/assets/bbb62550-3781-48b5-88b1-ba0e29c28c0d), [2](https://github.com/user-attachments/assets/82171212-b797-4117-a39f-0e132eac7252)) --------- Co-authored-by: l <link2xt@testrun.org>
2026-04-26 09:56:35 +03:00 · 2025-01-21 19:42:19 +01:00
parent 22a7cfe9c3
commit 65a9c4b79b
23 changed files with 583 additions and 240 deletions
--- a/src/mimefactory.rs
+++ b/src/mimefactory.rs
@@ -1,6 +1,7 @@
 //! # MIME message production.

 use std::collections::HashSet;
+use std::path::Path;

 use anyhow::{bail, Context as _, Result};
 use base64::Engine as _;
@@ -1605,12 +1606,17 @@ pub(crate) fn wrapped_base64_encode(buf: &[u8]) -> String {
 }

 async fn build_body_file(context: &Context, msg: &Message) -> Result<PartBuilder> {
+    let file_name = msg.get_filename().context("msg has no file")?;
+    let suffix = Path::new(&file_name)
+        .extension()
+        .and_then(|e| e.to_str())
+        .unwrap_or("dat");
+
    let blob = msg
        .param
        .get_blob(Param::File, context)
        .await?
        .context("msg has no file")?;
-    let suffix = blob.suffix().unwrap_or("dat");

    // Get file name to use for sending.  For privacy purposes, we do
    // not transfer the original filenames eg. for images; these names
@@ -1650,18 +1656,14 @@ async fn build_body_file(context: &Context, msg: &Message) -> Result<PartBuilder
                ),
            &suffix
        ),
-        _ => msg
-            .param
-            .get(Param::Filename)
-            .unwrap_or_else(|| blob.as_file_name())
-            .to_string(),
+        _ => file_name,
    };

    /* check mimetype */
    let mimetype: mime::Mime = match msg.param.get(Param::MimeType) {
        Some(mtype) => mtype.parse()?,
        None => {
-            if let Some(res) = message::guess_msgtype_from_suffix(blob.as_rel_path()) {
+            if let Some(res) = message::guess_msgtype_from_suffix(msg) {
                res.1.parse()?
            } else {
                mime::APPLICATION_OCTET_STREAM
@@ -2624,8 +2626,7 @@ mod tests {
        // Long messages are truncated and MimeMessage::decoded_data is set for them. We need
        // decoded_data to check presence of the necessary headers.
        msg.set_text("a".repeat(constants::DC_DESIRED_TEXT_LEN + 1));
-        msg.set_file_from_bytes(&bob, "foo.bar", "content".as_bytes(), None)
-            .await?;
+        msg.set_file_from_bytes(&bob, "foo.bar", "content".as_bytes(), None)?;
        let sent = bob.send_msg(chat, &mut msg).await;
        assert!(msg.get_showpadlock());
        assert!(sent.payload.contains("\r\nSubject: [...]\r\n"));