File deduplication (#6332)

When receiving messages, blobs will be deduplicated with the new
function `create_and_deduplicate_from_bytes()`. For sending files, this
adds a new function `set_file_and_deduplicate()` instead of
deduplicating by default.

This is for
https://github.com/deltachat/deltachat-core-rust/issues/6265; read the
issue description there for more details.

TODO:
- [x] Set files as read-only
- [x] Don't do a write when the file is already identical
- [x] The first 32 chars or so of the 64-character hash are enough. I
calculated that if 10b people (i.e. all of humanity) use DC, and each of
them has 200k distinct blob files (I have 4k in my day-to-day account),
and we used 20 chars, then the expected value for the number of name
collisions would be ~0.0002 (and the probability that there is a least
one name collision is lower than that) [^1]. I added 12 more characters
to be on the super safe side, but this wouldn't be necessary and I could
also make it 20 instead of 32.
- Not 100% sure whether that's necessary at all - it would mainly be
necessary if we might hit a length limit on some file systems (the
blobdir is usually sth like
`accounts/2ff9fc096d2f46b6832b24a1ed99c0d6/dc.db-blobs` (53 chars), plus
64 chars for the filename would be 117).
- [x] "touch" the files to prevent them from being deleted
- [x] TODOs in the code

For later PRs:
- Replace `BlobObject::create(…)` with
`BlobObject::create_and_deduplicate(…)` in order to deduplicate
everytime core creates a file
- Modify JsonRPC to deduplicate blob files
- Possibly rename BlobObject.name to BlobObject.file in order to prevent
confusion (because `name` usually means "user-visible-name", not "name
of the file on disk").

[^1]: Calculated with both https://printfn.github.io/fend/ and
https://www.geogebra.org/calculator, both of which came to the same
result
([1](https://github.com/user-attachments/assets/bbb62550-3781-48b5-88b1-ba0e29c28c0d),

[2](https://github.com/user-attachments/assets/82171212-b797-4117-a39f-0e132eac7252))

---------

Co-authored-by: l <link2xt@testrun.org>
This commit is contained in:
Hocuri
2025-01-21 19:42:19 +01:00
committed by GitHub
parent 22a7cfe9c3
commit 65a9c4b79b
23 changed files with 583 additions and 240 deletions

View File

@@ -1825,7 +1825,7 @@ async fn test_sticker(
tokio::fs::write(&file, bytes).await?;
let mut msg = Message::new(Viewtype::Sticker);
msg.set_file(file.to_str().unwrap(), None);
msg.set_file_and_deduplicate(&alice, &file, Some(filename), None)?;
let sent_msg = alice.send_msg(alice_chat.id, &mut msg).await;
let mime = sent_msg.payload();
@@ -1889,14 +1889,17 @@ async fn test_sticker_jpeg_force() {
// Images without force_sticker should be turned into [Viewtype::Image]
let mut msg = Message::new(Viewtype::Sticker);
msg.set_file(file.to_str().unwrap(), None);
msg.set_file_and_deduplicate(&alice, &file, Some("sticker.jpg"), None)
.unwrap();
let file = msg.get_file(&alice).unwrap();
let sent_msg = alice.send_msg(alice_chat.id, &mut msg).await;
let msg = bob.recv_msg(&sent_msg).await;
assert_eq!(msg.get_viewtype(), Viewtype::Image);
// Images with `force_sticker = true` should keep [Viewtype::Sticker]
let mut msg = Message::new(Viewtype::Sticker);
msg.set_file(file.to_str().unwrap(), None);
msg.set_file_and_deduplicate(&alice, &file, Some("sticker.jpg"), None)
.unwrap();
msg.force_sticker();
let sent_msg = alice.send_msg(alice_chat.id, &mut msg).await;
let msg = bob.recv_msg(&sent_msg).await;
@@ -1905,7 +1908,8 @@ async fn test_sticker_jpeg_force() {
// Images with `force_sticker = true` should keep [Viewtype::Sticker]
// even on drafted messages
let mut msg = Message::new(Viewtype::Sticker);
msg.set_file(file.to_str().unwrap(), None);
msg.set_file_and_deduplicate(&alice, &file, Some("sticker.jpg"), None)
.unwrap();
msg.force_sticker();
alice_chat
.id
@@ -1944,7 +1948,7 @@ async fn test_sticker_forward() -> Result<()> {
let file = alice.get_blobdir().join(file_name);
tokio::fs::write(&file, bytes).await?;
let mut msg = Message::new(Viewtype::Sticker);
msg.set_file(file.to_str().unwrap(), None);
msg.set_file_and_deduplicate(&alice, &file, Some("sticker.jpg"), None)?;
// send sticker to bob
let sent_msg = alice.send_msg(alice_chat.get_id(), &mut msg).await;
@@ -2669,7 +2673,7 @@ async fn test_get_chat_media() -> Result<()> {
let file = t.get_blobdir().join(name);
tokio::fs::write(&file, bytes).await?;
let mut msg = Message::new(msg_type);
msg.set_file(file.to_str().unwrap(), None);
msg.set_file_and_deduplicate(t, &file, Some(name), None)?;
send_msg(t, chat_id, &mut msg).await
}
@@ -2820,18 +2824,21 @@ async fn test_blob_renaming() -> Result<()> {
Contact::create(&alice, "bob", "bob@example.net").await?,
)
.await?;
let dir = tempfile::tempdir()?;
let file = dir.path().join("harmless_file.\u{202e}txt.exe");
let file = alice.get_blobdir().join("harmless_file.\u{202e}txt.exe");
fs::write(&file, "aaa").await?;
let mut msg = Message::new(Viewtype::File);
msg.set_file(file.to_str().unwrap(), None);
msg.set_file_and_deduplicate(&alice, &file, Some("harmless_file.\u{202e}txt.exe"), None)?;
let msg = bob.recv_msg(&alice.send_msg(chat_id, &mut msg).await).await;
// the file bob receives should not contain BIDI-control characters
assert_eq!(
Some("$BLOBDIR/harmless_file.txt.exe"),
Some("$BLOBDIR/30c0f9c6a167fc2a91285c85be7ea34"),
msg.param.get(Param::File),
);
assert_eq!(
Some("harmless_file.txt.exe"),
msg.param.get(Param::Filename),
);
Ok(())
}
@@ -3156,7 +3163,7 @@ async fn test_jpeg_with_png_ext() -> Result<()> {
let file = alice.get_blobdir().join("screenshot.png");
tokio::fs::write(&file, bytes).await?;
let mut msg = Message::new(Viewtype::Image);
msg.set_file(file.to_str().unwrap(), None);
msg.set_file_and_deduplicate(&alice, &file, Some("screenshot.png"), None)?;
let alice_chat = alice.create_chat(&bob).await;
let sent_msg = alice.send_msg(alice_chat.get_id(), &mut msg).await;