mirror of
https://github.com/chatmail/core.git
synced 2026-04-21 15:36:30 +03:00
File deduplication (#6332)
When receiving messages, blobs will be deduplicated with the new function `create_and_deduplicate_from_bytes()`. For sending files, this adds a new function `set_file_and_deduplicate()` instead of deduplicating by default. This is for https://github.com/deltachat/deltachat-core-rust/issues/6265; read the issue description there for more details. TODO: - [x] Set files as read-only - [x] Don't do a write when the file is already identical - [x] The first 32 chars or so of the 64-character hash are enough. I calculated that if 10b people (i.e. all of humanity) use DC, and each of them has 200k distinct blob files (I have 4k in my day-to-day account), and we used 20 chars, then the expected value for the number of name collisions would be ~0.0002 (and the probability that there is a least one name collision is lower than that) [^1]. I added 12 more characters to be on the super safe side, but this wouldn't be necessary and I could also make it 20 instead of 32. - Not 100% sure whether that's necessary at all - it would mainly be necessary if we might hit a length limit on some file systems (the blobdir is usually sth like `accounts/2ff9fc096d2f46b6832b24a1ed99c0d6/dc.db-blobs` (53 chars), plus 64 chars for the filename would be 117). - [x] "touch" the files to prevent them from being deleted - [x] TODOs in the code For later PRs: - Replace `BlobObject::create(…)` with `BlobObject::create_and_deduplicate(…)` in order to deduplicate everytime core creates a file - Modify JsonRPC to deduplicate blob files - Possibly rename BlobObject.name to BlobObject.file in order to prevent confusion (because `name` usually means "user-visible-name", not "name of the file on disk"). [^1]: Calculated with both https://printfn.github.io/fend/ and https://www.geogebra.org/calculator, both of which came to the same result ([1](https://github.com/user-attachments/assets/bbb62550-3781-48b5-88b1-ba0e29c28c0d), [2](https://github.com/user-attachments/assets/82171212-b797-4117-a39f-0e132eac7252)) --------- Co-authored-by: l <link2xt@testrun.org>
This commit is contained in:
@@ -1664,8 +1664,12 @@ async fn test_pdf_filename_simple() {
|
||||
assert_eq!(msg.viewtype, Viewtype::File);
|
||||
assert_eq!(msg.text, "mail body");
|
||||
let file_path = msg.param.get(Param::File).unwrap();
|
||||
assert!(file_path.starts_with("$BLOBDIR/simple"));
|
||||
assert!(file_path.ends_with(".pdf"));
|
||||
assert_eq!(
|
||||
file_path,
|
||||
// That's the blake3 hash of the file content:
|
||||
"$BLOBDIR/24a6af459cec5d733374aeaa19a6133"
|
||||
);
|
||||
assert_eq!(msg.param.get(Param::Filename).unwrap(), "simple.pdf");
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
|
||||
@@ -1680,8 +1684,8 @@ async fn test_pdf_filename_continuation() {
|
||||
assert_eq!(msg.viewtype, Viewtype::File);
|
||||
assert_eq!(msg.text, "mail body");
|
||||
let file_path = msg.param.get(Param::File).unwrap();
|
||||
assert!(file_path.starts_with("$BLOBDIR/test pdf äöüß"));
|
||||
assert!(file_path.ends_with(".pdf"));
|
||||
assert!(file_path.starts_with("$BLOBDIR/"));
|
||||
assert_eq!(msg.get_filename().unwrap(), "test pdf äöüß.pdf");
|
||||
}
|
||||
|
||||
/// HTML-images may come with many embedded images, eg. tiny icons, corners for formatting,
|
||||
@@ -3244,11 +3248,11 @@ async fn test_weird_and_duplicated_filenames() -> Result<()> {
|
||||
"a. tar.tar.gz",
|
||||
] {
|
||||
let attachment = alice.blobdir.join(filename_sent);
|
||||
let content = format!("File content of {filename_sent}");
|
||||
let content = "File content of tar.gz archive".to_string();
|
||||
tokio::fs::write(&attachment, content.as_bytes()).await?;
|
||||
|
||||
let mut msg_alice = Message::new(Viewtype::File);
|
||||
msg_alice.set_file(attachment.to_str().unwrap(), None);
|
||||
msg_alice.set_file_and_deduplicate(&alice, &attachment, None, None)?;
|
||||
let alice_chat = alice.create_chat(&bob).await;
|
||||
let sent = alice.send_msg(alice_chat.id, &mut msg_alice).await;
|
||||
println!("{}", sent.payload());
|
||||
@@ -3262,9 +3266,10 @@ async fn test_weird_and_duplicated_filenames() -> Result<()> {
|
||||
let path = msg.get_file(t).unwrap();
|
||||
let path2 = path.with_file_name("saved.txt");
|
||||
msg.save_file(t, &path2).await.unwrap();
|
||||
assert!(
|
||||
path.to_str().unwrap().ends_with(".tar.gz"),
|
||||
"path {path:?} doesn't end with .tar.gz"
|
||||
assert_eq!(
|
||||
path.file_name().unwrap().to_str().unwrap(),
|
||||
"79402cb76f44c5761888f9036992a76",
|
||||
"The hash of the content should always be the same"
|
||||
);
|
||||
assert_eq!(fs::read_to_string(&path).await.unwrap(), content);
|
||||
assert_eq!(fs::read_to_string(&path2).await.unwrap(), content);
|
||||
@@ -4700,8 +4705,7 @@ async fn test_create_group_with_big_msg() -> Result<()> {
|
||||
let bob_grp_id = create_group_chat(&bob, ProtectionStatus::Unprotected, "Group").await?;
|
||||
add_contact_to_chat(&bob, bob_grp_id, ba_contact).await?;
|
||||
let mut msg = Message::new(Viewtype::Image);
|
||||
msg.set_file_from_bytes(&bob, "a.jpg", file_bytes, None)
|
||||
.await?;
|
||||
msg.set_file_from_bytes(&bob, "a.jpg", file_bytes, None)?;
|
||||
let sent_msg = bob.send_msg(bob_grp_id, &mut msg).await;
|
||||
assert!(!msg.get_showpadlock());
|
||||
|
||||
@@ -4737,8 +4741,7 @@ async fn test_create_group_with_big_msg() -> Result<()> {
|
||||
let bob_grp_id = create_group_chat(&bob, ProtectionStatus::Unprotected, "Group1").await?;
|
||||
add_contact_to_chat(&bob, bob_grp_id, ba_contact).await?;
|
||||
let mut msg = Message::new(Viewtype::Image);
|
||||
msg.set_file_from_bytes(&bob, "a.jpg", file_bytes, None)
|
||||
.await?;
|
||||
msg.set_file_from_bytes(&bob, "a.jpg", file_bytes, None)?;
|
||||
let sent_msg = bob.send_msg(bob_grp_id, &mut msg).await;
|
||||
assert!(msg.get_showpadlock());
|
||||
|
||||
@@ -5179,8 +5182,7 @@ async fn test_prefer_references_to_downloaded_msgs() -> Result<()> {
|
||||
|
||||
let file_bytes = include_bytes!("../../test-data/image/screenshot.gif");
|
||||
let mut msg = Message::new(Viewtype::File);
|
||||
msg.set_file_from_bytes(alice, "file", file_bytes, None)
|
||||
.await?;
|
||||
msg.set_file_from_bytes(alice, "file", file_bytes, None)?;
|
||||
let mut sent = alice.send_msg(alice_chat_id, &mut msg).await;
|
||||
sent.payload = sent
|
||||
.payload
|
||||
@@ -5192,8 +5194,7 @@ async fn test_prefer_references_to_downloaded_msgs() -> Result<()> {
|
||||
assert_eq!(received.chat_id, bob.get_chat(alice).await.id);
|
||||
|
||||
let mut msg = Message::new(Viewtype::File);
|
||||
msg.set_file_from_bytes(alice, "file", file_bytes, None)
|
||||
.await?;
|
||||
msg.set_file_from_bytes(alice, "file", file_bytes, None)?;
|
||||
let sent = alice.send_msg(alice_chat_id, &mut msg).await;
|
||||
let received = bob.recv_msg(&sent).await;
|
||||
assert_eq!(received.download_state, DownloadState::Available);
|
||||
@@ -5252,7 +5253,6 @@ async fn test_receive_vcard() -> Result<()> {
|
||||
.as_bytes(),
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let alice_bob_chat = alice.create_chat(bob).await;
|
||||
|
||||
Reference in New Issue
Block a user