mirror of
https://github.com/chatmail/core.git
synced 2026-04-19 06:26:30 +03:00
File deduplication (#6332)
When receiving messages, blobs will be deduplicated with the new function `create_and_deduplicate_from_bytes()`. For sending files, this adds a new function `set_file_and_deduplicate()` instead of deduplicating by default. This is for https://github.com/deltachat/deltachat-core-rust/issues/6265; read the issue description there for more details. TODO: - [x] Set files as read-only - [x] Don't do a write when the file is already identical - [x] The first 32 chars or so of the 64-character hash are enough. I calculated that if 10 billion people (i.e. all of humanity) use DC, and each of them has 200k distinct blob files (I have 4k in my day-to-day account), and we used 20 chars, then the expected value for the number of name collisions would be ~0.0002 (and the probability that there is at least one name collision is lower than that) [^1]. I added 12 more characters to be on the super safe side, but this wouldn't be necessary and I could also make it 20 instead of 32. - Not 100% sure whether that's necessary at all - it would mainly be necessary if we might hit a length limit on some file systems (the blobdir is usually something like `accounts/2ff9fc096d2f46b6832b24a1ed99c0d6/dc.db-blobs` (53 chars), plus 64 chars for the filename would be 117). - [x] "touch" the files to prevent them from being deleted - [x] TODOs in the code For later PRs: - Replace `BlobObject::create(…)` with `BlobObject::create_and_deduplicate(…)` in order to deduplicate every time core creates a file - Modify JsonRPC to deduplicate blob files - Possibly rename BlobObject.name to BlobObject.file in order to prevent confusion (because `name` usually means "user-visible-name", not "name of the file on disk").
[^1]: Calculated with both https://printfn.github.io/fend/ and https://www.geogebra.org/calculator, both of which came to the same result ([1](https://github.com/user-attachments/assets/bbb62550-3781-48b5-88b1-ba0e29c28c0d), [2](https://github.com/user-attachments/assets/82171212-b797-4117-a39f-0e132eac7252)) --------- Co-authored-by: l <link2xt@testrun.org>
This commit is contained in:
@@ -1373,8 +1373,8 @@ async fn test_x_microsoft_original_message_id() {
|
||||
\n\
|
||||
Does it work with outlook now?\n\
|
||||
", None)
|
||||
.await
|
||||
.unwrap();
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
message.get_rfc724_mid(),
|
||||
Some("Mr.6Dx7ITn4w38.n9j7epIcuQI@outlook.com".to_string())
|
||||
@@ -1505,8 +1505,8 @@ async fn test_ignore_read_receipt_to_self() -> Result<()> {
|
||||
// Due to a bug in the old version running on the other device, Alice receives a read
|
||||
// receipt from self.
|
||||
receive_imf(
|
||||
&alice,
|
||||
"Received: (Postfix, from userid 1000); Mon, 4 Dec 2006 14:51:39 +0100 (CET)\n\
|
||||
&alice,
|
||||
"Received: (Postfix, from userid 1000); Mon, 4 Dec 2006 14:51:39 +0100 (CET)\n\
|
||||
From: alice@example.org\n\
|
||||
To: alice@example.org\n\
|
||||
Subject: message opened\n\
|
||||
@@ -1532,10 +1532,10 @@ async fn test_ignore_read_receipt_to_self() -> Result<()> {
|
||||
\n\
|
||||
\n\
|
||||
--SNIPP--"
|
||||
.as_bytes(),
|
||||
false,
|
||||
)
|
||||
.await?;
|
||||
.as_bytes(),
|
||||
false,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Check that the state has not changed to `MessageState::OutMdnRcvd`.
|
||||
let msg = Message::load_from_db(&alice, msg.id).await?;
|
||||
@@ -1601,8 +1601,8 @@ async fn test_receive_eml() -> Result<()> {
|
||||
"this is a classic email – I attached the .EML file".to_string()
|
||||
);
|
||||
assert_eq!(
|
||||
mime_message.parts[0].param.get(Param::File),
|
||||
Some("$BLOBDIR/.eml")
|
||||
mime_message.parts[0].param.get(Param::Filename),
|
||||
Some(".eml")
|
||||
);
|
||||
|
||||
assert_eq!(mime_message.parts[0].org_filename, Some(".eml".to_string()));
|
||||
|
||||
Reference in New Issue
Block a user