File deduplication (#6332)

When receiving messages, blobs will be deduplicated with the new
function `create_and_deduplicate_from_bytes()`. For sending files, this
adds a new function `set_file_and_deduplicate()` instead of
deduplicating by default.

This is for
https://github.com/deltachat/deltachat-core-rust/issues/6265; read the
issue description there for more details.

TODO:
- [x] Set files as read-only
- [x] Don't do a write when the file is already identical
- [x] The first 32 chars or so of the 64-character hash are enough. I
calculated that if 10b people (i.e. all of humanity) use DC, and each of
them has 200k distinct blob files (I have 4k in my day-to-day account),
and we used 20 chars, then the expected value for the number of name
collisions would be ~0.0002 (and the probability that there is a least
one name collision is lower than that) [^1]. I added 12 more characters
to be on the super safe side, but this wouldn't be necessary and I could
also make it 20 instead of 32.
- Not 100% sure whether that's necessary at all - it would mainly be
necessary if we might hit a length limit on some file systems (the
blobdir is usually sth like
`accounts/2ff9fc096d2f46b6832b24a1ed99c0d6/dc.db-blobs` (53 chars), plus
64 chars for the filename would be 117).
- [x] "touch" the files to prevent them from being deleted
- [x] TODOs in the code

For later PRs:
- Replace `BlobObject::create(…)` with
`BlobObject::create_and_deduplicate(…)` in order to deduplicate
everytime core creates a file
- Modify JsonRPC to deduplicate blob files
- Possibly rename BlobObject.name to BlobObject.file in order to prevent
confusion (because `name` usually means "user-visible-name", not "name
of the file on disk").

[^1]: Calculated with both https://printfn.github.io/fend/ and
https://www.geogebra.org/calculator, both of which came to the same
result
([1](https://github.com/user-attachments/assets/bbb62550-3781-48b5-88b1-ba0e29c28c0d),

[2](https://github.com/user-attachments/assets/82171212-b797-4117-a39f-0e132eac7252))

---------

Co-authored-by: l <link2xt@testrun.org>
This commit is contained in:
Hocuri
2025-01-21 19:42:19 +01:00
committed by GitHub
parent 22a7cfe9c3
commit 65a9c4b79b
23 changed files with 583 additions and 240 deletions

View File

@@ -261,9 +261,6 @@ impl Context {
/// Ensure that a file is an acceptable webxdc for sending.
pub(crate) async fn ensure_sendable_webxdc_file(&self, path: &Path) -> Result<()> {
let filename = path.to_str().unwrap_or_default();
if !filename.ends_with(WEBXDC_SUFFIX) {
bail!("{} is not a valid webxdc file", filename);
}
let valid = match FsZipFileReader::new(path).await {
Ok(archive) => {
@@ -1047,9 +1044,9 @@ mod tests {
Ok(())
}
async fn create_webxdc_instance(t: &TestContext, name: &str, bytes: &[u8]) -> Result<Message> {
fn create_webxdc_instance(t: &TestContext, name: &str, bytes: &[u8]) -> Result<Message> {
let mut instance = Message::new(Viewtype::File);
instance.set_file_from_bytes(t, name, bytes, None).await?;
instance.set_file_from_bytes(t, name, bytes, None)?;
Ok(instance)
}
@@ -1058,8 +1055,7 @@ mod tests {
t,
"minimal.xdc",
include_bytes!("../test-data/webxdc/minimal.xdc"),
)
.await?;
)?;
let instance_msg_id = send_msg(t, chat_id, &mut instance).await?;
assert_eq!(instance.viewtype, Viewtype::Webxdc);
Message::load_from_db(t, instance_msg_id).await
@@ -1078,9 +1074,7 @@ mod tests {
// sending using bad extension is not working, even when setting Viewtype to webxdc
let mut instance = Message::new(Viewtype::Webxdc);
instance
.set_file_from_bytes(&t, "index.html", b"<html>ola!</html>", None)
.await?;
instance.set_file_from_bytes(&t, "index.html", b"<html>ola!</html>", None)?;
assert!(send_msg(&t, chat_id, &mut instance).await.is_err());
Ok(())
@@ -1096,8 +1090,7 @@ mod tests {
&t,
"invalid-no-zip-but-7z.xdc",
include_bytes!("../test-data/webxdc/invalid-no-zip-but-7z.xdc"),
)
.await?;
)?;
let instance_id = send_msg(&t, chat_id, &mut instance).await?;
assert_eq!(instance.viewtype, Viewtype::File);
let test = Message::load_from_db(&t, instance_id).await?;
@@ -1105,14 +1098,12 @@ mod tests {
// sending invalid .xdc as Viewtype::Webxdc should fail already on sending
let mut instance = Message::new(Viewtype::Webxdc);
instance
.set_file_from_bytes(
&t,
"invalid2.xdc",
include_bytes!("../test-data/webxdc/invalid-no-zip-but-7z.xdc"),
None,
)
.await?;
instance.set_file_from_bytes(
&t,
"invalid2.xdc",
include_bytes!("../test-data/webxdc/invalid-no-zip-but-7z.xdc"),
None,
)?;
assert!(send_msg(&t, chat_id, &mut instance).await.is_err());
Ok(())
@@ -1128,8 +1119,7 @@ mod tests {
&t,
"chess.xdc",
include_bytes!("../test-data/webxdc/chess.xdc"),
)
.await?;
)?;
let instance_id = send_msg(&t, chat_id, &mut instance).await?;
let instance = Message::load_from_db(&t, instance_id).await?;
assert_eq!(instance.viewtype, Viewtype::Webxdc);
@@ -1315,8 +1305,7 @@ mod tests {
&alice,
"chess.xdc",
include_bytes!("../test-data/webxdc/chess.xdc"),
)
.await?;
)?;
let sent1 = alice.send_msg(chat.id, &mut alice_instance).await;
let alice_instance = sent1.load_from_db().await;
alice
@@ -1445,8 +1434,7 @@ mod tests {
&t,
"minimal.xdc",
include_bytes!("../test-data/webxdc/minimal.xdc"),
)
.await?;
)?;
chat_id.set_draft(&t, Some(&mut instance)).await?;
let instance = chat_id.get_draft(&t).await?.unwrap();
t.send_webxdc_status_update(instance.id, r#"{"payload": 42}"#)
@@ -1882,8 +1870,7 @@ mod tests {
&t,
"minimal.xdc",
include_bytes!("../test-data/webxdc/minimal.xdc"),
)
.await?;
)?;
chat_id.set_draft(&t, Some(&mut instance)).await?;
let (first, last) = (StatusUpdateSerial(1), StatusUpdateSerial::MAX);
assert_eq!(
@@ -2028,8 +2015,7 @@ mod tests {
&alice,
"minimal.xdc",
include_bytes!("../test-data/webxdc/minimal.xdc"),
)
.await?;
)?;
alice_chat_id
.set_draft(&alice, Some(&mut alice_instance))
.await?;
@@ -2143,8 +2129,7 @@ mod tests {
&t,
"some-files.xdc",
include_bytes!("../test-data/webxdc/some-files.xdc"),
)
.await?;
)?;
chat_id.set_draft(&t, Some(&mut instance)).await?;
let buf = instance.get_webxdc_blob(&t, "index.html").await?;
@@ -2243,8 +2228,7 @@ sth_for_the = "future""#
&t,
"with-min-api-1001.xdc",
include_bytes!("../test-data/webxdc/with-min-api-1001.xdc"),
)
.await?;
)?;
send_msg(&t, chat_id, &mut instance).await?;
let instance = t.get_last_msg().await;
@@ -2270,8 +2254,7 @@ sth_for_the = "future""#
&t,
"with-manifest-empty-name.xdc",
include_bytes!("../test-data/webxdc/with-manifest-empty-name.xdc"),
)
.await?;
)?;
chat_id.set_draft(&t, Some(&mut instance)).await?;
let info = instance.get_webxdc_info(&t).await?;
assert_eq!(info.name, "with-manifest-empty-name.xdc");
@@ -2281,8 +2264,7 @@ sth_for_the = "future""#
&t,
"with-manifest-no-name.xdc",
include_bytes!("../test-data/webxdc/with-manifest-no-name.xdc"),
)
.await?;
)?;
chat_id.set_draft(&t, Some(&mut instance)).await?;
let info = instance.get_webxdc_info(&t).await?;
assert_eq!(info.name, "with-manifest-no-name.xdc");
@@ -2292,8 +2274,7 @@ sth_for_the = "future""#
&t,
"with-minimal-manifest.xdc",
include_bytes!("../test-data/webxdc/with-minimal-manifest.xdc"),
)
.await?;
)?;
chat_id.set_draft(&t, Some(&mut instance)).await?;
let info = instance.get_webxdc_info(&t).await?;
assert_eq!(info.name, "nice app!");
@@ -2303,8 +2284,7 @@ sth_for_the = "future""#
&t,
"with-manifest-and-png-icon.xdc",
include_bytes!("../test-data/webxdc/with-manifest-and-png-icon.xdc"),
)
.await?;
)?;
chat_id.set_draft(&t, Some(&mut instance)).await?;
let info = instance.get_webxdc_info(&t).await?;
assert_eq!(info.name, "with some icon");
@@ -2314,8 +2294,7 @@ sth_for_the = "future""#
&t,
"with-png-icon.xdc",
include_bytes!("../test-data/webxdc/with-png-icon.xdc"),
)
.await?;
)?;
chat_id.set_draft(&t, Some(&mut instance)).await?;
let info = instance.get_webxdc_info(&t).await?;
assert_eq!(info.name, "with-png-icon.xdc");
@@ -2325,8 +2304,7 @@ sth_for_the = "future""#
&t,
"with-jpg-icon.xdc",
include_bytes!("../test-data/webxdc/with-jpg-icon.xdc"),
)
.await?;
)?;
chat_id.set_draft(&t, Some(&mut instance)).await?;
let info = instance.get_webxdc_info(&t).await?;
assert_eq!(info.name, "with-jpg-icon.xdc");
@@ -2667,8 +2645,7 @@ sth_for_the = "future""#
} else {
include_bytes!("../test-data/webxdc/minimal.xdc")
},
)
.await?;
)?;
let instance_id = send_msg(&t, chat_id, &mut instance).await?;
t.send_webxdc_status_update(
instance_id,
@@ -2693,8 +2670,7 @@ sth_for_the = "future""#
&t,
"with-minimal-manifest.xdc",
include_bytes!("../test-data/webxdc/with-minimal-manifest.xdc"),
)
.await?;
)?;
send_msg(&t, chat_id, &mut instance).await?;
let chatlist = Chatlist::try_load(&t, 0, None, None).await?;
@@ -2717,8 +2693,7 @@ sth_for_the = "future""#
&alice,
"minimal.xdc",
include_bytes!("../test-data/webxdc/minimal.xdc"),
)
.await?;
)?;
alice_instance.set_text("user added text".to_string());
send_msg(&alice, alice_chat.id, &mut alice_instance).await?;
let alice_instance = alice.get_last_msg().await;
@@ -2821,8 +2796,7 @@ sth_for_the = "future""#
&alice,
"debug_logging.xdc",
include_bytes!("../test-data/webxdc/minimal.xdc"),
)
.await?;
)?;
assert!(alice.debug_logging.read().unwrap().is_none());
send_msg(&alice, chat_id, &mut instance).await?;
assert!(alice.debug_logging.read().unwrap().is_some());