File deduplication (#6332)

When receiving messages, blobs will be deduplicated with the new
function `create_and_deduplicate_from_bytes()`. For sending files, this
adds a new function `set_file_and_deduplicate()` instead of
deduplicating by default.

This is for
https://github.com/deltachat/deltachat-core-rust/issues/6265; read the
issue description there for more details.

TODO:
- [x] Set files as read-only
- [x] Don't do a write when the file is already identical
- [x] The first 32 chars or so of the 64-character hash are enough. I
calculated that if 10b people (i.e. all of humanity) use DC, and each of
them has 200k distinct blob files (I have 4k in my day-to-day account),
and we used 20 chars, then the expected value for the number of name
collisions would be ~0.0002 (and the probability that there is a least
one name collision is lower than that) [^1]. I added 12 more characters
to be on the super safe side, but this wouldn't be necessary and I could
also make it 20 instead of 32.
- Not 100% sure whether that's necessary at all - it would mainly be
necessary if we might hit a length limit on some file systems (the
blobdir is usually sth like
`accounts/2ff9fc096d2f46b6832b24a1ed99c0d6/dc.db-blobs` (53 chars), plus
64 chars for the filename would be 117).
- [x] "touch" the files to prevent them from being deleted
- [x] TODOs in the code

For later PRs:
- Replace `BlobObject::create(…)` with
`BlobObject::create_and_deduplicate(…)` in order to deduplicate
everytime core creates a file
- Modify JsonRPC to deduplicate blob files
- Possibly rename BlobObject.name to BlobObject.file in order to prevent
confusion (because `name` usually means "user-visible-name", not "name
of the file on disk").

[^1]: Calculated with both https://printfn.github.io/fend/ and
https://www.geogebra.org/calculator, both of which came to the same
result
([1](https://github.com/user-attachments/assets/bbb62550-3781-48b5-88b1-ba0e29c28c0d),

[2](https://github.com/user-attachments/assets/82171212-b797-4117-a39f-0e132eac7252))

---------

Co-authored-by: l <link2xt@testrun.org>
This commit is contained in:
Hocuri
2025-01-21 19:42:19 +01:00
committed by GitHub
parent 22a7cfe9c3
commit 65a9c4b79b
23 changed files with 583 additions and 240 deletions

View File

@@ -1,6 +1,7 @@
//! # Messages and their identifiers.
use std::collections::BTreeSet;
use std::ffi::OsStr;
use std::path::{Path, PathBuf};
use std::str;
@@ -623,8 +624,8 @@ impl Message {
pub fn get_filemime(&self) -> Option<String> {
if let Some(m) = self.param.get(Param::MimeType) {
return Some(m.to_string());
} else if let Some(file) = self.param.get(Param::File) {
if let Some((_, mime)) = guess_msgtype_from_suffix(Path::new(file)) {
} else if self.param.exists(Param::File) {
if let Some((_, mime)) = guess_msgtype_from_suffix(self) {
return Some(mime.to_string());
}
// we have a file but no mimetype, let's use a generic one
@@ -1085,18 +1086,58 @@ impl Message {
self.param.set_optional(Param::MimeType, filemime);
}
/// Creates a new blob and sets it as a file associated with a message.
pub async fn set_file_from_bytes(
/// Sets the file associated with a message, deduplicating files with the same name.
///
/// If `name` is Some, it is used as the file name
/// and the actual current name of the file is ignored.
///
/// If the source file is already in the blobdir, it will be renamed,
/// otherwise it will be copied to the blobdir first.
///
/// In order to deduplicate files that contain the same data,
/// the file will be named as a hash of the file data.
///
/// NOTE:
/// - This function will rename the file. To get the new file path, call `get_file()`.
/// - The file must not be modified after this function was called.
pub fn set_file_and_deduplicate(
&mut self,
context: &Context,
suggested_name: &str,
file: &Path,
name: Option<&str>,
filemime: Option<&str>,
) -> Result<()> {
let blob = BlobObject::create_and_deduplicate(context, file)?;
if let Some(name) = name {
self.param.set(Param::Filename, name);
} else {
let file_name = file.file_name().map(OsStr::to_string_lossy);
self.param.set_optional(Param::Filename, file_name);
}
self.param.set(Param::File, blob.as_name());
self.param.set_optional(Param::MimeType, filemime);
Ok(())
}
/// Creates a new blob and sets it as a file associated with a message.
///
/// In order to deduplicate files that contain the same data,
/// the filename will be a hash of the file data.
///
/// NOTE: The file must not be modified after this function was called.
pub fn set_file_from_bytes(
&mut self,
context: &Context,
name: &str,
data: &[u8],
filemime: Option<&str>,
) -> Result<()> {
let blob = BlobObject::create(context, suggested_name, data).await?;
self.param.set(Param::Filename, suggested_name);
let blob = BlobObject::create_and_deduplicate_from_bytes(context, data)?;
self.param.set(Param::Filename, name);
self.param.set(Param::File, blob.as_name());
self.param.set_optional(Param::MimeType, filemime);
Ok(())
}
@@ -1109,7 +1150,6 @@ impl Message {
);
let vcard = contact::make_vcard(context, contacts).await?;
self.set_file_from_bytes(context, "vcard.vcf", vcard.as_bytes(), None)
.await
}
/// Updates message state from the vCard attachment.
@@ -1467,7 +1507,14 @@ pub async fn get_msg_read_receipts(
.await
}
pub(crate) fn guess_msgtype_from_suffix(path: &Path) -> Option<(Viewtype, &str)> {
pub(crate) fn guess_msgtype_from_suffix(msg: &Message) -> Option<(Viewtype, &'static str)> {
msg.param
.get(Param::Filename)
.or_else(|| msg.param.get(Param::File))
.and_then(|file| guess_msgtype_from_path_suffix(Path::new(file)))
}
pub(crate) fn guess_msgtype_from_path_suffix(path: &Path) -> Option<(Viewtype, &'static str)> {
let extension: &str = &path.extension()?.to_str()?.to_lowercase();
let info = match extension {
// before using viewtype other than Viewtype::File,
@@ -2213,15 +2260,15 @@ mod tests {
#[test]
fn test_guess_msgtype_from_suffix() {
assert_eq!(
guess_msgtype_from_suffix(Path::new("foo/bar-sth.mp3")),
guess_msgtype_from_path_suffix(Path::new("foo/bar-sth.mp3")),
Some((Viewtype::Audio, "audio/mpeg"))
);
assert_eq!(
guess_msgtype_from_suffix(Path::new("foo/file.html")),
guess_msgtype_from_path_suffix(Path::new("foo/file.html")),
Some((Viewtype::File, "text/html"))
);
assert_eq!(
guess_msgtype_from_suffix(Path::new("foo/file.xdc")),
guess_msgtype_from_path_suffix(Path::new("foo/file.xdc")),
Some((Viewtype::Webxdc, "application/webxdc+zip"))
);
}
@@ -2627,8 +2674,7 @@ mod tests {
let file_bytes = include_bytes!("../test-data/image/screenshot.png");
let mut msg = Message::new(Viewtype::Image);
msg.set_file_from_bytes(bob, "a.jpg", file_bytes, None)
.await?;
msg.set_file_from_bytes(bob, "a.jpg", file_bytes, None)?;
let sent_msg = bob.send_msg(bob_chat_id, &mut msg).await;
let msg = alice.recv_msg(&sent_msg).await;
assert_eq!(msg.download_state, DownloadState::Available);
@@ -2697,8 +2743,7 @@ mod tests {
let file_bytes = include_bytes!("../test-data/image/screenshot.png");
let mut msg = Message::new(Viewtype::Image);
msg.set_file_from_bytes(bob, "a.jpg", file_bytes, None)
.await?;
msg.set_file_from_bytes(bob, "a.jpg", file_bytes, None)?;
let sent_msg = bob.send_msg(bob_chat_id, &mut msg).await;
let msg = alice.recv_msg(&sent_msg).await;
assert_eq!(msg.download_state, DownloadState::Available);