feat: Case-insensitive search for non-ASCII messages (#5052)

SQLite search with `LIKE` is case-insensitive only for ASCII chars. To make it case-insensitive for
all messages, create a new column `msgs.txt_normalized` defaulting to `NULL` (so we do not bump up
the database size in a migration) and store lowercased/normalized text there when the row is
created/updated. When doing a search, search over `IFNULL(txt_normalized, txt)`.
This commit is contained in:
iequidoo
2024-03-04 20:44:08 -03:00
committed by iequidoo
parent a5d14b377d
commit f6f4ccc6ea
6 changed files with 51 additions and 18 deletions

View File

@@ -927,12 +927,13 @@ impl ChatId {
.sql .sql
.execute( .execute(
"UPDATE msgs "UPDATE msgs
SET timestamp=?,type=?,txt=?, param=?,mime_in_reply_to=? SET timestamp=?,type=?,txt=?,txt_normalized=?,param=?,mime_in_reply_to=?
WHERE id=?;", WHERE id=?;",
( (
time(), time(),
msg.viewtype, msg.viewtype,
&msg.text, &msg.text,
message::normalize_text(&msg.text),
msg.param.to_string(), msg.param.to_string(),
msg.in_reply_to.as_deref().unwrap_or_default(), msg.in_reply_to.as_deref().unwrap_or_default(),
msg.id, msg.id,
@@ -956,10 +957,11 @@ impl ChatId {
type, type,
state, state,
txt, txt,
txt_normalized,
param, param,
hidden, hidden,
mime_in_reply_to) mime_in_reply_to)
VALUES (?,?,?, ?,?,?,?,?,?);", VALUES (?,?,?,?,?,?,?,?,?,?);",
( (
self, self,
ContactId::SELF, ContactId::SELF,
@@ -967,6 +969,7 @@ impl ChatId {
msg.viewtype, msg.viewtype,
MessageState::OutDraft, MessageState::OutDraft,
&msg.text, &msg.text,
message::normalize_text(&msg.text),
msg.param.to_string(), msg.param.to_string(),
1, 1,
msg.in_reply_to.as_deref().unwrap_or_default(), msg.in_reply_to.as_deref().unwrap_or_default(),
@@ -2075,7 +2078,7 @@ impl Chat {
.execute( .execute(
"UPDATE msgs "UPDATE msgs
SET rfc724_mid=?, chat_id=?, from_id=?, to_id=?, timestamp=?, type=?, SET rfc724_mid=?, chat_id=?, from_id=?, to_id=?, timestamp=?, type=?,
state=?, txt=?, subject=?, param=?, state=?, txt=?, txt_normalized=?, subject=?, param=?,
hidden=?, mime_in_reply_to=?, mime_references=?, mime_modified=?, hidden=?, mime_in_reply_to=?, mime_references=?, mime_modified=?,
mime_headers=?, mime_compressed=1, location_id=?, ephemeral_timer=?, mime_headers=?, mime_compressed=1, location_id=?, ephemeral_timer=?,
ephemeral_timestamp=? ephemeral_timestamp=?
@@ -2089,6 +2092,7 @@ impl Chat {
msg.viewtype, msg.viewtype,
msg.state, msg.state,
msg.text, msg.text,
message::normalize_text(&msg.text),
&msg.subject, &msg.subject,
msg.param.to_string(), msg.param.to_string(),
msg.hidden, msg.hidden,
@@ -2117,6 +2121,7 @@ impl Chat {
type, type,
state, state,
txt, txt,
txt_normalized,
subject, subject,
param, param,
hidden, hidden,
@@ -2128,7 +2133,7 @@ impl Chat {
location_id, location_id,
ephemeral_timer, ephemeral_timer,
ephemeral_timestamp) ephemeral_timestamp)
VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,1,?,?,?);", VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,1,?,?,?);",
params_slice![ params_slice![
msg.rfc724_mid, msg.rfc724_mid,
msg.chat_id, msg.chat_id,
@@ -2138,6 +2143,7 @@ impl Chat {
msg.viewtype, msg.viewtype,
msg.state, msg.state,
msg.text, msg.text,
message::normalize_text(&msg.text),
&msg.subject, &msg.subject,
msg.param.to_string(), msg.param.to_string(),
msg.hidden, msg.hidden,
@@ -4370,9 +4376,10 @@ pub async fn add_device_msg_with_importance(
timestamp_rcvd, timestamp_rcvd,
type,state, type,state,
txt, txt,
txt_normalized,
param, param,
rfc724_mid) rfc724_mid)
VALUES (?,?,?,?,?,?,?,?,?,?,?);", VALUES (?,?,?,?,?,?,?,?,?,?,?,?);",
( (
chat_id, chat_id,
ContactId::DEVICE, ContactId::DEVICE,
@@ -4383,6 +4390,7 @@ pub async fn add_device_msg_with_importance(
msg.viewtype, msg.viewtype,
state, state,
&msg.text, &msg.text,
message::normalize_text(&msg.text),
msg.param.to_string(), msg.param.to_string(),
rfc724_mid, rfc724_mid,
), ),
@@ -4486,8 +4494,8 @@ pub(crate) async fn add_info_msg_with_cmd(
let row_id = let row_id =
context.sql.insert( context.sql.insert(
"INSERT INTO msgs (chat_id,from_id,to_id,timestamp,timestamp_sent,timestamp_rcvd,type,state,txt,rfc724_mid,ephemeral_timer, param,mime_in_reply_to) "INSERT INTO msgs (chat_id,from_id,to_id,timestamp,timestamp_sent,timestamp_rcvd,type,state,txt,txt_normalized,rfc724_mid,ephemeral_timer,param,mime_in_reply_to)
VALUES (?,?,?, ?,?,?,?,?, ?,?,?, ?,?);", VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?);",
( (
chat_id, chat_id,
from_id.unwrap_or(ContactId::INFO), from_id.unwrap_or(ContactId::INFO),
@@ -4498,6 +4506,7 @@ pub(crate) async fn add_info_msg_with_cmd(
Viewtype::Text, Viewtype::Text,
MessageState::InNoticed, MessageState::InNoticed,
text, text,
message::normalize_text(text),
rfc724_mid, rfc724_mid,
ephemeral_timer, ephemeral_timer,
param.to_string(), param.to_string(),
@@ -4542,8 +4551,8 @@ pub(crate) async fn update_msg_text_and_timestamp(
context context
.sql .sql
.execute( .execute(
"UPDATE msgs SET txt=?, timestamp=? WHERE id=?;", "UPDATE msgs SET txt=?, txt_normalized=?, timestamp=? WHERE id=?;",
(text, timestamp, msg_id), (text, message::normalize_text(text), timestamp, msg_id),
) )
.await?; .await?;
context.emit_msgs_changed(chat_id, msg_id); context.emit_msgs_changed(chat_id, msg_id);

View File

@@ -1259,12 +1259,12 @@ impl Context {
Ok(list) Ok(list)
} }
/// Searches for messages containing the query string. /// Searches for messages containing the query string case-insensitively.
/// ///
/// If `chat_id` is provided this searches only for messages in this chat, if `chat_id` /// If `chat_id` is provided this searches only for messages in this chat, if `chat_id`
/// is `None` this searches messages from all chats. /// is `None` this searches messages from all chats.
pub async fn search_msgs(&self, chat_id: Option<ChatId>, query: &str) -> Result<Vec<MsgId>> { pub async fn search_msgs(&self, chat_id: Option<ChatId>, query: &str) -> Result<Vec<MsgId>> {
let real_query = query.trim(); let real_query = query.trim().to_lowercase();
if real_query.is_empty() { if real_query.is_empty() {
return Ok(Vec::new()); return Ok(Vec::new());
} }
@@ -1280,7 +1280,7 @@ impl Context {
WHERE m.chat_id=? WHERE m.chat_id=?
AND m.hidden=0 AND m.hidden=0
AND ct.blocked=0 AND ct.blocked=0
AND txt LIKE ? AND IFNULL(txt_normalized, txt) LIKE ?
ORDER BY m.timestamp,m.id;", ORDER BY m.timestamp,m.id;",
(chat_id, str_like_in_text), (chat_id, str_like_in_text),
|row| row.get::<_, MsgId>("id"), |row| row.get::<_, MsgId>("id"),
@@ -1316,7 +1316,7 @@ impl Context {
AND m.hidden=0 AND m.hidden=0
AND c.blocked!=1 AND c.blocked!=1
AND ct.blocked=0 AND ct.blocked=0
AND m.txt LIKE ? AND IFNULL(txt_normalized, txt) LIKE ?
ORDER BY m.id DESC LIMIT 1000", ORDER BY m.id DESC LIMIT 1000",
(str_like_in_text,), (str_like_in_text,),
|row| row.get::<_, MsgId>("id"), |row| row.get::<_, MsgId>("id"),
@@ -1721,6 +1721,8 @@ mod tests {
msg2.set_text("barbaz".to_string()); msg2.set_text("barbaz".to_string());
send_msg(&alice, chat.id, &mut msg2).await?; send_msg(&alice, chat.id, &mut msg2).await?;
alice.send_text(chat.id, "Δ-Chat").await;
// Global search with a part of text finds the message. // Global search with a part of text finds the message.
let res = alice.search_msgs(None, "ob").await?; let res = alice.search_msgs(None, "ob").await?;
assert_eq!(res.len(), 1); assert_eq!(res.len(), 1);
@@ -1733,6 +1735,12 @@ mod tests {
assert_eq!(res.first(), Some(&msg2.id)); assert_eq!(res.first(), Some(&msg2.id));
assert_eq!(res.get(1), Some(&msg1.id)); assert_eq!(res.get(1), Some(&msg1.id));
// Search is case-insensitive.
for chat_id in [None, Some(chat.id)] {
let res = alice.search_msgs(chat_id, "δ-chat").await?;
assert_eq!(res.len(), 1);
}
// Global search with longer text does not find any message. // Global search with longer text does not find any message.
let res = alice.search_msgs(None, "foobarbaz").await?; let res = alice.search_msgs(None, "foobarbaz").await?;
assert!(res.is_empty()); assert!(res.is_empty());

View File

@@ -447,7 +447,7 @@ pub(crate) async fn delete_expired_messages(context: &Context, now: i64) -> Resu
for (msg_id, chat_id, viewtype, location_id) in rows { for (msg_id, chat_id, viewtype, location_id) in rows {
transaction.execute( transaction.execute(
"UPDATE msgs "UPDATE msgs
SET chat_id=?, txt='', subject='', txt_raw='', SET chat_id=?, txt='', txt_normalized=NULL, subject='', txt_raw='',
mime_headers='', from_id=0, to_id=0, param='' mime_headers='', from_id=0, to_id=0, param=''
WHERE id=?", WHERE id=?",
(DC_CHAT_ID_TRASH, msg_id), (DC_CHAT_ID_TRASH, msg_id),

View File

@@ -113,7 +113,7 @@ impl MsgId {
r#" r#"
UPDATE msgs UPDATE msgs
SET SET
chat_id=?, txt='', chat_id=?, txt='', txt_normalized=NULL,
subject='', txt_raw='', subject='', txt_raw='',
mime_headers='', mime_headers='',
from_id=0, to_id=0, from_id=0, to_id=0,
@@ -2072,6 +2072,15 @@ impl Viewtype {
} }
} }
/// Computes the value to store in the `msgs.txt_normalized` column, which exists to make
/// case-insensitive search possible for non-ASCII messages.
///
/// Returns `None` when `text` consists only of ASCII characters or when lowercasing leaves it
/// unchanged; otherwise returns the lowercased copy of `text`.
pub(crate) fn normalize_text(text: &str) -> Option<String> {
    // Pure-ASCII text gets no normalized copy.
    if text.is_ascii() {
        return None;
    }
    let lowered = text.to_lowercase();
    // A copy identical to the original would carry no extra information, so skip it.
    if lowered == text {
        None
    } else {
        Some(lowered)
    }
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use num_traits::FromPrimitive; use num_traits::FromPrimitive;

View File

@@ -1540,7 +1540,7 @@ INSERT INTO msgs
rfc724_mid, chat_id, rfc724_mid, chat_id,
from_id, to_id, timestamp, timestamp_sent, from_id, to_id, timestamp, timestamp_sent,
timestamp_rcvd, type, state, msgrmsg, timestamp_rcvd, type, state, msgrmsg,
txt, subject, txt_raw, param, hidden, txt, txt_normalized, subject, txt_raw, param, hidden,
bytes, mime_headers, mime_compressed, mime_in_reply_to, bytes, mime_headers, mime_compressed, mime_in_reply_to,
mime_references, mime_modified, error, ephemeral_timer, mime_references, mime_modified, error, ephemeral_timer,
ephemeral_timestamp, download_state, hop_info ephemeral_timestamp, download_state, hop_info
@@ -1550,7 +1550,7 @@ INSERT INTO msgs
?, ?, ?, ?, ?, ?, ?, ?,
?, ?, ?, ?, ?, ?, ?, ?,
?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
?, ?, ?, ?, 1, ?, ?, ?, ?, ?, 1,
?, ?, ?, ?, ?, ?, ?, ?,
?, ?, ?, ? ?, ?, ?, ?
) )
@@ -1558,7 +1558,8 @@ ON CONFLICT (id) DO UPDATE
SET rfc724_mid=excluded.rfc724_mid, chat_id=excluded.chat_id, SET rfc724_mid=excluded.rfc724_mid, chat_id=excluded.chat_id,
from_id=excluded.from_id, to_id=excluded.to_id, timestamp_sent=excluded.timestamp_sent, from_id=excluded.from_id, to_id=excluded.to_id, timestamp_sent=excluded.timestamp_sent,
type=excluded.type, msgrmsg=excluded.msgrmsg, type=excluded.type, msgrmsg=excluded.msgrmsg,
txt=excluded.txt, subject=excluded.subject, txt_raw=excluded.txt_raw, param=excluded.param, txt=excluded.txt, txt_normalized=excluded.txt_normalized, subject=excluded.subject,
txt_raw=excluded.txt_raw, param=excluded.param,
hidden=excluded.hidden,bytes=excluded.bytes, mime_headers=excluded.mime_headers, hidden=excluded.hidden,bytes=excluded.bytes, mime_headers=excluded.mime_headers,
mime_compressed=excluded.mime_compressed, mime_in_reply_to=excluded.mime_in_reply_to, mime_compressed=excluded.mime_compressed, mime_in_reply_to=excluded.mime_in_reply_to,
mime_references=excluded.mime_references, mime_modified=excluded.mime_modified, error=excluded.error, ephemeral_timer=excluded.ephemeral_timer, mime_references=excluded.mime_references, mime_modified=excluded.mime_modified, error=excluded.error, ephemeral_timer=excluded.ephemeral_timer,
@@ -1578,6 +1579,7 @@ RETURNING id
state, state,
is_dc_message, is_dc_message,
if trash { "" } else { msg }, if trash { "" } else { msg },
if trash { None } else { message::normalize_text(msg) },
if trash { "" } else { &subject }, if trash { "" } else { &subject },
// txt_raw might contain invalid utf8 // txt_raw might contain invalid utf8
if trash { "" } else { &txt_raw }, if trash { "" } else { &txt_raw },

View File

@@ -937,6 +937,11 @@ CREATE INDEX msgs_status_updates_index2 ON msgs_status_updates (uid);
.await?; .await?;
} }
if dbversion < 115 {
sql.execute_migration("ALTER TABLE msgs ADD COLUMN txt_normalized TEXT", 115)
.await?;
}
let new_version = sql let new_version = sql
.get_raw_config_int(VERSION_CFG) .get_raw_config_int(VERSION_CFG)
.await? .await?