feat: Case-insensitive search for non-ASCII messages (#5052)

SQLite search with `LIKE` is case-insensitive only for ASCII chars. To make it case-insensitive for
all messages, create a new column `msgs.txt_normalized` defaulting to `NULL` (so we do not bump up
the database size in a migration) and storing lowercased/normalized text there when the row is
created/updated. When doing a search, search over `IFNULL(txt_normalized, txt)`.
This commit is contained in:
iequidoo
2024-03-04 20:44:08 -03:00
committed by iequidoo
parent a5d14b377d
commit f6f4ccc6ea
6 changed files with 51 additions and 18 deletions

View File

@@ -927,12 +927,13 @@ impl ChatId {
.sql
.execute(
"UPDATE msgs
SET timestamp=?,type=?,txt=?, param=?,mime_in_reply_to=?
SET timestamp=?,type=?,txt=?,txt_normalized=?,param=?,mime_in_reply_to=?
WHERE id=?;",
(
time(),
msg.viewtype,
&msg.text,
message::normalize_text(&msg.text),
msg.param.to_string(),
msg.in_reply_to.as_deref().unwrap_or_default(),
msg.id,
@@ -956,10 +957,11 @@ impl ChatId {
type,
state,
txt,
txt_normalized,
param,
hidden,
mime_in_reply_to)
VALUES (?,?,?, ?,?,?,?,?,?);",
VALUES (?,?,?,?,?,?,?,?,?,?);",
(
self,
ContactId::SELF,
@@ -967,6 +969,7 @@ impl ChatId {
msg.viewtype,
MessageState::OutDraft,
&msg.text,
message::normalize_text(&msg.text),
msg.param.to_string(),
1,
msg.in_reply_to.as_deref().unwrap_or_default(),
@@ -2075,7 +2078,7 @@ impl Chat {
.execute(
"UPDATE msgs
SET rfc724_mid=?, chat_id=?, from_id=?, to_id=?, timestamp=?, type=?,
state=?, txt=?, subject=?, param=?,
state=?, txt=?, txt_normalized=?, subject=?, param=?,
hidden=?, mime_in_reply_to=?, mime_references=?, mime_modified=?,
mime_headers=?, mime_compressed=1, location_id=?, ephemeral_timer=?,
ephemeral_timestamp=?
@@ -2089,6 +2092,7 @@ impl Chat {
msg.viewtype,
msg.state,
msg.text,
message::normalize_text(&msg.text),
&msg.subject,
msg.param.to_string(),
msg.hidden,
@@ -2117,6 +2121,7 @@ impl Chat {
type,
state,
txt,
txt_normalized,
subject,
param,
hidden,
@@ -2128,7 +2133,7 @@ impl Chat {
location_id,
ephemeral_timer,
ephemeral_timestamp)
VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,1,?,?,?);",
VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,1,?,?,?);",
params_slice![
msg.rfc724_mid,
msg.chat_id,
@@ -2138,6 +2143,7 @@ impl Chat {
msg.viewtype,
msg.state,
msg.text,
message::normalize_text(&msg.text),
&msg.subject,
msg.param.to_string(),
msg.hidden,
@@ -4370,9 +4376,10 @@ pub async fn add_device_msg_with_importance(
timestamp_rcvd,
type,state,
txt,
txt_normalized,
param,
rfc724_mid)
VALUES (?,?,?,?,?,?,?,?,?,?,?);",
VALUES (?,?,?,?,?,?,?,?,?,?,?,?);",
(
chat_id,
ContactId::DEVICE,
@@ -4383,6 +4390,7 @@ pub async fn add_device_msg_with_importance(
msg.viewtype,
state,
&msg.text,
message::normalize_text(&msg.text),
msg.param.to_string(),
rfc724_mid,
),
@@ -4486,8 +4494,8 @@ pub(crate) async fn add_info_msg_with_cmd(
let row_id =
context.sql.insert(
"INSERT INTO msgs (chat_id,from_id,to_id,timestamp,timestamp_sent,timestamp_rcvd,type,state,txt,rfc724_mid,ephemeral_timer, param,mime_in_reply_to)
VALUES (?,?,?, ?,?,?,?,?, ?,?,?, ?,?);",
"INSERT INTO msgs (chat_id,from_id,to_id,timestamp,timestamp_sent,timestamp_rcvd,type,state,txt,txt_normalized,rfc724_mid,ephemeral_timer,param,mime_in_reply_to)
VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?);",
(
chat_id,
from_id.unwrap_or(ContactId::INFO),
@@ -4498,6 +4506,7 @@ pub(crate) async fn add_info_msg_with_cmd(
Viewtype::Text,
MessageState::InNoticed,
text,
message::normalize_text(text),
rfc724_mid,
ephemeral_timer,
param.to_string(),
@@ -4542,8 +4551,8 @@ pub(crate) async fn update_msg_text_and_timestamp(
context
.sql
.execute(
"UPDATE msgs SET txt=?, timestamp=? WHERE id=?;",
(text, timestamp, msg_id),
"UPDATE msgs SET txt=?, txt_normalized=?, timestamp=? WHERE id=?;",
(text, message::normalize_text(text), timestamp, msg_id),
)
.await?;
context.emit_msgs_changed(chat_id, msg_id);

View File

@@ -1259,12 +1259,12 @@ impl Context {
Ok(list)
}
/// Searches for messages containing the query string.
/// Searches for messages containing the query string case-insensitively.
///
/// If `chat_id` is provided this searches only for messages in this chat, if `chat_id`
/// is `None` this searches messages from all chats.
pub async fn search_msgs(&self, chat_id: Option<ChatId>, query: &str) -> Result<Vec<MsgId>> {
let real_query = query.trim();
let real_query = query.trim().to_lowercase();
if real_query.is_empty() {
return Ok(Vec::new());
}
@@ -1280,7 +1280,7 @@ impl Context {
WHERE m.chat_id=?
AND m.hidden=0
AND ct.blocked=0
AND txt LIKE ?
AND IFNULL(txt_normalized, txt) LIKE ?
ORDER BY m.timestamp,m.id;",
(chat_id, str_like_in_text),
|row| row.get::<_, MsgId>("id"),
@@ -1316,7 +1316,7 @@ impl Context {
AND m.hidden=0
AND c.blocked!=1
AND ct.blocked=0
AND m.txt LIKE ?
AND IFNULL(txt_normalized, txt) LIKE ?
ORDER BY m.id DESC LIMIT 1000",
(str_like_in_text,),
|row| row.get::<_, MsgId>("id"),
@@ -1721,6 +1721,8 @@ mod tests {
msg2.set_text("barbaz".to_string());
send_msg(&alice, chat.id, &mut msg2).await?;
alice.send_text(chat.id, "Δ-Chat").await;
// Global search with a part of text finds the message.
let res = alice.search_msgs(None, "ob").await?;
assert_eq!(res.len(), 1);
@@ -1733,6 +1735,12 @@ mod tests {
assert_eq!(res.first(), Some(&msg2.id));
assert_eq!(res.get(1), Some(&msg1.id));
// Search is case-insensitive.
for chat_id in [None, Some(chat.id)] {
let res = alice.search_msgs(chat_id, "δ-chat").await?;
assert_eq!(res.len(), 1);
}
// Global search with longer text does not find any message.
let res = alice.search_msgs(None, "foobarbaz").await?;
assert!(res.is_empty());

View File

@@ -447,7 +447,7 @@ pub(crate) async fn delete_expired_messages(context: &Context, now: i64) -> Resu
for (msg_id, chat_id, viewtype, location_id) in rows {
transaction.execute(
"UPDATE msgs
SET chat_id=?, txt='', subject='', txt_raw='',
SET chat_id=?, txt='', txt_normalized=NULL, subject='', txt_raw='',
mime_headers='', from_id=0, to_id=0, param=''
WHERE id=?",
(DC_CHAT_ID_TRASH, msg_id),

View File

@@ -113,7 +113,7 @@ impl MsgId {
r#"
UPDATE msgs
SET
chat_id=?, txt='',
chat_id=?, txt='', txt_normalized=NULL,
subject='', txt_raw='',
mime_headers='',
from_id=0, to_id=0,
@@ -2072,6 +2072,15 @@ impl Viewtype {
}
}
/// Returns text for storing in the `msgs.txt_normalized` column (to make case-insensitive search
/// possible for non-ASCII messages).
pub(crate) fn normalize_text(text: &str) -> Option<String> {
if text.is_ascii() {
return None;
};
Some(text.to_lowercase()).filter(|t| t != text)
}
#[cfg(test)]
mod tests {
use num_traits::FromPrimitive;

View File

@@ -1540,7 +1540,7 @@ INSERT INTO msgs
rfc724_mid, chat_id,
from_id, to_id, timestamp, timestamp_sent,
timestamp_rcvd, type, state, msgrmsg,
txt, subject, txt_raw, param, hidden,
txt, txt_normalized, subject, txt_raw, param, hidden,
bytes, mime_headers, mime_compressed, mime_in_reply_to,
mime_references, mime_modified, error, ephemeral_timer,
ephemeral_timestamp, download_state, hop_info
@@ -1550,7 +1550,7 @@ INSERT INTO msgs
?, ?, ?, ?,
?, ?, ?, ?,
?, ?, ?, ?, ?,
?, ?, ?, ?, 1,
?, ?, ?, ?, ?, 1,
?, ?, ?, ?,
?, ?, ?, ?
)
@@ -1558,7 +1558,8 @@ ON CONFLICT (id) DO UPDATE
SET rfc724_mid=excluded.rfc724_mid, chat_id=excluded.chat_id,
from_id=excluded.from_id, to_id=excluded.to_id, timestamp_sent=excluded.timestamp_sent,
type=excluded.type, msgrmsg=excluded.msgrmsg,
txt=excluded.txt, subject=excluded.subject, txt_raw=excluded.txt_raw, param=excluded.param,
txt=excluded.txt, txt_normalized=excluded.txt_normalized, subject=excluded.subject,
txt_raw=excluded.txt_raw, param=excluded.param,
hidden=excluded.hidden,bytes=excluded.bytes, mime_headers=excluded.mime_headers,
mime_compressed=excluded.mime_compressed, mime_in_reply_to=excluded.mime_in_reply_to,
mime_references=excluded.mime_references, mime_modified=excluded.mime_modified, error=excluded.error, ephemeral_timer=excluded.ephemeral_timer,
@@ -1578,6 +1579,7 @@ RETURNING id
state,
is_dc_message,
if trash { "" } else { msg },
if trash { None } else { message::normalize_text(msg) },
if trash { "" } else { &subject },
// txt_raw might contain invalid utf8
if trash { "" } else { &txt_raw },

View File

@@ -937,6 +937,11 @@ CREATE INDEX msgs_status_updates_index2 ON msgs_status_updates (uid);
.await?;
}
if dbversion < 115 {
sql.execute_migration("ALTER TABLE msgs ADD COLUMN txt_normalized TEXT", 115)
.await?;
}
let new_version = sql
.get_raw_config_int(VERSION_CFG)
.await?