From d75c0991cc7bb912c87a3cb3fdaf846e5e29fe5e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mart=C3=ADn=20Lucas=20Golini?= <spartanj@gmail.com>
Date: Sun, 12 Oct 2025 14:04:21 -0300
Subject: [PATCH] Some improvements to `TextDocument::fileMightBeBinary` and
 added some unit tests for it.

---
 .ecode/project_build.json                     |   2 +-
 .../assets/textfiles/test-arabic.txt          |   1 +
 .../assets/textfiles/test-bengali.txt         |  24 +++
 .../assets/textfiles/test-emoji.txt           |  15 ++
 .../assets/textfiles/test-flags.txt           |   4 +
 .../textfiles/test-j-shift_jis.copy.txt       |   6 +
 .../assets/textfiles/test-j-shift_jis.txt     |   8 +
 bin/unit_tests/assets/textfiles/test-j.txt    |   6 +
 bin/unit_tests/assets/textfiles/test-k.txt    |   2 +
 bin/unit_tests/assets/textfiles/test-sc.txt   |   2 +
 bin/unit_tests/assets/textfiles/test-tc.txt   |   2 +
 bin/unit_tests/assets/textfiles/test.xit      | 203 ++++++++++++++++++
 src/eepp/ui/doc/syntaxdefinitionmanager.cpp   |   5 +-
 src/eepp/ui/doc/textdocument.cpp              | 201 ++++++++++++++++-
 src/tests/unit_tests/textdocument.cpp         |  11 +
 src/tests/unit_tests/textformat.cpp           |  16 +-
 16 files changed, 494 insertions(+), 14 deletions(-)
 create mode 100644 bin/unit_tests/assets/textfiles/test-arabic.txt
 create mode 100644 bin/unit_tests/assets/textfiles/test-bengali.txt
 create mode 100644 bin/unit_tests/assets/textfiles/test-emoji.txt
 create mode 100644 bin/unit_tests/assets/textfiles/test-flags.txt
 create mode 100644 bin/unit_tests/assets/textfiles/test-j-shift_jis.copy.txt
 create mode 100644 bin/unit_tests/assets/textfiles/test-j-shift_jis.txt
 create mode 100644 bin/unit_tests/assets/textfiles/test-j.txt
 create mode 100644 bin/unit_tests/assets/textfiles/test-k.txt
 create mode 100644 bin/unit_tests/assets/textfiles/test-sc.txt
 create mode 100644 bin/unit_tests/assets/textfiles/test-tc.txt
 create mode 100644 bin/unit_tests/assets/textfiles/test.xit

diff --git a/.ecode/project_build.json b/.ecode/project_build.json
index b276eed4f..fb6c399d1 100644
--- a/.ecode/project_build.json
+++ b/.ecode/project_build.json
@@ -217,7 +217,7 @@
       },
       {
         "args": "",
-        "command": "${project_root}/bin/unit_tests/eepp-unit-tests-debug",
+        "command": "${project_root}/bin/unit_tests/eepp-unit_tests-debug",
         "name": "eepp-unit_tests-debug",
         "run_in_terminal": true,
         "working_dir": "${project_root}/bin/unit_tests/"
diff --git a/bin/unit_tests/assets/textfiles/test-arabic.txt b/bin/unit_tests/assets/textfiles/test-arabic.txt
new file mode 100644
index 000000000..646cae5e6
--- /dev/null
+++ b/bin/unit_tests/assets/textfiles/test-arabic.txt
@@ -0,0 +1 @@
+Ø§Ø³Ú©Ù… Ø´Ø§Ø® Ùˆ Ø¯Ù… Ù†Ø¯Ø§Ø±Ù‡ Ù‡Ù…ÛŒÙ† Ú©Ù‡ Ú©Ø§Ø±Ø¨Ø± Ø´Ø¨Ú©Ù‡ Ø®ÙˆØ¯Øª. Ú©Ø³ÛŒ Ú©Ù‡ Ø±Ùˆ Ø´Ø¨Ú©Ù‡ ØªÙˆ ÙÛŒ Ø¯Ø§Ø¯Ù‡ Ø³Ø§Ù„Ù‡Ø§ Ø²Ø­Ù…Øª Ú©Ø´ÛŒØ¯Ù‡ Ø±Ùˆ Ù†Ø§Ø¯ÛŒØ¯Ù‡ Ù…ÛŒÚ¯ÛŒØ±ÛŒ Ùˆ Ø¨Ù‡ Ú©Ø§Ø±Ø¨Ø± ÛŒÙ‡ Ø´Ø¨Ú©Ù‡ Ø¯ÛŒÚ¯Ù‡ ØªÙˆÚ©Ù† Ø³Ù†Ú¯ÛŒÙ† Ù…ÛŒØ¯ÛŒ Ù…ÛŒØ´Ù‡ Ø§Ø³Ú©Ù… Ø¹Ù„Ù†ÛŒ. Ø¨Ø§ÛŒØ¯ Ú©Ø§Ø±ÛŒ Ø¨Ø§Ù‡Ø§Ø´ Ú©Ù†ÛŒÙ… Ú©Ù‡ Ù…Ø«Ù„ Ø§Ø³ØªØ§Ø±Ú© Ø¨Ù‡ ØºÙ„Ø· Ú©Ø±Ø¯Ù† Ø¨ÛŒØ§ÙØªÙ‡ Ø§Ø±Ù‡ ØªÙˆ endgame Ù‡Ø³ØªÛŒ Ø§Ø®Ø± Ø§Ø³Ú©Ù…Ø±Ø§ÛŒÛŒ 
diff --git a/bin/unit_tests/assets/textfiles/test-bengali.txt b/bin/unit_tests/assets/textfiles/test-bengali.txt
new file mode 100644
index 000000000..5f6394e9c
--- /dev/null
+++ b/bin/unit_tests/assets/textfiles/test-bengali.txt
@@ -0,0 +1,24 @@
+Hello: à¦¹à§à¦¯à¦¾à¦²à§‹ / à¦¨à¦®à¦¸à§à¦•à¦¾à¦°  
+Good morning: à¦¸à§à¦ªà§à¦°à¦­à¦¾à¦¤  
+Good night: à¦¶à§à¦­ à¦°à¦¾à¦¤à§à¦°à¦¿  
+Thank you: à¦§à¦¨à§à¦¯à¦¬à¦¾à¦¦  
+You're welcome: à¦†à¦ªà¦¨à¦¿ à¦¸à§à¦¬à¦¾à¦—à¦¤ à¦œà¦¾à¦¨à¦¾à¦‡  
+Yes / No: à¦¹à§à¦¯à¦¾à¦ / à¦¨à¦¾  
+Please: à¦…à¦¨à§à¦—à§à¦°à¦¹ à¦•à¦°à§‡  
+Excuse me / Sorry: à¦®à¦¾à¦« à¦•à¦°à¦¬à§‡à¦¨ / à¦¦à§à¦ƒà¦–à¦¿à¦¤  
+How are you?: à¦†à¦ªà¦¨à¦¿ à¦•à§‡à¦®à¦¨ à¦†à¦›à§‡à¦¨?  
+I'm fine. And you?: à¦†à¦®à¦¿ à¦­à¦¾à¦²à§‹ à¦†à¦›à¦¿à¥¤ à¦à¦¬à¦‚ à¦†à¦ªà¦¨à¦¿?  
+What's your name?: à¦†à¦ªà¦¨à¦¾à¦° à¦¨à¦¾à¦® à¦•à¦¿?  
+My name is...: à¦†à¦®à¦¾à¦° à¦¨à¦¾à¦®...  
+Nice to meet you: à¦†à¦ªà¦¨à¦¾à¦° à¦¸à¦¾à¦¥à§‡ à¦¦à§‡à¦–à¦¾ à¦•à¦°à§‡ à¦–à§à¦¶à¦¿  
+Where are you from?: à¦†à¦ªà¦¨à¦¿ à¦•à§‹à¦¥à¦¾ à¦¥à§‡à¦•à§‡ à¦à¦¸à§‡à¦›à§‡à¦¨?  
+I'm from...: à¦†à¦®à¦¿ ... à¦¥à§‡à¦•à§‡ à¦à¦¸à§‡à¦›à¦¿à¥¤  
+Do you speak English?: à¦†à¦ªà¦¨à¦¿ à¦•à¦¿ à¦‡à¦‚à¦°à§‡à¦œà¦¿ à¦¬à¦²à¦¤à§‡ à¦ªà¦¾à¦°à§‡à¦¨?  
+I don't understand: à¦†à¦®à¦¿ à¦¬à§à¦à¦¤à§‡ à¦ªà¦¾à¦°à¦›à¦¿ à¦¨à¦¾à¥¤  
+Please speak more slowly: à¦…à¦¨à§à¦—à§à¦°à¦¹ à¦•à¦°à§‡ à¦§à§€à¦°à§‡ à¦¬à¦²à§à¦¨à¥¤  
+Please write it down: à¦…à¦¨à§à¦—à§à¦°à¦¹ à¦•à¦°à§‡ à¦à¦Ÿà¦¿ à¦²à¦¿à¦–à§‡ à¦¦à¦¿à¦¨à¥¤  
+How much is this?: à¦à¦Ÿà¦¾à¦° à¦¦à¦¾à¦® à¦•à¦¤?  
+Where is the bathroom?: à¦¬à¦¾à¦¥à¦°à§à¦® à¦•à§‹à¦¥à¦¾à¦¯à¦¼?  
+Help!: à¦¬à¦¾à¦à¦šà¦¾à¦“!  
+Stop!: à¦¥à¦¾à¦®à§à¦¨!  
+Call the police!: à¦ªà§à¦²à¦¿à¦¶ à¦¡à¦¾à¦•à§à¦¨!  
diff --git a/bin/unit_tests/assets/textfiles/test-emoji.txt b/bin/unit_tests/assets/textfiles/test-emoji.txt
new file mode 100644
index 000000000..067e07e84
--- /dev/null
+++ b/bin/unit_tests/assets/textfiles/test-emoji.txt
@@ -0,0 +1,15 @@
+# subgroup: face-smiling
+1F600                                                  ; fully-qualified     # ðŸ˜€ E1.0 grinning face
+1F603                                                  ; fully-qualified     # ðŸ˜ƒ E0.6 grinning face with big eyes
+1F604                                                  ; fully-qualified     # ðŸ˜„ E0.6 grinning face with smiling eyes
+1F601                                                  ; fully-qualified     # ðŸ˜ E0.6 beaming face with smiling eyes
+1F606                                                  ; fully-qualified     # ðŸ˜† E0.6 grinning squinting face
+1F605                                                  ; fully-qualified     # ðŸ˜… E0.6 grinning face with sweat
+1F923                                                  ; fully-qualified     # ðŸ¤£ E3.0 rolling on the floor laughing
+1F602                                                  ; fully-qualified     # ðŸ˜‚ E0.6 face with tears of joy
+1F642                                                  ; fully-qualified     # ðŸ™‚ E1.0 slightly smiling face
+1F643                                                  ; fully-qualified     # ðŸ™ƒ E1.0 upside-down face
+1FAE0                                                  ; fully-qualified     # ðŸ«  E14.0 melting face
+1F609                                                  ; fully-qualified     # ðŸ˜‰ E0.6 winking face
+1F60A                                                  ; fully-qualified     # ðŸ˜Š E0.6 smiling face with smiling eyes
+1F607                                                  ; fully-qualified     # ðŸ˜‡ E1.0 smiling face with halo
diff --git a/bin/unit_tests/assets/textfiles/test-flags.txt b/bin/unit_tests/assets/textfiles/test-flags.txt
new file mode 100644
index 000000000..85cba7709
--- /dev/null
+++ b/bin/unit_tests/assets/textfiles/test-flags.txt
@@ -0,0 +1,4 @@
+ðŸ‡¦ðŸ‡·ðŸ‡¦ðŸ‡·ðŸ‡¦ðŸ‡·ðŸ‡¦ðŸ‡·ðŸ‡¦ðŸ‡·
+ðŸ‡¦ðŸ‡· ðŸ‡¦ðŸ‡· ðŸ‡¦ðŸ‡· ðŸ‡¦ðŸ‡·
+
+ðŸ‡¦ðŸ‡·ðŸ‡¦ðŸ‡·ðŸ‡¦ðŸ‡· Awante ðŸ‡¦ðŸ‡· Argentina ðŸ‡¦ðŸ‡·ðŸ‡¦ðŸ‡·ðŸ‡¦ðŸ‡·
diff --git a/bin/unit_tests/assets/textfiles/test-j-shift_jis.copy.txt b/bin/unit_tests/assets/textfiles/test-j-shift_jis.copy.txt
new file mode 100644
index 000000000..c5a9698dd
--- /dev/null
+++ b/bin/unit_tests/assets/textfiles/test-j-shift_jis.copy.txt
@@ -0,0 +1,6 @@
+ƒjƒƒ[ƒjƒƒ[, ‚É‚á‚ñ
+ƒƒ“ƒƒ“ / ‚í‚ñ‚í‚ñ
+ƒRƒ“ƒRƒ“ / ‚±‚ñ‚±‚ñ
+
+‚±‚ñ‚É‚¿‚ÍAŽ„‚Ì–¼‘O‚Íƒ}[ƒeƒBƒ“‚Å‚·B ‚ ‚È‚½‚Ì–¼‘O‚Í‰½‚Å‚·‚©H
+Ž„‚ÍƒAƒ‹ƒ[ƒ“ƒ`ƒ“‚ÉZ‚ñ‚Å‚¨‚èAƒRƒ“ƒsƒ…[ƒ^[ ƒvƒƒOƒ‰ƒ}[‚Å‚·B E‹Æ‚Í‚È‚ñ‚Å‚·‚©H
diff --git a/bin/unit_tests/assets/textfiles/test-j-shift_jis.txt b/bin/unit_tests/assets/textfiles/test-j-shift_jis.txt
new file mode 100644
index 000000000..daffbeb5f
--- /dev/null
+++ b/bin/unit_tests/assets/textfiles/test-j-shift_jis.txt
@@ -0,0 +1,8 @@
+ƒjƒƒ[ƒjƒƒ[, ‚É‚á‚ñ
+ƒƒ“ƒƒ“ / ‚í‚ñ‚í‚ñ
+ƒRƒ“ƒRƒ“ / ‚±‚ñ‚±‚ñ
+
+esto anda
+
+‚±‚ñ‚É‚¿‚ÍAŽ„‚Ì–¼‘O‚Íƒ}[ƒeƒBƒ“‚Å‚·B ‚ ‚È‚½‚Ì–¼‘O‚Í‰½‚Å‚·‚©H
+Ž„‚ÍƒAƒ‹ƒ[ƒ“ƒ`ƒ“‚ÉZ‚ñ‚Å‚¨‚èAƒRƒ“ƒsƒ…[ƒ^[ ƒvƒƒOƒ‰ƒ}[‚Å‚·B E‹Æ‚Í‚È‚ñ‚Å‚·‚©H
diff --git a/bin/unit_tests/assets/textfiles/test-j.txt b/bin/unit_tests/assets/textfiles/test-j.txt
new file mode 100644
index 000000000..fb4c13543
--- /dev/null
+++ b/bin/unit_tests/assets/textfiles/test-j.txt
@@ -0,0 +1,6 @@
+ãƒ‹ãƒ£ãƒ¼ãƒ‹ãƒ£ãƒ¼, ã«ã‚ƒã‚“
+ãƒ¯ãƒ³ãƒ¯ãƒ³ / ã‚ã‚“ã‚ã‚“
+ã‚³ãƒ³ã‚³ãƒ³ / ã“ã‚“ã“ã‚“
+
+ã“ã‚“ã«ã¡ã¯ã€ç§ã®åå‰ã¯ãƒžãƒ¼ãƒ†ã‚£ãƒ³ã§ã™ã€‚ ã‚ãªãŸã®åå‰ã¯ä½•ã§ã™ã‹ï¼Ÿ
+ç§ã¯ã‚¢ãƒ«ã‚¼ãƒ³ãƒãƒ³ã«ä½ã‚“ã§ãŠã‚Šã€ã‚³ãƒ³ãƒ”ãƒ¥ãƒ¼ã‚¿ãƒ¼ ãƒ—ãƒ­ã‚°ãƒ©ãƒžãƒ¼ã§ã™ã€‚ è·æ¥­ã¯ãªã‚“ã§ã™ã‹ï¼Ÿ
diff --git a/bin/unit_tests/assets/textfiles/test-k.txt b/bin/unit_tests/assets/textfiles/test-k.txt
new file mode 100644
index 000000000..ae833ab57
--- /dev/null
+++ b/bin/unit_tests/assets/textfiles/test-k.txt
@@ -0,0 +1,2 @@
+ì•ˆë…•í•˜ì„¸ìš” ì˜¤ëŠ˜ ê°•ì˜í•´ ë“œë¦´ ë‚´ìš©ì€ 12 ì•ˆë§ˆë‹¹ ë¹Œë“œìž…ë‹ˆë‹¤. 12 ì•ˆë§ˆë‹¹ì˜ ì¢…ë¥˜ì™€ ìž¥ë‹¨ì ë“¤ ê·¸ë¦¬ê³  ë¹Œë“œ ì˜¤ë”ë¥¼ ê°„ë‹¨í•˜ì§€ë§Œ ìžì„¸í•˜ê²Œ ì•Œë ¤ë“œë¦¬ë ¤ê³  í•©ë‹ˆë‹¤. í† ìŠ¤ì „ì—ì„œëŠ” ê°€ìž¥ ë¶€ìœ í•˜ê²Œ ì‹œìž‘í•˜ê³  ì‹¶ì„ ë•Œ ì‚¬ìš©í•˜ëŠ” ë¹Œë“œê³ ìš”. í…Œëž€ì „ì—ì„œëŠ” 12 ì•ˆë§ˆë‹¹ìœ¼ë¡œ í•  ìˆ˜ ìžˆëŠ” ë¹Œë“œê°€ ì—¬ëŸ¬ê°€ì§€ê°€ ìžˆìŠµë‹ˆë‹¤. ê·¸ëž˜ì„œ ê°€ìž¥ ë§Žì´ ì‚¬ìš©í•˜ëŠ” ë¹Œë“œë“¤ì„ ëª‡ê°€ì§€ ì•Œë ¤ë“œë¦¬ë ¤ê³  í•©ë‹ˆë‹¤. ì²« ë²ˆì§¸ë¡œ íˆ¬ì—ì²˜ë¦¬ ë¹Œë“œì¸ë° 12 ì•ˆë§ˆë‹¹ìœ¼ë¡œ ì‹œìž‘í•˜ëŠ” ë¹Œë“œìž…ë‹ˆë‹¤. 12 ì•ˆë§ˆë‹¹ 11 ìŠ¤í¬ë‹í’€ 10ê°€ìŠ¤ ì´ì œ ë¹ ë¥¸ ê°€ìŠ¤ë¥¼ í™œìš©í•œ ë¹Œë“œì¸ë°ìš”. ì´ ë¹Œë“œëŠ” íˆ¬ì—ì²˜ë¦¬ ë¹Œë“œë¥¼ í•˜ì‹¤ë•Œ 3ì—ì²˜ë¦¬ë¥¼ ë¹ ë¥´ê²Œ 3ê°€ìŠ¤ ë©€í‹°ì— ê°€ì ¸ê°€ë©´ì„œ í”Œë ˆì´ë¥¼ í•  ë•Œ ë§Žì´ ì‚¬ìš©ì„ í•˜ê³ ìš”. ë‘ë²ˆì§¸ë¡œ 12ì•• 12í’€ 12ê°€ìŠ¤ ì ë‹¹ížˆ ë¹ ë¥¸ í…Œí¬íŠ¸ë¦¬ì™€ ì ë‹¹ížˆ ë¹ ë¥¸ 3ì—ì²˜ë¦¬ ë¹Œë“œìž…ë‹ˆë‹¤. 12ì•• 12ê°€ìŠ¤ ì ë‹¹ížˆ ë¹ ë¥¸ í…Œí¬íŠ¸ë¦¬ì™€ ì ë‹¹ížˆ ë¹ ë¥¸ 3ì—ì²˜ë¦¬ ë¹Œë“œìž…ë‹ˆë‹¤. ì´ ë¹Œë“œ ê°™ì€ ê²½ìš°ëŠ” í”ížˆë“¤ ë§í•˜ëŠ” ì•ˆ 3ì—ì²˜ë¦¬ë¼ê³  ë§Žì´ë“¤ ì–˜ê¸°ë¥¼ í•˜ëŠ”ë° ë®¤íƒˆë¦¬ìŠ¤í¬ë„ ë¹ ë¥´ê³  3ì—ì²˜ë¦¬ë„ ë¹ ë¥¸ ê·¸ëŸ° ë¹Œë“œë¼ê³  ìƒê°í•˜ì‹œë©´ ë˜ìš”. 
+
diff --git a/bin/unit_tests/assets/textfiles/test-sc.txt b/bin/unit_tests/assets/textfiles/test-sc.txt
new file mode 100644
index 000000000..f794590f4
--- /dev/null
+++ b/bin/unit_tests/assets/textfiles/test-sc.txt
@@ -0,0 +1,2 @@
+ä½ è¢«å…³åœ¨ä¸€ä¸ªå°æˆ¿é—´é‡Œã€‚ä½ å¹¶ä¸è®°å¾—å‘ç”Ÿäº†ä»€ä¹ˆï¼Œä¹Ÿä¸çŸ¥é“ä¸ºä»€ä¹ˆè¢«å…³åœ¨è¿™é‡Œã€‚ä½ ä»¥å‰ä»Žæˆ¿é—¨çš„çª—å£é‚£å„¿å¾—åˆ°é£Ÿç‰©ï¼Œ
+ä½†æ˜¯ä½ ç”¨åŠ›æ•²é—¨æˆ–è€…å¤§å«éƒ½æ²¡æœ‰ç”¨ã€‚ä½ å†³å®šä¸€å®šè¦é€ƒè·‘ï¼Œè¦ä¸ç„¶æƒ…å†µå¯èƒ½ä¼šå˜æ›´ä¸å¥½ã€‚ 
diff --git a/bin/unit_tests/assets/textfiles/test-tc.txt b/bin/unit_tests/assets/textfiles/test-tc.txt
new file mode 100644
index 000000000..1f674c391
--- /dev/null
+++ b/bin/unit_tests/assets/textfiles/test-tc.txt
@@ -0,0 +1,2 @@
+ä½ è¢«é—œåœ¨ä¸€å€‹å°æˆ¿é–“è£¡ã€‚ä½ ä¸¦ä¸è¨˜å¾—ç™¼ç”Ÿäº†ä»€éº¼ï¼Œä¹Ÿä¸çŸ¥é“ç‚ºä»€éº¼è¢«é—œåœ¨é€™è£¡ã€‚
+ä½ ä»¥å‰å¾žæˆ¿é–€çš„çª—å£é‚£å…’å¾—åˆ°é£Ÿç‰©ï¼Œä½†æ˜¯ä½ ç”¨åŠ›æ•²é–€æˆ–è€…å¤§å«éƒ½æ²’æœ‰ç”¨ã€‚ä½ æ±ºå®šä¸€å®šè¦é€ƒè·‘ï¼Œè¦ä¸ç„¶æƒ…æ³å¯èƒ½æœƒè®Šæ›´ä¸å¥½
diff --git a/bin/unit_tests/assets/textfiles/test.xit b/bin/unit_tests/assets/textfiles/test.xit
new file mode 100644
index 000000000..23936e869
--- /dev/null
+++ b/bin/unit_tests/assets/textfiles/test.xit
@@ -0,0 +1,203 @@
+[ ] Open
+[x] Checked
+[@] Ongoing
+[~] Obsolete
+
+[*] Invalid
+[o] Invalid
+[X] Invalid (uppercase)
+[Â ] Invalid (non-breaking space)
+
+[] Invalid
+[  ] Invalid
+[ x ] Invalid
+[@@] Invalid
+
+ [x] Invalid
+    [x] Invalid
+
+[ ] Do this
+
+[ ]   Do this
+
+[ ]
+[ ]
+[ ]
+
+[ ]Invalid
+[ ]! Invalid
+[ ]. Invalid
+[ ]!!. Invalid
+[ ]#invalid
+[ ]->2022-02-16 Invalid
+
+[ ] This is a longer ...
+    description text
+[ ] And this one ...
+    is even ...
+    longer
+
+[ ] The following is just ...
+    [ ] description text
+
+[x] These lines ...
+    should all ...
+    look the same
+
+[ ] This has some ...
+       more spaces
+[ ] And this one ...
+             as well
+
+[ ] The next line is ...
+invalid
+[ ] The next line is ...
+ invalid
+[ ] The next line is ...
+   invalid
+[ ] The next line is ...
+	invalid (itâ€™s a tab)
+
+[ ] ! This is important
+[ ] !!! This is very important
+[ ] !!!!!!!!!! This super important
+
+[ ] ..! This is important
+[ ] !!. This is more important
+[ ] ... This is not important
+
+[ ] !   Do something
+[ ] .   Do something
+
+[ ]    ! Do something
+[ ]    . Do something
+
+[ ] .!. Invalid
+[ ] !.! Invalid
+
+[ ] !This has regular priority
+[ ] .The dot is not priority
+[ ] This is also
+    !!! not important
+
+[ ] ! !!! This is important!
+[ ] !! ! ! This ! is also important
+[ ] !. ... This . is also important
+[ ] . ! This is not important
+
+[ ] -> 2022-01-31
+[ ] Do this -> 2022-01-31
+[ ] -> 2022-01-31 (something)
+[ ] Do something until ...
+    -> 2022-01-31
+
+[ ] -> 2022-01-31
+[ ] -> 2022-01
+[ ] -> 2022
+[ ] -> 2022-W01
+[ ] -> 2022-Q1
+
+[ ] -> 2022/01/31
+[ ] -> 2022/W01
+
+[ ] -> 2022-01/31
+
+[ ] -> 2022-01-31 -> 2022-01-31
+
+[ ] Do this soon -> 2022-01-31!!!
+[ ] Do this (-> 2022-01-31)
+
+[ ] ---> 2022-01-31
+[ ] Due-> 2022-01-31
+[ ] -> 2022-01-31very urgent
+[ ] -> 2022-01-31T10:00
+[ ] -> 2022-01-31-0
+[ ] -> 2022/01/31/0
+
+[ ] ->2022-01-31
+[ ] â†’ 2022-01-31
+[ ] ->   2022-01-31
+[ ] >2022-01-31
+[ ] Do until ->
+    2022-01-31
+
+[ ] #tag
+[ ] #T-A-G
+[ ] #--tag--
+[ ] #__tag__
+[ ] #t_a_g
+[ ] #123
+[ ] #___
+[ ] #---
+[ ] #1t2a3g
+[ ] #tÃ¤g
+[ ] #ä»Šæ—¥ã¯
+[ ] #áƒ’áƒáƒ›áƒáƒ áƒ¯áƒáƒ‘áƒ
+
+[ ] This #text contains #tags
+[ ] #Actually, it #has a #LOT.
+    Even on the #next-line!
+
+[ ] This is a #tag.
+[ ] Tags: #tag1/#tag2
+[ ] #t-a-g!
+[ ] #--tag--?
+[ ] #--tag--:text
+[ ] (#tag)
+[ ] #tagðŸ¥³
+
+[ ] Not a tag: #
+
+[ ] #tag=value
+[ ] #t-a-g=v-a-l-u-e
+[ ] #å›½=æ—¥æœ¬
+
+[ ] #tag=
+[ ] #tag=""
+[ ] #tag=''
+
+[ ] #tag="v a l u e"
+[ ] #tag='v!a.l?u+e'
+[ ] #tag='foo'bar
+[ ] #tag='foo'-bar
+[ ] #tag='foo'!!
+[ ] (#tag="bar")
+
+[ ] #tag='It\'s great
+
+[ ] #tag="v a l u e
+[ ] #tag="v a l u e'
+[ ] #tag="hello
+    World!"
+
+[ ] Item 1 of group
+[ ] Item 2 of group
+
+[ ] Item of another group
+
+Todos
+[ ] Item 1
+[ ] Item 2
+
+Group 1
+[ ] Item
+
+Group 2
+[ ] Item
+
+Todos
+
+ Todos
+[ ] Do this
+
+    Todos
+[ ] Do this
+
+[Todos]
+[ ] Do this
+
+[ ] Do this
+Todos
+[ ] Do this
+
+
diff --git a/src/eepp/ui/doc/syntaxdefinitionmanager.cpp b/src/eepp/ui/doc/syntaxdefinitionmanager.cpp
index 47f0e337d..11779130a 100644
--- a/src/eepp/ui/doc/syntaxdefinitionmanager.cpp
+++ b/src/eepp/ui/doc/syntaxdefinitionmanager.cpp
@@ -1364,7 +1364,7 @@ const SyntaxDefinition& SyntaxDefinitionManager::getByHeader( std::string_view h
 				for ( const auto& hdr : definition->get()->getHeaders() ) {
 					LuaPattern words( hdr );
 					int start, end;
-					if ( words.find( header.data(), start, end ) ) {
+					if ( words.find( header.data(), start, end, 0, header.size(), 0 ) ) {
 						return *definition->get();
 					}
 				}
@@ -1466,7 +1466,8 @@ std::size_t SyntaxDefinitionManager::count() const {
 
 bool SyntaxDefinitionManager::isFileFormatSupported( const std::string& filePath,
 													 std::string_view header ) {
-	return &find( filePath, header ) != mDefinitions[0].get();
+	return &find( filePath, header ) != mDefinitions[0].get() ||
+		   FileSystem::fileExtension( filePath ) == "txt";
 }
 
 void SyntaxDefinitionManager::resetFileAssociations() {
diff --git a/src/eepp/ui/doc/textdocument.cpp b/src/eepp/ui/doc/textdocument.cpp
index 762925cba..e67147c68 100644
--- a/src/eepp/ui/doc/textdocument.cpp
+++ b/src/eepp/ui/doc/textdocument.cpp
@@ -28,6 +28,8 @@ static constexpr char DEFAULT_NON_WORD_CHARS[] = " \t\n/\\()\"':,.;<>~!@#$%^&*|+
 
 static UnorderedSet<String::HashType> TEXT_DOCUMENT_COMMANDS = {};
 
+#include <string_view> // Ensure this is included for std::string_view
+
 bool TextDocument::fileMightBeBinary( const std::string& file ) {
 	static constexpr size_t MAX_READ = 4096;
 	static constexpr std::array<char, 4> NULL_SEQUENCE = { 0, 0, 0, 0 };
@@ -36,7 +38,14 @@ bool TextDocument::fileMightBeBinary( const std::string& file ) {
 	static constexpr std::array<char, 4> ELF_MAGIC = { 0x7F, 'E', 'L', 'F' };
 	static constexpr std::array<char, 4> PNG_MAGIC = { (char)0x89, 'P', 'N', 'G' };
 	static constexpr std::array<char, 5> PDF_MAGIC = { '%', 'P', 'D', 'F', '-' };
-	// UTF-16/UTF-32 BOMs (to avoid misclassifying as binary)
+	static constexpr std::array<char, 4> ZIP_MAGIC = { 'P', 'K', (char)0x03,
+													   (char)0x04 }; // Standard ZIP
+	static constexpr std::array<char, 4> ZIP_EMPTY = { 'P', 'K', (char)0x05,
+													   (char)0x06 }; // Empty ZIP
+	static constexpr std::array<char, 4> ZIP_SPANNED = { 'P', 'K', (char)0x07,
+														 (char)0x08 }; // Spanned ZIP
+	// UTF-8/UTF-16/UTF-32 BOMs (to avoid misclassifying as binary)
+	static constexpr std::array<char, 3> UTF8_BOM = { (char)0xEF, (char)0xBB, (char)0xBF };
 	static constexpr std::array<char, 2> UTF16BE_BOM = { (char)0xFE, (char)0xFF };
 	static constexpr std::array<char, 2> UTF16LE_BOM = { (char)0xFF, (char)0xFE };
 	static constexpr std::array<char, 4> UTF32BE_BOM = { (char)0x00, (char)0x00, (char)0xFE,
@@ -57,6 +66,9 @@ bool TextDocument::fileMightBeBinary( const std::string& file ) {
 	}
 
 	// Check for text encoding BOMs (indicates text file)
+	if ( bytesRead >= 3 && std::equal( UTF8_BOM.begin(), UTF8_BOM.end(), buffer.begin() ) ) {
+		return false; // UTF-8 text file
+	}
 	if ( bytesRead >= 2 ) {
 		if ( std::equal( UTF16BE_BOM.begin(), UTF16BE_BOM.end(), buffer.begin() ) ||
 			 std::equal( UTF16LE_BOM.begin(), UTF16LE_BOM.end(), buffer.begin() ) ) {
@@ -70,10 +82,13 @@ bool TextDocument::fileMightBeBinary( const std::string& file ) {
 		}
 	}
 
-	// Check for binary magic numbers
+	// Check for known binary magic numbers (ELF, PNG, PDF, ZIP)
 	if ( bytesRead >= 4 ) {
 		if ( std::equal( ELF_MAGIC.begin(), ELF_MAGIC.end(), buffer.begin() ) ||
 			 std::equal( PNG_MAGIC.begin(), PNG_MAGIC.end(), buffer.begin() ) ||
+			 std::equal( ZIP_MAGIC.begin(), ZIP_MAGIC.end(), buffer.begin() ) ||
+			 std::equal( ZIP_EMPTY.begin(), ZIP_EMPTY.end(), buffer.begin() ) ||
+			 std::equal( ZIP_SPANNED.begin(), ZIP_SPANNED.end(), buffer.begin() ) ||
 			 ( bytesRead >= 5 &&
 			   std::equal( PDF_MAGIC.begin(), PDF_MAGIC.end(), buffer.begin() ) ) ) {
 			return true; // Known binary file type
@@ -89,21 +104,187 @@ bool TextDocument::fileMightBeBinary( const std::string& file ) {
 		}
 	}
 
+	// Check if the buffer is valid in common text encodings (without BOM)
+	auto isValidUtf8 = []( const char* data, size_t len ) -> bool { // true if data[0..len) is well-formed UTF-8
+		const unsigned char* udata = reinterpret_cast<const unsigned char*>( data );
+		size_t i = 0;
+		while ( i < len ) {
+			if ( udata[i] <= 0x7F ) { // ASCII
+				++i;
+				continue;
+			}
+			if ( udata[i] >= 0xC2 && udata[i] <= 0xDF ) { // 2-byte sequence
+				if ( i + 1 >= len || udata[i + 1] < 0x80 || udata[i + 1] > 0xBF ) {
+					return false; // cut off at the end of the buffer, or bad continuation byte
+				}
+				i += 2;
+				continue;
+			}
+			if ( udata[i] >= 0xE0 && udata[i] <= 0xEF ) { // 3-byte sequence
+				if ( i + 2 >= len ) {
+					return false; // sequence runs past the end of the buffer
+				}
+				if ( udata[i + 1] < ( udata[i] == 0xE0 ? 0xA0 : 0x80 ) || // E0: no overlongs
+					 udata[i + 1] > ( udata[i] == 0xED ? 0x9F : 0xBF ) || udata[i + 2] < 0x80 ||
+					 udata[i + 2] > 0xBF ) { // ED: second byte capped at 9F (no surrogates)
+					return false;
+				}
+				i += 3;
+				continue;
+			}
+			if ( udata[i] >= 0xF0 && udata[i] <= 0xF4 ) { // 4-byte sequence
+				if ( i + 3 >= len ) {
+					return false; // sequence runs past the end of the buffer
+				}
+				if ( udata[i + 1] < ( udata[i] == 0xF0 ? 0x90 : 0x80 ) || // F0: no overlongs
+					 udata[i + 1] > ( udata[i] == 0xF4 ? 0x8F : 0xBF ) || udata[i + 2] < 0x80 ||
+					 udata[i + 2] > 0xBF || udata[i + 3] < 0x80 || udata[i + 3] > 0xBF ) {
+					return false; // F4: second byte capped at 8F (nothing above U+10FFFF)
+				}
+				i += 4;
+				continue;
+			}
+			return false; // 0x80-0xC1 or 0xF5-0xFF can never start a sequence
+		}
+		return true;
+	};
+
+	auto isValidUtf16LE = []( const char* data, size_t len ) -> bool { // false only on a mispaired surrogate
+		const unsigned char* udata = reinterpret_cast<const unsigned char*>( data );
+		if ( len < 2 )
+			return true; // less than one code unit: nothing to reject
+		len -= len % 2; // ignore a trailing odd byte
+		size_t i = 0;
+		while ( i < len ) {
+			Uint16 word = // little-endian: low byte first
+				static_cast<Uint16>( udata[i] ) | ( static_cast<Uint16>( udata[i + 1] ) << 8 );
+			i += 2;
+			if ( word >= 0xD800 && word <= 0xDBFF ) { // High surrogate
+				if ( i >= len )
+					return false; // high surrogate is the last unit in the buffer
+				Uint16 next =
+					static_cast<Uint16>( udata[i] ) | ( static_cast<Uint16>( udata[i + 1] ) << 8 );
+				if ( next < 0xDC00 || next > 0xDFFF )
+					return false; // not followed by a low surrogate
+				i += 2;
+			} else if ( word >= 0xDC00 && word <= 0xDFFF ) { // Low surrogate without high
+				return false;
+			}
+		}
+		return true;
+	};
+
+	auto isValidUtf16BE = []( const char* data, size_t len ) -> bool { // false only on a mispaired surrogate
+		const unsigned char* udata = reinterpret_cast<const unsigned char*>( data );
+		if ( len < 2 )
+			return true; // less than one code unit: nothing to reject
+		len -= len % 2; // ignore a trailing odd byte
+		size_t i = 0;
+		while ( i < len ) {
+			Uint16 word = // big-endian: high byte first
+				( static_cast<Uint16>( udata[i] ) << 8 ) | static_cast<Uint16>( udata[i + 1] );
+			i += 2;
+			if ( word >= 0xD800 && word <= 0xDBFF ) { // High surrogate
+				if ( i >= len )
+					return false; // high surrogate is the last unit in the buffer
+				Uint16 next =
+					( static_cast<Uint16>( udata[i] ) << 8 ) | static_cast<Uint16>( udata[i + 1] );
+				if ( next < 0xDC00 || next > 0xDFFF )
+					return false; // not followed by a low surrogate
+				i += 2;
+			} else if ( word >= 0xDC00 && word <= 0xDFFF ) { // Low surrogate without high
+				return false;
+			}
+		}
+		return true;
+	};
+
+	auto isValidUtf32LE = []( const char* data, size_t len ) -> bool { // every 4-byte unit must be a Unicode scalar value
+		const unsigned char* udata = reinterpret_cast<const unsigned char*>( data );
+		if ( len < 4 )
+			return true; // less than one code unit: nothing to reject
+		len -= len % 4; // ignore trailing bytes that do not form a full unit
+		for ( size_t i = 0; i < len; i += 4 ) {
+			Uint32 code = static_cast<Uint32>( udata[i] ) | // little-endian: lowest byte first
+						  ( static_cast<Uint32>( udata[i + 1] ) << 8 ) |
+						  ( static_cast<Uint32>( udata[i + 2] ) << 16 ) |
+						  ( static_cast<Uint32>( udata[i + 3] ) << 24 );
+			if ( code > 0x10FFFF || ( code >= 0xD800 && code <= 0xDFFF ) ) { // out of range or surrogate
+				return false;
+			}
+		}
+		return true;
+	};
+
+	auto isValidUtf32BE = []( const char* data, size_t len ) -> bool { // every 4-byte unit must be a Unicode scalar value
+		const unsigned char* udata = reinterpret_cast<const unsigned char*>( data );
+		if ( len < 4 )
+			return true; // less than one code unit: nothing to reject
+		len -= len % 4; // ignore trailing bytes that do not form a full unit
+		for ( size_t i = 0; i < len; i += 4 ) {
+			Uint32 code = static_cast<Uint32>( udata[i + 3] ) | // big-endian: lowest byte last
+						  ( static_cast<Uint32>( udata[i + 2] ) << 8 ) |
+						  ( static_cast<Uint32>( udata[i + 1] ) << 16 ) |
+						  ( static_cast<Uint32>( udata[i] ) << 24 );
+			if ( code > 0x10FFFF || ( code >= 0xD800 && code <= 0xDFFF ) ) { // out of range or surrogate
+				return false;
+			}
+		}
+		return true;
+	};
+
+	// Shannon entropy of the byte histogram, in bits per byte (0..8); used to detect binary files
+	auto calculateEntropy = []( const char* data, size_t len ) -> double {
+		std::array<size_t, 256> freq = { 0 }; // occurrence count per byte value
+		for ( size_t i = 0; i < len; ++i ) {
+			freq[static_cast<unsigned char>( data[i] )]++;
+		}
+		double entropy = 0.0;
+		for ( size_t i = 0; i < 256; ++i ) {
+			if ( freq[i] > 0 ) { // skip absent bytes; this also means len == 0 never divides
+				double p = static_cast<double>( freq[i] ) / len;
+				entropy -= p * std::log2( p );
+			}
+		}
+		return entropy; // 0.0 for an empty buffer or a single repeated byte
+	};
+
+	bool isFileFormatSupported = SyntaxDefinitionManager::instance()->isFileFormatSupported(
+		file, std::string_view{ buffer.data(), bytesRead } );
+
 	// Check proportion of non-printable characters
 	size_t nonPrintableCount = 0;
 	for ( size_t i = 0; i < bytesRead; ++i ) {
-		if ( buffer[i] < 32 && buffer[i] != '\n' && buffer[i] != '\r' && buffer[i] != '\t' ) {
+		unsigned char uch = static_cast<unsigned char>( buffer[i] );
+		if ( uch < 32 && uch != '\n' && uch != '\r' && uch != '\t' )
 			++nonPrintableCount;
-		}
 	}
 
-	// Consider file binary if >20% of characters are non-printable
-	if ( nonPrintableCount > bytesRead * 0.2 ) {
-		// Also white-list known extensions
-		if ( !SyntaxDefinitionManager::instance()->isFileFormatSupported(
-				 file, std::string_view{ buffer.data(), buffer.size() } ) ) {
-			return true;
+	// Check if the buffer is valid in common text encodings
+	bool validUtf8 = isValidUtf8( buffer.data(), bytesRead );
+	if ( validUtf8 || isValidUtf16LE( buffer.data(), bytesRead ) ||
+		 isValidUtf16BE( buffer.data(), bytesRead ) || isValidUtf32LE( buffer.data(), bytesRead ) ||
+		 isValidUtf32BE( buffer.data(), bytesRead ) ) {
+		// Even if valid text encoding, check non-printable characters
+		if ( nonPrintableCount > bytesRead * 0.2 &&
+			 !isFileFormatSupported ) { // 20% threshold for text encodings
+			return true;				// Likely binary due to non-printable chars
 		}
+		// For valid UTF-8, check entropy to catch binary files with valid UTF-8 sequences
+		if ( validUtf8 ) {
+			double entropy = calculateEntropy( buffer.data(), bytesRead );
+			// Binary files typically have higher entropy (>6.5 bits) than text
+			if ( entropy > 6.5 && !isFileFormatSupported ) {
+				return true; // Likely binary due to high entropy
+			}
+		}
+		return false; // Valid text encoding, treat as text
+	}
+
+	// For non-text encodings, check non-printable characters
+	if ( nonPrintableCount > bytesRead * 0.1 &&
+		 !isFileFormatSupported ) { // 10% threshold for non-text encodings
+		return true;				// Likely binary due to extension and non-printable chars
 	}
 
 	return false; // Likely a text file
diff --git a/src/tests/unit_tests/textdocument.cpp b/src/tests/unit_tests/textdocument.cpp
index 41aa57e7d..c30c3982b 100644
--- a/src/tests/unit_tests/textdocument.cpp
+++ b/src/tests/unit_tests/textdocument.cpp
@@ -74,3 +74,14 @@ UTEST( TextDocument, multicursor ) {
 	doc.resetUndoRedo();
 	doc.resetSelection( TextRange{ { 0, 0 }, { 0, 0 } } );
 }
+
+UTEST( TextDocument, fileMightBeBinary ) { // no entry listed under assets/textfiles may be flagged as binary
+	FileSystem::changeWorkingDirectory( Sys::getProcessPath() ); // the relative asset path below is resolved from here
+	auto files = FileSystem::filesInfoGetInPath( "assets/textfiles" );
+	for ( const auto& file : files ) {
+		EXPECT_FALSE_MSG(
+			TextDocument::fileMightBeBinary( file.getFilepath() ),
+			String::format( "File %s should be detected as text file", file.getFilepath() )
+				.c_str() );
+	}
+}
diff --git a/src/tests/unit_tests/textformat.cpp b/src/tests/unit_tests/textformat.cpp
index 4bef034d7..c94116e0a 100644
--- a/src/tests/unit_tests/textformat.cpp
+++ b/src/tests/unit_tests/textformat.cpp
@@ -4,6 +4,7 @@
 #include <eepp/system/iostreamfile.hpp>
 #include <eepp/system/luapattern.hpp>
 #include <eepp/system/sys.hpp>
+#include <eepp/ui/doc/textdocument.hpp>
 #include <eepp/ui/doc/textformat.hpp>
 
 using namespace EE::Graphics;
@@ -82,11 +83,20 @@ UTEST( TextFormat, autodetectProject ) {
 				continue;
 			}
 			auto extension = file.getExtension();
+			bool fromSDLFolder = false;
 			if ( "a" == extension || "zip" == extension || "dll" == extension ||
 				 "dat" == extension || "cur" == extension || "icns" == extension ||
 				 "wav" == extension || Image::isImageExtension( file.getFilepath() ) ||
-				 LuaPattern::hasMatches( file.getFilepath(), "SDL2%-%d+%.%d+%.%d+" ) )
+				 ( fromSDLFolder =
+					   LuaPattern::hasMatches( file.getFilepath(), "SDL2%-%d+%.%d+%.%d+" ) ) ) {
+				if ( !fromSDLFolder && "dat" != extension ) {
+					EXPECT_TRUE_MSG( TextDocument::fileMightBeBinary( file.getFilepath() ),
+									 String::format( "File %s should be detected as binary file",
+													 file.getFilepath() )
+										 .c_str() );
+				}
 				continue;
+			}
 			IOStreamFile stream( file.getFilepath() );
 			auto expectedEncoding = getEncoding( file.getFileName() );
 			auto textFormat = TextFormat::autodetect( stream );
@@ -95,6 +105,10 @@ UTEST( TextFormat, autodetectProject ) {
 										   TextFormat::encodingToString( textFormat.encoding ),
 										   TextFormat::encodingToString( expectedEncoding ) )
 							   .c_str() );
+			EXPECT_FALSE_MSG(
+				TextDocument::fileMightBeBinary( file.getFilepath() ),
+				String::format( "File %s should be detected as text file", file.getFilepath() )
+					.c_str() );
 		}
 	};
 	checkFolder( projectRoot );