From d75c0991cc7bb912c87a3cb3fdaf846e5e29fe5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mart=C3=ADn=20Lucas=20Golini?= Date: Sun, 12 Oct 2025 14:04:21 -0300 Subject: [PATCH] Some improvements to `TextDocument::fileMightBeBinary` and added some unit tests for it. --- .ecode/project_build.json | 2 +- .../assets/textfiles/test-arabic.txt | 1 + .../assets/textfiles/test-bengali.txt | 24 +++ .../assets/textfiles/test-emoji.txt | 15 ++ .../assets/textfiles/test-flags.txt | 4 + .../textfiles/test-j-shift_jis.copy.txt | 6 + .../assets/textfiles/test-j-shift_jis.txt | 8 + bin/unit_tests/assets/textfiles/test-j.txt | 6 + bin/unit_tests/assets/textfiles/test-k.txt | 2 + bin/unit_tests/assets/textfiles/test-sc.txt | 2 + bin/unit_tests/assets/textfiles/test-tc.txt | 2 + bin/unit_tests/assets/textfiles/test.xit | 203 ++++++++++++++++++ src/eepp/ui/doc/syntaxdefinitionmanager.cpp | 5 +- src/eepp/ui/doc/textdocument.cpp | 201 ++++++++++++++++- src/tests/unit_tests/textdocument.cpp | 11 + src/tests/unit_tests/textformat.cpp | 16 +- 16 files changed, 494 insertions(+), 14 deletions(-) create mode 100644 bin/unit_tests/assets/textfiles/test-arabic.txt create mode 100644 bin/unit_tests/assets/textfiles/test-bengali.txt create mode 100644 bin/unit_tests/assets/textfiles/test-emoji.txt create mode 100644 bin/unit_tests/assets/textfiles/test-flags.txt create mode 100644 bin/unit_tests/assets/textfiles/test-j-shift_jis.copy.txt create mode 100644 bin/unit_tests/assets/textfiles/test-j-shift_jis.txt create mode 100644 bin/unit_tests/assets/textfiles/test-j.txt create mode 100644 bin/unit_tests/assets/textfiles/test-k.txt create mode 100644 bin/unit_tests/assets/textfiles/test-sc.txt create mode 100644 bin/unit_tests/assets/textfiles/test-tc.txt create mode 100644 bin/unit_tests/assets/textfiles/test.xit diff --git a/.ecode/project_build.json b/.ecode/project_build.json index b276eed4f..fb6c399d1 100644 --- a/.ecode/project_build.json +++ b/.ecode/project_build.json @@ -217,7 +217,7 @@ }, { 
"args": "", - "command": "${project_root}/bin/unit_tests/eepp-unit-tests-debug", + "command": "${project_root}/bin/unit_tests/eepp-unit_tests-debug", "name": "eepp-unit_tests-debug", "run_in_terminal": true, "working_dir": "${project_root}/bin/unit_tests/" diff --git a/bin/unit_tests/assets/textfiles/test-arabic.txt b/bin/unit_tests/assets/textfiles/test-arabic.txt new file mode 100644 index 000000000..646cae5e6 --- /dev/null +++ b/bin/unit_tests/assets/textfiles/test-arabic.txt @@ -0,0 +1 @@ +اسکم شاخ و دم نداره همین که کاربر شبکه خودت. کسی که رو شبکه تو فی داده سالها زحمت کشیده رو نادیده میگیری و به کاربر یه شبکه دیگه توکن سنگین میدی میشه اسکم علنی. باید کاری باهاش کنیم که مثل استارک به غلط کردن بیافته اره تو endgame هستی اخر اسکمرایی diff --git a/bin/unit_tests/assets/textfiles/test-bengali.txt b/bin/unit_tests/assets/textfiles/test-bengali.txt new file mode 100644 index 000000000..5f6394e9c --- /dev/null +++ b/bin/unit_tests/assets/textfiles/test-bengali.txt @@ -0,0 +1,24 @@ +Hello: হ্যালো / নমস্কার +Good morning: সুপ্রভাত +Good night: শুভ রাত্রি +Thank you: ধন্যবাদ +You're welcome: আপনি স্বাগত জানাই +Yes / No: হ্যাঁ / না +Please: অনুগ্রহ করে +Excuse me / Sorry: মাফ করবেন / দুঃখিত +How are you?: আপনি কেমন আছেন? +I'm fine. And you?: আমি ভালো আছি। এবং আপনি? +What's your name?: আপনার নাম কি? +My name is...: আমার নাম... +Nice to meet you: আপনার সাথে দেখা করে খুশি +Where are you from?: আপনি কোথা থেকে এসেছেন? +I'm from...: আমি ... থেকে এসেছি। +Do you speak English?: আপনি কি ইংরেজি বলতে পারেন? +I don't understand: আমি বুঝতে পারছি না। +Please speak more slowly: অনুগ্রহ করে ধীরে বলুন। +Please write it down: অনুগ্রহ করে এটি লিখে দিন। +How much is this?: এটার দাম কত? +Where is the bathroom?: বাথরুম কোথায়? +Help!: বাঁচাও! +Stop!: থামুন! +Call the police!: পুলিশ ডাকুন! 
diff --git a/bin/unit_tests/assets/textfiles/test-emoji.txt b/bin/unit_tests/assets/textfiles/test-emoji.txt new file mode 100644 index 000000000..067e07e84 --- /dev/null +++ b/bin/unit_tests/assets/textfiles/test-emoji.txt @@ -0,0 +1,15 @@ +# subgroup: face-smiling +1F600 ; fully-qualified # 😀 E1.0 grinning face +1F603 ; fully-qualified # 😃 E0.6 grinning face with big eyes +1F604 ; fully-qualified # 😄 E0.6 grinning face with smiling eyes +1F601 ; fully-qualified # 😁 E0.6 beaming face with smiling eyes +1F606 ; fully-qualified # 😆 E0.6 grinning squinting face +1F605 ; fully-qualified # 😅 E0.6 grinning face with sweat +1F923 ; fully-qualified # 🤣 E3.0 rolling on the floor laughing +1F602 ; fully-qualified # 😂 E0.6 face with tears of joy +1F642 ; fully-qualified # 🙂 E1.0 slightly smiling face +1F643 ; fully-qualified # 🙃 E1.0 upside-down face +1FAE0 ; fully-qualified # 🫠 E14.0 melting face +1F609 ; fully-qualified # 😉 E0.6 winking face +1F60A ; fully-qualified # 😊 E0.6 smiling face with smiling eyes +1F607 ; fully-qualified # 😇 E1.0 smiling face with halo diff --git a/bin/unit_tests/assets/textfiles/test-flags.txt b/bin/unit_tests/assets/textfiles/test-flags.txt new file mode 100644 index 000000000..85cba7709 --- /dev/null +++ b/bin/unit_tests/assets/textfiles/test-flags.txt @@ -0,0 +1,4 @@ +🇦🇷🇦🇷🇦🇷🇦🇷🇦🇷 +🇦🇷 🇦🇷 🇦🇷 🇦🇷 + +🇦🇷🇦🇷🇦🇷 Awante 🇦🇷 Argentina 🇦🇷🇦🇷🇦🇷 diff --git a/bin/unit_tests/assets/textfiles/test-j-shift_jis.copy.txt b/bin/unit_tests/assets/textfiles/test-j-shift_jis.copy.txt new file mode 100644 index 000000000..c5a9698dd --- /dev/null +++ b/bin/unit_tests/assets/textfiles/test-j-shift_jis.copy.txt @@ -0,0 +1,6 @@ +j[j[, ɂ + / +RR / 񂱂 + +ɂ́A̖O̓}[eBłB Ȃ̖O͉łH +̓A[`ɏZłARs[^[ vO}[łB EƂ͂ȂłH diff --git a/bin/unit_tests/assets/textfiles/test-j-shift_jis.txt b/bin/unit_tests/assets/textfiles/test-j-shift_jis.txt new file mode 100644 index 000000000..daffbeb5f --- /dev/null +++ b/bin/unit_tests/assets/textfiles/test-j-shift_jis.txt @@ -0,0 +1,8 @@ +j[j[, ɂ + / +RR / 񂱂 + 
+esto anda + +ɂ́A̖O̓}[eBłB Ȃ̖O͉łH +̓A[`ɏZłARs[^[ vO}[łB EƂ͂ȂłH diff --git a/bin/unit_tests/assets/textfiles/test-j.txt b/bin/unit_tests/assets/textfiles/test-j.txt new file mode 100644 index 000000000..fb4c13543 --- /dev/null +++ b/bin/unit_tests/assets/textfiles/test-j.txt @@ -0,0 +1,6 @@ +ニャーニャー, にゃん +ワンワン / わんわん +コンコン / こんこん + +こんにちは、私の名前はマーティンです。 あなたの名前は何ですか? +私はアルゼンチンに住んでおり、コンピューター プログラマーです。 職業はなんですか? diff --git a/bin/unit_tests/assets/textfiles/test-k.txt b/bin/unit_tests/assets/textfiles/test-k.txt new file mode 100644 index 000000000..ae833ab57 --- /dev/null +++ b/bin/unit_tests/assets/textfiles/test-k.txt @@ -0,0 +1,2 @@ +안녕하세요 오늘 강의해 드릴 내용은 12 안마당 빌드입니다. 12 안마당의 종류와 장단점들 그리고 빌드 오더를 간단하지만 자세하게 알려드리려고 합니다. 토스전에서는 가장 부유하게 시작하고 싶을 때 사용하는 빌드고요. 테란전에서는 12 안마당으로 할 수 있는 빌드가 여러가지가 있습니다. 그래서 가장 많이 사용하는 빌드들을 몇가지 알려드리려고 합니다. 첫 번째로 투에처리 빌드인데 12 안마당으로 시작하는 빌드입니다. 12 안마당 11 스포닝풀 10가스 이제 빠른 가스를 활용한 빌드인데요. 이 빌드는 투에처리 빌드를 하실때 3에처리를 빠르게 3가스 멀티에 가져가면서 플레이를 할 때 많이 사용을 하고요. 두번째로 12압 12풀 12가스 적당히 빠른 테크트리와 적당히 빠른 3에처리 빌드입니다. 12압 12가스 적당히 빠른 테크트리와 적당히 빠른 3에처리 빌드입니다. 이 빌드 같은 경우는 흔히들 말하는 안 3에처리라고 많이들 얘기를 하는데 뮤탈리스크도 빠르고 3에처리도 빠른 그런 빌드라고 생각하시면 되요. 
+ diff --git a/bin/unit_tests/assets/textfiles/test-sc.txt b/bin/unit_tests/assets/textfiles/test-sc.txt new file mode 100644 index 000000000..f794590f4 --- /dev/null +++ b/bin/unit_tests/assets/textfiles/test-sc.txt @@ -0,0 +1,2 @@ +你被关在一个小房间里。你并不记得发生了什么,也不知道为什么被关在这里。你以前从房门的窗口那儿得到食物, +但是你用力敲门或者大叫都没有用。你决定一定要逃跑,要不然情况可能会变更不好。 diff --git a/bin/unit_tests/assets/textfiles/test-tc.txt b/bin/unit_tests/assets/textfiles/test-tc.txt new file mode 100644 index 000000000..1f674c391 --- /dev/null +++ b/bin/unit_tests/assets/textfiles/test-tc.txt @@ -0,0 +1,2 @@ +你被關在一個小房間裡。你並不記得發生了什麼,也不知道為什麼被關在這裡。 +你以前從房門的窗口那兒得到食物,但是你用力敲門或者大叫都沒有用。你決定一定要逃跑,要不然情況可能會變更不好 diff --git a/bin/unit_tests/assets/textfiles/test.xit b/bin/unit_tests/assets/textfiles/test.xit new file mode 100644 index 000000000..23936e869 --- /dev/null +++ b/bin/unit_tests/assets/textfiles/test.xit @@ -0,0 +1,203 @@ +[ ] Open +[x] Checked +[@] Ongoing +[~] Obsolete + +[*] Invalid +[o] Invalid +[X] Invalid (uppercase) +[ ] Invalid (non-breaking space) + +[] Invalid +[ ] Invalid +[ x ] Invalid +[@@] Invalid + + [x] Invalid + [x] Invalid + +[ ] Do this + +[ ] Do this + +[ ] +[ ] +[ ] + +[ ]Invalid +[ ]! Invalid +[ ]. Invalid +[ ]!!. Invalid +[ ]#invalid +[ ]->2022-02-16 Invalid + +[ ] This is a longer ... + description text +[ ] And this one ... + is even ... + longer + +[ ] The following is just ... + [ ] description text + +[x] These lines ... + should all ... + look the same + +[ ] This has some ... + more spaces +[ ] And this one ... + as well + +[ ] The next line is ... +invalid +[ ] The next line is ... + invalid +[ ] The next line is ... + invalid +[ ] The next line is ... + invalid (it’s a tab) + +[ ] ! This is important +[ ] !!! This is very important +[ ] !!!!!!!!!! This super important + +[ ] ..! This is important +[ ] !!. This is more important +[ ] ... This is not important + +[ ] ! Do something +[ ] . Do something + +[ ] ! Do something +[ ] . Do something + +[ ] .!. Invalid +[ ] !.! 
Invalid + +[ ] !This has regular priority +[ ] .The dot is not priority +[ ] This is also + !!! not important + +[ ] ! !!! This is important! +[ ] !! ! ! This ! is also important +[ ] !. ... This . is also important +[ ] . ! This is not important + +[ ] -> 2022-01-31 +[ ] Do this -> 2022-01-31 +[ ] -> 2022-01-31 (something) +[ ] Do something until ... + -> 2022-01-31 + +[ ] -> 2022-01-31 +[ ] -> 2022-01 +[ ] -> 2022 +[ ] -> 2022-W01 +[ ] -> 2022-Q1 + +[ ] -> 2022/01/31 +[ ] -> 2022/W01 + +[ ] -> 2022-01/31 + +[ ] -> 2022-01-31 -> 2022-01-31 + +[ ] Do this soon -> 2022-01-31!!! +[ ] Do this (-> 2022-01-31) + +[ ] ---> 2022-01-31 +[ ] Due-> 2022-01-31 +[ ] -> 2022-01-31very urgent +[ ] -> 2022-01-31T10:00 +[ ] -> 2022-01-31-0 +[ ] -> 2022/01/31/0 + +[ ] ->2022-01-31 +[ ] → 2022-01-31 +[ ] -> 2022-01-31 +[ ] >2022-01-31 +[ ] Do until -> + 2022-01-31 + +[ ] #tag +[ ] #T-A-G +[ ] #--tag-- +[ ] #__tag__ +[ ] #t_a_g +[ ] #123 +[ ] #___ +[ ] #--- +[ ] #1t2a3g +[ ] #täg +[ ] #今日は +[ ] #გამარჯობა + +[ ] This #text contains #tags +[ ] #Actually, it #has a #LOT. + Even on the #next-line! + +[ ] This is a #tag. +[ ] Tags: #tag1/#tag2 +[ ] #t-a-g! +[ ] #--tag--? +[ ] #--tag--:text +[ ] (#tag) +[ ] #tag🥳 + +[ ] Not a tag: # + +[ ] #tag=value +[ ] #t-a-g=v-a-l-u-e +[ ] #国=日本 + +[ ] #tag= +[ ] #tag="" +[ ] #tag='' + +[ ] #tag="v a l u e" +[ ] #tag='v!a.l?u+e' +[ ] #tag='foo'bar +[ ] #tag='foo'-bar +[ ] #tag='foo'!! +[ ] (#tag="bar") + +[ ] #tag='It\'s great + +[ ] #tag="v a l u e +[ ] #tag="v a l u e' +[ ] #tag="hello + World!" 
+ +[ ] Item 1 of group +[ ] Item 2 of group + +[ ] Item of another group + +Todos +[ ] Item 1 +[ ] Item 2 + +Group 1 +[ ] Item + +Group 2 +[ ] Item + +Todos + + Todos +[ ] Do this + + Todos +[ ] Do this + +[Todos] +[ ] Do this + +[ ] Do this +Todos +[ ] Do this + + diff --git a/src/eepp/ui/doc/syntaxdefinitionmanager.cpp b/src/eepp/ui/doc/syntaxdefinitionmanager.cpp index 47f0e337d..11779130a 100644 --- a/src/eepp/ui/doc/syntaxdefinitionmanager.cpp +++ b/src/eepp/ui/doc/syntaxdefinitionmanager.cpp @@ -1364,7 +1364,7 @@ const SyntaxDefinition& SyntaxDefinitionManager::getByHeader( std::string_view h for ( const auto& hdr : definition->get()->getHeaders() ) { LuaPattern words( hdr ); int start, end; - if ( words.find( header.data(), start, end ) ) { + if ( words.find( header.data(), start, end, 0, header.size(), 0 ) ) { return *definition->get(); } } @@ -1466,7 +1466,8 @@ std::size_t SyntaxDefinitionManager::count() const { bool SyntaxDefinitionManager::isFileFormatSupported( const std::string& filePath, std::string_view header ) { - return &find( filePath, header ) != mDefinitions[0].get(); + return &find( filePath, header ) != mDefinitions[0].get() || + FileSystem::fileExtension( filePath ) == "txt"; } void SyntaxDefinitionManager::resetFileAssociations() { diff --git a/src/eepp/ui/doc/textdocument.cpp b/src/eepp/ui/doc/textdocument.cpp index 762925cba..e67147c68 100644 --- a/src/eepp/ui/doc/textdocument.cpp +++ b/src/eepp/ui/doc/textdocument.cpp @@ -28,6 +28,8 @@ static constexpr char DEFAULT_NON_WORD_CHARS[] = " \t\n/\\()\"':,.;<>~!@#$%^&*|+ static UnorderedSet TEXT_DOCUMENT_COMMANDS = {}; +#include // Ensure this is included for std::string_view + bool TextDocument::fileMightBeBinary( const std::string& file ) { static constexpr size_t MAX_READ = 4096; static constexpr std::array NULL_SEQUENCE = { 0, 0, 0, 0 }; @@ -36,7 +38,14 @@ bool TextDocument::fileMightBeBinary( const std::string& file ) { static constexpr std::array ELF_MAGIC = { 0x7F, 'E', 'L', 'F' }; 
static constexpr std::array PNG_MAGIC = { (char)0x89, 'P', 'N', 'G' }; static constexpr std::array PDF_MAGIC = { '%', 'P', 'D', 'F', '-' }; - // UTF-16/UTF-32 BOMs (to avoid misclassifying as binary) + static constexpr std::array ZIP_MAGIC = { 'P', 'K', (char)0x03, + (char)0x04 }; // Standard ZIP + static constexpr std::array ZIP_EMPTY = { 'P', 'K', (char)0x05, + (char)0x06 }; // Empty ZIP + static constexpr std::array ZIP_SPANNED = { 'P', 'K', (char)0x07, + (char)0x08 }; // Spanned ZIP + // UTF-8/UTF-16/UTF-32 BOMs (to avoid misclassifying as binary) + static constexpr std::array UTF8_BOM = { (char)0xEF, (char)0xBB, (char)0xBF }; static constexpr std::array UTF16BE_BOM = { (char)0xFE, (char)0xFF }; static constexpr std::array UTF16LE_BOM = { (char)0xFF, (char)0xFE }; static constexpr std::array UTF32BE_BOM = { (char)0x00, (char)0x00, (char)0xFE, @@ -57,6 +66,9 @@ bool TextDocument::fileMightBeBinary( const std::string& file ) { } // Check for text encoding BOMs (indicates text file) + if ( bytesRead >= 3 && std::equal( UTF8_BOM.begin(), UTF8_BOM.end(), buffer.begin() ) ) { + return false; // UTF-8 text file + } if ( bytesRead >= 2 ) { if ( std::equal( UTF16BE_BOM.begin(), UTF16BE_BOM.end(), buffer.begin() ) || std::equal( UTF16LE_BOM.begin(), UTF16LE_BOM.end(), buffer.begin() ) ) { @@ -70,10 +82,13 @@ bool TextDocument::fileMightBeBinary( const std::string& file ) { } } - // Check for binary magic numbers + // Check for known binary magic numbers (ELF, PNG, PDF, ZIP) if ( bytesRead >= 4 ) { if ( std::equal( ELF_MAGIC.begin(), ELF_MAGIC.end(), buffer.begin() ) || std::equal( PNG_MAGIC.begin(), PNG_MAGIC.end(), buffer.begin() ) || + std::equal( ZIP_MAGIC.begin(), ZIP_MAGIC.end(), buffer.begin() ) || + std::equal( ZIP_EMPTY.begin(), ZIP_EMPTY.end(), buffer.begin() ) || + std::equal( ZIP_SPANNED.begin(), ZIP_SPANNED.end(), buffer.begin() ) || ( bytesRead >= 5 && std::equal( PDF_MAGIC.begin(), PDF_MAGIC.end(), buffer.begin() ) ) ) { return true; // Known binary file 
type
@@ -89,21 +104,198 @@ bool TextDocument::fileMightBeBinary( const std::string& file ) {
 		}
 	}
 
+	// Check if the buffer is valid in common text encodings (without BOM)
+	// isValidUtf8: true when data[0, len) consists only of well-formed UTF-8 sequences.
+	// Overlong forms (C0/C1, E0 80..9F, F0 80..8F), surrogates (ED A0..BF) and code points
+	// above U+10FFFF (F4 90.., F5..FF) are rejected. A multi-byte sequence cut off by the end
+	// of the buffer also counts as invalid.
+	auto isValidUtf8 = []( const char* data, size_t len ) -> bool {
+		const unsigned char* udata = reinterpret_cast<const unsigned char*>( data );
+		size_t i = 0;
+		while ( i < len ) {
+			if ( udata[i] <= 0x7F ) {
+				++i;
+				continue;
+			}
+			if ( udata[i] >= 0xC2 && udata[i] <= 0xDF ) { // 2-byte sequence
+				if ( i + 1 >= len || udata[i + 1] < 0x80 || udata[i + 1] > 0xBF ) {
+					return false;
+				}
+				i += 2;
+				continue;
+			}
+			if ( udata[i] >= 0xE0 && udata[i] <= 0xEF ) { // 3-byte sequence
+				if ( i + 2 >= len ) {
+					return false;
+				}
+				// The second byte must be a continuation byte (0x80-0xBF); E0 and ED narrow
+				// that range to exclude overlong encodings and UTF-16 surrogates.
+				if ( udata[i + 1] < ( udata[i] == 0xE0 ? 0xA0 : 0x80 ) ||
+					 udata[i + 1] > ( udata[i] == 0xED ? 0x9F : 0xBF ) || udata[i + 2] < 0x80 ||
+					 udata[i + 2] > 0xBF ) {
+					return false;
+				}
+				i += 3;
+				continue;
+			}
+			if ( udata[i] >= 0xF0 && udata[i] <= 0xF4 ) { // 4-byte sequence
+				if ( i + 3 >= len ) {
+					return false;
+				}
+				// Same rule for the second byte; F0 and F4 narrow the range to exclude
+				// overlong encodings and code points above U+10FFFF.
+				if ( udata[i + 1] < ( udata[i] == 0xF0 ? 0x90 : 0x80 ) ||
+					 udata[i + 1] > ( udata[i] == 0xF4 ? 0x8F : 0xBF ) || udata[i + 2] < 0x80 ||
+					 udata[i + 2] > 0xBF || udata[i + 3] < 0x80 || udata[i + 3] > 0xBF ) {
+					return false;
+				}
+				i += 4;
+				continue;
+			}
+			return false;
+		}
+		return true;
+	};
+
+	// isValidUtf16LE: true when the buffer, read as little-endian 16-bit units, has no
+	// unpaired surrogates. A trailing odd byte is ignored; fewer than 2 bytes is valid.
+	// A high surrogate in the last unit (pair cut off by the buffer end) is invalid.
+	auto isValidUtf16LE = []( const char* data, size_t len ) -> bool {
+		const unsigned char* udata = reinterpret_cast<const unsigned char*>( data );
+		if ( len < 2 )
+			return true;
+		len -= len % 2;
+		size_t i = 0;
+		while ( i < len ) {
+			Uint16 word =
+				static_cast<Uint16>( udata[i] ) | ( static_cast<Uint16>( udata[i + 1] ) << 8 );
+			i += 2;
+			if ( word >= 0xD800 && word <= 0xDBFF ) { // High surrogate
+				if ( i >= len )
+					return false;
+				Uint16 next =
+					static_cast<Uint16>( udata[i] ) | ( static_cast<Uint16>( udata[i + 1] ) << 8 );
+				if ( next < 0xDC00 || next > 0xDFFF )
+					return false;
+				i += 2;
+			} else if ( word >= 0xDC00 && word <= 0xDFFF ) { // Low surrogate without high
+				return false;
+			}
+		}
+		return true;
+	};
+
+	
auto isValidUtf16BE = []( const char* data, size_t len ) -> bool { + const unsigned char* udata = reinterpret_cast( data ); + if ( len < 2 ) + return true; + len -= len % 2; + size_t i = 0; + while ( i < len ) { + Uint16 word = + ( static_cast( udata[i] ) << 8 ) | static_cast( udata[i + 1] ); + i += 2; + if ( word >= 0xD800 && word <= 0xDBFF ) { // High surrogate + if ( i >= len ) + return false; + Uint16 next = + ( static_cast( udata[i] ) << 8 ) | static_cast( udata[i + 1] ); + if ( next < 0xDC00 || next > 0xDFFF ) + return false; + i += 2; + } else if ( word >= 0xDC00 && word <= 0xDFFF ) { // Low surrogate without high + return false; + } + } + return true; + }; + + auto isValidUtf32LE = []( const char* data, size_t len ) -> bool { + const unsigned char* udata = reinterpret_cast( data ); + if ( len < 4 ) + return true; + len -= len % 4; + for ( size_t i = 0; i < len; i += 4 ) { + Uint32 code = static_cast( udata[i] ) | + ( static_cast( udata[i + 1] ) << 8 ) | + ( static_cast( udata[i + 2] ) << 16 ) | + ( static_cast( udata[i + 3] ) << 24 ); + if ( code > 0x10FFFF || ( code >= 0xD800 && code <= 0xDFFF ) ) { + return false; + } + } + return true; + }; + + auto isValidUtf32BE = []( const char* data, size_t len ) -> bool { + const unsigned char* udata = reinterpret_cast( data ); + if ( len < 4 ) + return true; + len -= len % 4; + for ( size_t i = 0; i < len; i += 4 ) { + Uint32 code = static_cast( udata[i + 3] ) | + ( static_cast( udata[i + 2] ) << 8 ) | + ( static_cast( udata[i + 1] ) << 16 ) | + ( static_cast( udata[i] ) << 24 ); + if ( code > 0x10FFFF || ( code >= 0xD800 && code <= 0xDFFF ) ) { + return false; + } + } + return true; + }; + + // Calculate byte entropy to detect binary files + auto calculateEntropy = []( const char* data, size_t len ) -> double { + std::array freq = { 0 }; + for ( size_t i = 0; i < len; ++i ) { + freq[static_cast( data[i] )]++; + } + double entropy = 0.0; + for ( size_t i = 0; i < 256; ++i ) { + if ( freq[i] > 0 ) { + double p = 
static_cast( freq[i] ) / len; + entropy -= p * std::log2( p ); + } + } + return entropy; + }; + + bool isFileFormatSupported = SyntaxDefinitionManager::instance()->isFileFormatSupported( + file, std::string_view{ buffer.data(), bytesRead } ); + // Check proportion of non-printable characters size_t nonPrintableCount = 0; for ( size_t i = 0; i < bytesRead; ++i ) { - if ( buffer[i] < 32 && buffer[i] != '\n' && buffer[i] != '\r' && buffer[i] != '\t' ) { + unsigned char uch = static_cast( buffer[i] ); + if ( uch < 32 && uch != '\n' && uch != '\r' && uch != '\t' ) ++nonPrintableCount; - } } - // Consider file binary if >20% of characters are non-printable - if ( nonPrintableCount > bytesRead * 0.2 ) { - // Also white-list known extensions - if ( !SyntaxDefinitionManager::instance()->isFileFormatSupported( - file, std::string_view{ buffer.data(), buffer.size() } ) ) { - return true; + // Check if the buffer is valid in common text encodings + bool validUtf8 = isValidUtf8( buffer.data(), bytesRead ); + if ( validUtf8 || isValidUtf16LE( buffer.data(), bytesRead ) || + isValidUtf16BE( buffer.data(), bytesRead ) || isValidUtf32LE( buffer.data(), bytesRead ) || + isValidUtf32BE( buffer.data(), bytesRead ) ) { + // Even if valid text encoding, check non-printable characters + if ( nonPrintableCount > bytesRead * 0.2 && + !isFileFormatSupported ) { // 20% threshold for text encodings + return true; // Likely binary due to non-printable chars } + // For valid UTF-8, check entropy to catch binary files with valid UTF-8 sequences + if ( validUtf8 ) { + double entropy = calculateEntropy( buffer.data(), bytesRead ); + // Binary files typically have higher entropy (>6.5 bits) than text + if ( entropy > 6.5 && !isFileFormatSupported ) { + return true; // Likely binary due to high entropy + } + } + return false; // Valid text encoding, treat as text + } + + // For non-text encodings, check non-printable characters + if ( nonPrintableCount > bytesRead * 0.1 && + !isFileFormatSupported ) 
{ // 10% threshold for non-text encodings + return true; // Likely binary due to extension and non-printable chars } return false; // Likely a text file diff --git a/src/tests/unit_tests/textdocument.cpp b/src/tests/unit_tests/textdocument.cpp index 41aa57e7d..c30c3982b 100644 --- a/src/tests/unit_tests/textdocument.cpp +++ b/src/tests/unit_tests/textdocument.cpp @@ -74,3 +74,14 @@ UTEST( TextDocument, multicursor ) { doc.resetUndoRedo(); doc.resetSelection( TextRange{ { 0, 0 }, { 0, 0 } } ); } + +UTEST( TextDocument, fileMightBeBinary ) { + FileSystem::changeWorkingDirectory( Sys::getProcessPath() ); + auto files = FileSystem::filesInfoGetInPath( "assets/textfiles" ); + for ( const auto& file : files ) { + EXPECT_FALSE_MSG( + TextDocument::fileMightBeBinary( file.getFilepath() ), + String::format( "File %s should be detected as text file", file.getFilepath() ) + .c_str() ); + } +} diff --git a/src/tests/unit_tests/textformat.cpp b/src/tests/unit_tests/textformat.cpp index 4bef034d7..c94116e0a 100644 --- a/src/tests/unit_tests/textformat.cpp +++ b/src/tests/unit_tests/textformat.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include using namespace EE::Graphics; @@ -82,11 +83,20 @@ UTEST( TextFormat, autodetectProject ) { continue; } auto extension = file.getExtension(); + bool fromSDLFolder = false; if ( "a" == extension || "zip" == extension || "dll" == extension || "dat" == extension || "cur" == extension || "icns" == extension || "wav" == extension || Image::isImageExtension( file.getFilepath() ) || - LuaPattern::hasMatches( file.getFilepath(), "SDL2%-%d+%.%d+%.%d+" ) ) + ( fromSDLFolder = + LuaPattern::hasMatches( file.getFilepath(), "SDL2%-%d+%.%d+%.%d+" ) ) ) { + if ( !fromSDLFolder && "dat" != extension ) { + EXPECT_TRUE_MSG( TextDocument::fileMightBeBinary( file.getFilepath() ), + String::format( "File %s should be detected as binary file", + file.getFilepath() ) + .c_str() ); + } continue; + } IOStreamFile stream( file.getFilepath() ); auto 
expectedEncoding = getEncoding( file.getFileName() ); auto textFormat = TextFormat::autodetect( stream ); @@ -95,6 +105,10 @@ UTEST( TextFormat, autodetectProject ) { TextFormat::encodingToString( textFormat.encoding ), TextFormat::encodingToString( expectedEncoding ) ) .c_str() ); + EXPECT_FALSE_MSG( + TextDocument::fileMightBeBinary( file.getFilepath() ), + String::format( "File %s should be detected as text file", file.getFilepath() ) + .c_str() ); } }; checkFolder( projectRoot );