mirror of
https://github.com/SpartanJ/eepp.git
synced 2026-05-28 17:16:29 +03:00
Some improvements to TextDocument::fileMightBeBinary and added some unit tests for it.
This commit is contained in:
@@ -217,7 +217,7 @@
|
||||
},
|
||||
{
|
||||
"args": "",
|
||||
"command": "${project_root}/bin/unit_tests/eepp-unit-tests-debug",
|
||||
"command": "${project_root}/bin/unit_tests/eepp-unit_tests-debug",
|
||||
"name": "eepp-unit_tests-debug",
|
||||
"run_in_terminal": true,
|
||||
"working_dir": "${project_root}/bin/unit_tests/"
|
||||
|
||||
1
bin/unit_tests/assets/textfiles/test-arabic.txt
Normal file
1
bin/unit_tests/assets/textfiles/test-arabic.txt
Normal file
@@ -0,0 +1 @@
|
||||
اسکم شاخ و دم نداره همین که کاربر شبکه خودت. کسی که رو شبکه تو فی داده سالها زحمت کشیده رو نادیده میگیری و به کاربر یه شبکه دیگه توکن سنگین میدی میشه اسکم علنی. باید کاری باهاش کنیم که مثل استارک به غلط کردن بیافته اره تو endgame هستی اخر اسکمرایی
|
||||
24
bin/unit_tests/assets/textfiles/test-bengali.txt
Normal file
24
bin/unit_tests/assets/textfiles/test-bengali.txt
Normal file
@@ -0,0 +1,24 @@
|
||||
Hello: হ্যালো / নমস্কার
|
||||
Good morning: সুপ্রভাত
|
||||
Good night: শুভ রাত্রি
|
||||
Thank you: ধন্যবাদ
|
||||
You're welcome: আপনি স্বাগত জানাই
|
||||
Yes / No: হ্যাঁ / না
|
||||
Please: অনুগ্রহ করে
|
||||
Excuse me / Sorry: মাফ করবেন / দুঃখিত
|
||||
How are you?: আপনি কেমন আছেন?
|
||||
I'm fine. And you?: আমি ভালো আছি। এবং আপনি?
|
||||
What's your name?: আপনার নাম কি?
|
||||
My name is...: আমার নাম...
|
||||
Nice to meet you: আপনার সাথে দেখা করে খুশি
|
||||
Where are you from?: আপনি কোথা থেকে এসেছেন?
|
||||
I'm from...: আমি ... থেকে এসেছি।
|
||||
Do you speak English?: আপনি কি ইংরেজি বলতে পারেন?
|
||||
I don't understand: আমি বুঝতে পারছি না।
|
||||
Please speak more slowly: অনুগ্রহ করে ধীরে বলুন।
|
||||
Please write it down: অনুগ্রহ করে এটি লিখে দিন।
|
||||
How much is this?: এটার দাম কত?
|
||||
Where is the bathroom?: বাথরুম কোথায়?
|
||||
Help!: বাঁচাও!
|
||||
Stop!: থামুন!
|
||||
Call the police!: পুলিশ ডাকুন!
|
||||
15
bin/unit_tests/assets/textfiles/test-emoji.txt
Normal file
15
bin/unit_tests/assets/textfiles/test-emoji.txt
Normal file
@@ -0,0 +1,15 @@
|
||||
# subgroup: face-smiling
|
||||
1F600 ; fully-qualified # 😀 E1.0 grinning face
|
||||
1F603 ; fully-qualified # 😃 E0.6 grinning face with big eyes
|
||||
1F604 ; fully-qualified # 😄 E0.6 grinning face with smiling eyes
|
||||
1F601 ; fully-qualified # 😁 E0.6 beaming face with smiling eyes
|
||||
1F606 ; fully-qualified # 😆 E0.6 grinning squinting face
|
||||
1F605 ; fully-qualified # 😅 E0.6 grinning face with sweat
|
||||
1F923 ; fully-qualified # 🤣 E3.0 rolling on the floor laughing
|
||||
1F602 ; fully-qualified # 😂 E0.6 face with tears of joy
|
||||
1F642 ; fully-qualified # 🙂 E1.0 slightly smiling face
|
||||
1F643 ; fully-qualified # 🙃 E1.0 upside-down face
|
||||
1FAE0 ; fully-qualified # 🫠 E14.0 melting face
|
||||
1F609 ; fully-qualified # 😉 E0.6 winking face
|
||||
1F60A ; fully-qualified # 😊 E0.6 smiling face with smiling eyes
|
||||
1F607 ; fully-qualified # 😇 E1.0 smiling face with halo
|
||||
4
bin/unit_tests/assets/textfiles/test-flags.txt
Normal file
4
bin/unit_tests/assets/textfiles/test-flags.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
🇦🇷🇦🇷🇦🇷🇦🇷🇦🇷
|
||||
🇦🇷 🇦🇷 🇦🇷 🇦🇷
|
||||
|
||||
🇦🇷🇦🇷🇦🇷 Awante 🇦🇷 Argentina 🇦🇷🇦🇷🇦🇷
|
||||
@@ -0,0 +1,6 @@
|
||||
<EFBFBD>j<EFBFBD><EFBFBD><EFBFBD>[<5B>j<EFBFBD><6A><EFBFBD>[, <20>ɂ<EFBFBD><C982><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> / <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD>R<EFBFBD><EFBFBD><EFBFBD>R<EFBFBD><EFBFBD> / <20><><EFBFBD><EFBFBD>
|
||||
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ɂ<EFBFBD><EFBFBD>́A<EFBFBD><EFBFBD><EFBFBD>̖<EFBFBD><EFBFBD>O<EFBFBD>̓}<7D>[<5B>e<EFBFBD>B<EFBFBD><42><EFBFBD>ł<EFBFBD><C582>B <20><><EFBFBD>Ȃ<EFBFBD><C882>̖<EFBFBD><CC96>O<EFBFBD>͉<EFBFBD><CD89>ł<EFBFBD><C582><EFBFBD><EFBFBD>H
|
||||
<EFBFBD><EFBFBD><EFBFBD>̓A<EFBFBD><EFBFBD><EFBFBD>[<5B><><EFBFBD>`<60><><EFBFBD>ɏZ<C98F><5A><EFBFBD>ł<EFBFBD><C582><EFBFBD><EFBFBD>A<EFBFBD>R<EFBFBD><52><EFBFBD>s<EFBFBD><73><EFBFBD>[<5B>^<5E>[ <20>v<EFBFBD><76><EFBFBD>O<EFBFBD><4F><EFBFBD>}<7D>[<5B>ł<EFBFBD><C582>B <20>E<EFBFBD>Ƃ͂Ȃ<CD82><C882>ł<EFBFBD><C582><EFBFBD><EFBFBD>H
|
||||
8
bin/unit_tests/assets/textfiles/test-j-shift_jis.txt
Normal file
8
bin/unit_tests/assets/textfiles/test-j-shift_jis.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
<EFBFBD>j<EFBFBD><EFBFBD><EFBFBD>[<5B>j<EFBFBD><6A><EFBFBD>[, <20>ɂ<EFBFBD><C982><EFBFBD>
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> / <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<EFBFBD>R<EFBFBD><EFBFBD><EFBFBD>R<EFBFBD><EFBFBD> / <20><><EFBFBD><EFBFBD>
|
||||
|
||||
esto anda
|
||||
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ɂ<EFBFBD><EFBFBD>́A<EFBFBD><EFBFBD><EFBFBD>̖<EFBFBD><EFBFBD>O<EFBFBD>̓}<7D>[<5B>e<EFBFBD>B<EFBFBD><42><EFBFBD>ł<EFBFBD><C582>B <20><><EFBFBD>Ȃ<EFBFBD><C882>̖<EFBFBD><CC96>O<EFBFBD>͉<EFBFBD><CD89>ł<EFBFBD><C582><EFBFBD><EFBFBD>H
|
||||
<EFBFBD><EFBFBD><EFBFBD>̓A<EFBFBD><EFBFBD><EFBFBD>[<5B><><EFBFBD>`<60><><EFBFBD>ɏZ<C98F><5A><EFBFBD>ł<EFBFBD><C582><EFBFBD><EFBFBD>A<EFBFBD>R<EFBFBD><52><EFBFBD>s<EFBFBD><73><EFBFBD>[<5B>^<5E>[ <20>v<EFBFBD><76><EFBFBD>O<EFBFBD><4F><EFBFBD>}<7D>[<5B>ł<EFBFBD><C582>B <20>E<EFBFBD>Ƃ͂Ȃ<CD82><C882>ł<EFBFBD><C582><EFBFBD><EFBFBD>H
|
||||
6
bin/unit_tests/assets/textfiles/test-j.txt
Normal file
6
bin/unit_tests/assets/textfiles/test-j.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
ニャーニャー, にゃん
|
||||
ワンワン / わんわん
|
||||
コンコン / こんこん
|
||||
|
||||
こんにちは、私の名前はマーティンです。 あなたの名前は何ですか?
|
||||
私はアルゼンチンに住んでおり、コンピューター プログラマーです。 職業はなんですか?
|
||||
2
bin/unit_tests/assets/textfiles/test-k.txt
Normal file
2
bin/unit_tests/assets/textfiles/test-k.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
안녕하세요 오늘 강의해 드릴 내용은 12 안마당 빌드입니다. 12 안마당의 종류와 장단점들 그리고 빌드 오더를 간단하지만 자세하게 알려드리려고 합니다. 토스전에서는 가장 부유하게 시작하고 싶을 때 사용하는 빌드고요. 테란전에서는 12 안마당으로 할 수 있는 빌드가 여러가지가 있습니다. 그래서 가장 많이 사용하는 빌드들을 몇가지 알려드리려고 합니다. 첫 번째로 투에처리 빌드인데 12 안마당으로 시작하는 빌드입니다. 12 안마당 11 스포닝풀 10가스 이제 빠른 가스를 활용한 빌드인데요. 이 빌드는 투에처리 빌드를 하실때 3에처리를 빠르게 3가스 멀티에 가져가면서 플레이를 할 때 많이 사용을 하고요. 두번째로 12압 12풀 12가스 적당히 빠른 테크트리와 적당히 빠른 3에처리 빌드입니다. 12압 12가스 적당히 빠른 테크트리와 적당히 빠른 3에처리 빌드입니다. 이 빌드 같은 경우는 흔히들 말하는 안 3에처리라고 많이들 얘기를 하는데 뮤탈리스크도 빠르고 3에처리도 빠른 그런 빌드라고 생각하시면 되요.
|
||||
|
||||
2
bin/unit_tests/assets/textfiles/test-sc.txt
Normal file
2
bin/unit_tests/assets/textfiles/test-sc.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
你被关在一个小房间里。你并不记得发生了什么,也不知道为什么被关在这里。你以前从房门的窗口那儿得到食物,
|
||||
但是你用力敲门或者大叫都没有用。你决定一定要逃跑,要不然情况可能会变更不好。
|
||||
2
bin/unit_tests/assets/textfiles/test-tc.txt
Normal file
2
bin/unit_tests/assets/textfiles/test-tc.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
你被關在一個小房間裡。你並不記得發生了什麼,也不知道為什麼被關在這裡。
|
||||
你以前從房門的窗口那兒得到食物,但是你用力敲門或者大叫都沒有用。你決定一定要逃跑,要不然情況可能會變更不好
|
||||
203
bin/unit_tests/assets/textfiles/test.xit
Normal file
203
bin/unit_tests/assets/textfiles/test.xit
Normal file
@@ -0,0 +1,203 @@
|
||||
[ ] Open
|
||||
[x] Checked
|
||||
[@] Ongoing
|
||||
[~] Obsolete
|
||||
|
||||
[*] Invalid
|
||||
[o] Invalid
|
||||
[X] Invalid (uppercase)
|
||||
[ ] Invalid (non-breaking space)
|
||||
|
||||
[] Invalid
|
||||
[ ] Invalid
|
||||
[ x ] Invalid
|
||||
[@@] Invalid
|
||||
|
||||
[x] Invalid
|
||||
[x] Invalid
|
||||
|
||||
[ ] Do this
|
||||
|
||||
[ ] Do this
|
||||
|
||||
[ ]
|
||||
[ ]
|
||||
[ ]
|
||||
|
||||
[ ]Invalid
|
||||
[ ]! Invalid
|
||||
[ ]. Invalid
|
||||
[ ]!!. Invalid
|
||||
[ ]#invalid
|
||||
[ ]->2022-02-16 Invalid
|
||||
|
||||
[ ] This is a longer ...
|
||||
description text
|
||||
[ ] And this one ...
|
||||
is even ...
|
||||
longer
|
||||
|
||||
[ ] The following is just ...
|
||||
[ ] description text
|
||||
|
||||
[x] These lines ...
|
||||
should all ...
|
||||
look the same
|
||||
|
||||
[ ] This has some ...
|
||||
more spaces
|
||||
[ ] And this one ...
|
||||
as well
|
||||
|
||||
[ ] The next line is ...
|
||||
invalid
|
||||
[ ] The next line is ...
|
||||
invalid
|
||||
[ ] The next line is ...
|
||||
invalid
|
||||
[ ] The next line is ...
|
||||
invalid (it’s a tab)
|
||||
|
||||
[ ] ! This is important
|
||||
[ ] !!! This is very important
|
||||
[ ] !!!!!!!!!! This super important
|
||||
|
||||
[ ] ..! This is important
|
||||
[ ] !!. This is more important
|
||||
[ ] ... This is not important
|
||||
|
||||
[ ] ! Do something
|
||||
[ ] . Do something
|
||||
|
||||
[ ] ! Do something
|
||||
[ ] . Do something
|
||||
|
||||
[ ] .!. Invalid
|
||||
[ ] !.! Invalid
|
||||
|
||||
[ ] !This has regular priority
|
||||
[ ] .The dot is not priority
|
||||
[ ] This is also
|
||||
!!! not important
|
||||
|
||||
[ ] ! !!! This is important!
|
||||
[ ] !! ! ! This ! is also important
|
||||
[ ] !. ... This . is also important
|
||||
[ ] . ! This is not important
|
||||
|
||||
[ ] -> 2022-01-31
|
||||
[ ] Do this -> 2022-01-31
|
||||
[ ] -> 2022-01-31 (something)
|
||||
[ ] Do something until ...
|
||||
-> 2022-01-31
|
||||
|
||||
[ ] -> 2022-01-31
|
||||
[ ] -> 2022-01
|
||||
[ ] -> 2022
|
||||
[ ] -> 2022-W01
|
||||
[ ] -> 2022-Q1
|
||||
|
||||
[ ] -> 2022/01/31
|
||||
[ ] -> 2022/W01
|
||||
|
||||
[ ] -> 2022-01/31
|
||||
|
||||
[ ] -> 2022-01-31 -> 2022-01-31
|
||||
|
||||
[ ] Do this soon -> 2022-01-31!!!
|
||||
[ ] Do this (-> 2022-01-31)
|
||||
|
||||
[ ] ---> 2022-01-31
|
||||
[ ] Due-> 2022-01-31
|
||||
[ ] -> 2022-01-31very urgent
|
||||
[ ] -> 2022-01-31T10:00
|
||||
[ ] -> 2022-01-31-0
|
||||
[ ] -> 2022/01/31/0
|
||||
|
||||
[ ] ->2022-01-31
|
||||
[ ] → 2022-01-31
|
||||
[ ] -> 2022-01-31
|
||||
[ ] >2022-01-31
|
||||
[ ] Do until ->
|
||||
2022-01-31
|
||||
|
||||
[ ] #tag
|
||||
[ ] #T-A-G
|
||||
[ ] #--tag--
|
||||
[ ] #__tag__
|
||||
[ ] #t_a_g
|
||||
[ ] #123
|
||||
[ ] #___
|
||||
[ ] #---
|
||||
[ ] #1t2a3g
|
||||
[ ] #täg
|
||||
[ ] #今日は
|
||||
[ ] #გამარჯობა
|
||||
|
||||
[ ] This #text contains #tags
|
||||
[ ] #Actually, it #has a #LOT.
|
||||
Even on the #next-line!
|
||||
|
||||
[ ] This is a #tag.
|
||||
[ ] Tags: #tag1/#tag2
|
||||
[ ] #t-a-g!
|
||||
[ ] #--tag--?
|
||||
[ ] #--tag--:text
|
||||
[ ] (#tag)
|
||||
[ ] #tag🥳
|
||||
|
||||
[ ] Not a tag: #
|
||||
|
||||
[ ] #tag=value
|
||||
[ ] #t-a-g=v-a-l-u-e
|
||||
[ ] #国=日本
|
||||
|
||||
[ ] #tag=
|
||||
[ ] #tag=""
|
||||
[ ] #tag=''
|
||||
|
||||
[ ] #tag="v a l u e"
|
||||
[ ] #tag='v!a.l?u+e'
|
||||
[ ] #tag='foo'bar
|
||||
[ ] #tag='foo'-bar
|
||||
[ ] #tag='foo'!!
|
||||
[ ] (#tag="bar")
|
||||
|
||||
[ ] #tag='It\'s great
|
||||
|
||||
[ ] #tag="v a l u e
|
||||
[ ] #tag="v a l u e'
|
||||
[ ] #tag="hello
|
||||
World!"
|
||||
|
||||
[ ] Item 1 of group
|
||||
[ ] Item 2 of group
|
||||
|
||||
[ ] Item of another group
|
||||
|
||||
Todos
|
||||
[ ] Item 1
|
||||
[ ] Item 2
|
||||
|
||||
Group 1
|
||||
[ ] Item
|
||||
|
||||
Group 2
|
||||
[ ] Item
|
||||
|
||||
Todos
|
||||
|
||||
Todos
|
||||
[ ] Do this
|
||||
|
||||
Todos
|
||||
[ ] Do this
|
||||
|
||||
[Todos]
|
||||
[ ] Do this
|
||||
|
||||
[ ] Do this
|
||||
Todos
|
||||
[ ] Do this
|
||||
|
||||
|
||||
@@ -1364,7 +1364,7 @@ const SyntaxDefinition& SyntaxDefinitionManager::getByHeader( std::string_view h
|
||||
for ( const auto& hdr : definition->get()->getHeaders() ) {
|
||||
LuaPattern words( hdr );
|
||||
int start, end;
|
||||
if ( words.find( header.data(), start, end ) ) {
|
||||
if ( words.find( header.data(), start, end, 0, header.size(), 0 ) ) {
|
||||
return *definition->get();
|
||||
}
|
||||
}
|
||||
@@ -1466,7 +1466,8 @@ std::size_t SyntaxDefinitionManager::count() const {
|
||||
|
||||
bool SyntaxDefinitionManager::isFileFormatSupported( const std::string& filePath,
|
||||
std::string_view header ) {
|
||||
return &find( filePath, header ) != mDefinitions[0].get();
|
||||
return &find( filePath, header ) != mDefinitions[0].get() ||
|
||||
FileSystem::fileExtension( filePath ) == "txt";
|
||||
}
|
||||
|
||||
void SyntaxDefinitionManager::resetFileAssociations() {
|
||||
|
||||
@@ -28,6 +28,8 @@ static constexpr char DEFAULT_NON_WORD_CHARS[] = " \t\n/\\()\"':,.;<>~!@#$%^&*|+
|
||||
|
||||
static UnorderedSet<String::HashType> TEXT_DOCUMENT_COMMANDS = {};
|
||||
|
||||
#include <string_view> // Ensure this is included for std::string_view
|
||||
|
||||
bool TextDocument::fileMightBeBinary( const std::string& file ) {
|
||||
static constexpr size_t MAX_READ = 4096;
|
||||
static constexpr std::array<char, 4> NULL_SEQUENCE = { 0, 0, 0, 0 };
|
||||
@@ -36,7 +38,14 @@ bool TextDocument::fileMightBeBinary( const std::string& file ) {
|
||||
static constexpr std::array<char, 4> ELF_MAGIC = { 0x7F, 'E', 'L', 'F' };
|
||||
static constexpr std::array<char, 4> PNG_MAGIC = { (char)0x89, 'P', 'N', 'G' };
|
||||
static constexpr std::array<char, 5> PDF_MAGIC = { '%', 'P', 'D', 'F', '-' };
|
||||
// UTF-16/UTF-32 BOMs (to avoid misclassifying as binary)
|
||||
static constexpr std::array<char, 4> ZIP_MAGIC = { 'P', 'K', (char)0x03,
|
||||
(char)0x04 }; // Standard ZIP
|
||||
static constexpr std::array<char, 4> ZIP_EMPTY = { 'P', 'K', (char)0x05,
|
||||
(char)0x06 }; // Empty ZIP
|
||||
static constexpr std::array<char, 4> ZIP_SPANNED = { 'P', 'K', (char)0x07,
|
||||
(char)0x08 }; // Spanned ZIP
|
||||
// UTF-8/UTF-16/UTF-32 BOMs (to avoid misclassifying as binary)
|
||||
static constexpr std::array<char, 3> UTF8_BOM = { (char)0xEF, (char)0xBB, (char)0xBF };
|
||||
static constexpr std::array<char, 2> UTF16BE_BOM = { (char)0xFE, (char)0xFF };
|
||||
static constexpr std::array<char, 2> UTF16LE_BOM = { (char)0xFF, (char)0xFE };
|
||||
static constexpr std::array<char, 4> UTF32BE_BOM = { (char)0x00, (char)0x00, (char)0xFE,
|
||||
@@ -57,6 +66,9 @@ bool TextDocument::fileMightBeBinary( const std::string& file ) {
|
||||
}
|
||||
|
||||
// Check for text encoding BOMs (indicates text file)
|
||||
if ( bytesRead >= 3 && std::equal( UTF8_BOM.begin(), UTF8_BOM.end(), buffer.begin() ) ) {
|
||||
return false; // UTF-8 text file
|
||||
}
|
||||
if ( bytesRead >= 2 ) {
|
||||
if ( std::equal( UTF16BE_BOM.begin(), UTF16BE_BOM.end(), buffer.begin() ) ||
|
||||
std::equal( UTF16LE_BOM.begin(), UTF16LE_BOM.end(), buffer.begin() ) ) {
|
||||
@@ -70,10 +82,13 @@ bool TextDocument::fileMightBeBinary( const std::string& file ) {
|
||||
}
|
||||
}
|
||||
|
||||
// Check for binary magic numbers
|
||||
// Check for known binary magic numbers (ELF, PNG, PDF, ZIP)
|
||||
if ( bytesRead >= 4 ) {
|
||||
if ( std::equal( ELF_MAGIC.begin(), ELF_MAGIC.end(), buffer.begin() ) ||
|
||||
std::equal( PNG_MAGIC.begin(), PNG_MAGIC.end(), buffer.begin() ) ||
|
||||
std::equal( ZIP_MAGIC.begin(), ZIP_MAGIC.end(), buffer.begin() ) ||
|
||||
std::equal( ZIP_EMPTY.begin(), ZIP_EMPTY.end(), buffer.begin() ) ||
|
||||
std::equal( ZIP_SPANNED.begin(), ZIP_SPANNED.end(), buffer.begin() ) ||
|
||||
( bytesRead >= 5 &&
|
||||
std::equal( PDF_MAGIC.begin(), PDF_MAGIC.end(), buffer.begin() ) ) ) {
|
||||
return true; // Known binary file type
|
||||
@@ -89,21 +104,187 @@ bool TextDocument::fileMightBeBinary( const std::string& file ) {
|
||||
}
|
||||
}
|
||||
|
||||
// Check if the buffer is valid in common text encodings (without BOM)
|
||||
auto isValidUtf8 = []( const char* data, size_t len ) -> bool {
|
||||
const unsigned char* udata = reinterpret_cast<const unsigned char*>( data );
|
||||
size_t i = 0;
|
||||
while ( i < len ) {
|
||||
if ( udata[i] <= 0x7F ) {
|
||||
++i;
|
||||
continue;
|
||||
}
|
||||
if ( udata[i] >= 0xC2 && udata[i] <= 0xDF ) { // 2-byte sequence
|
||||
if ( i + 1 >= len || udata[i + 1] < 0x80 || udata[i + 1] > 0xBF ) {
|
||||
return false;
|
||||
}
|
||||
i += 2;
|
||||
continue;
|
||||
}
|
||||
if ( udata[i] >= 0xE0 && udata[i] <= 0xEF ) { // 3-byte sequence
|
||||
if ( i + 2 >= len ) {
|
||||
return false;
|
||||
}
|
||||
if ( ( udata[i] == 0xE0 && udata[i + 1] < 0xA0 ) || udata[i + 1] > 0xBF ||
|
||||
( udata[i] == 0xED && udata[i + 1] > 0x9F ) || udata[i + 2] < 0x80 ||
|
||||
udata[i + 2] > 0xBF ) {
|
||||
return false;
|
||||
}
|
||||
i += 3;
|
||||
continue;
|
||||
}
|
||||
if ( udata[i] >= 0xF0 && udata[i] <= 0xF4 ) { // 4-byte sequence
|
||||
if ( i + 3 >= len ) {
|
||||
return false;
|
||||
}
|
||||
if ( ( udata[i] == 0xF0 && udata[i + 1] < 0x90 ) || udata[i + 1] > 0xBF ||
|
||||
( udata[i] == 0xF4 && udata[i + 1] > 0x8F ) || udata[i + 2] < 0x80 ||
|
||||
udata[i + 2] > 0xBF || udata[i + 3] < 0x80 || udata[i + 3] > 0xBF ) {
|
||||
return false;
|
||||
}
|
||||
i += 4;
|
||||
continue;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
auto isValidUtf16LE = []( const char* data, size_t len ) -> bool {
|
||||
const unsigned char* udata = reinterpret_cast<const unsigned char*>( data );
|
||||
if ( len < 2 )
|
||||
return true;
|
||||
len -= len % 2;
|
||||
size_t i = 0;
|
||||
while ( i < len ) {
|
||||
Uint16 word =
|
||||
static_cast<Uint16>( udata[i] ) | ( static_cast<Uint16>( udata[i + 1] ) << 8 );
|
||||
i += 2;
|
||||
if ( word >= 0xD800 && word <= 0xDBFF ) { // High surrogate
|
||||
if ( i >= len )
|
||||
return false;
|
||||
Uint16 next =
|
||||
static_cast<Uint16>( udata[i] ) | ( static_cast<Uint16>( udata[i + 1] ) << 8 );
|
||||
if ( next < 0xDC00 || next > 0xDFFF )
|
||||
return false;
|
||||
i += 2;
|
||||
} else if ( word >= 0xDC00 && word <= 0xDFFF ) { // Low surrogate without high
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
auto isValidUtf16BE = []( const char* data, size_t len ) -> bool {
|
||||
const unsigned char* udata = reinterpret_cast<const unsigned char*>( data );
|
||||
if ( len < 2 )
|
||||
return true;
|
||||
len -= len % 2;
|
||||
size_t i = 0;
|
||||
while ( i < len ) {
|
||||
Uint16 word =
|
||||
( static_cast<Uint16>( udata[i] ) << 8 ) | static_cast<Uint16>( udata[i + 1] );
|
||||
i += 2;
|
||||
if ( word >= 0xD800 && word <= 0xDBFF ) { // High surrogate
|
||||
if ( i >= len )
|
||||
return false;
|
||||
Uint16 next =
|
||||
( static_cast<Uint16>( udata[i] ) << 8 ) | static_cast<Uint16>( udata[i + 1] );
|
||||
if ( next < 0xDC00 || next > 0xDFFF )
|
||||
return false;
|
||||
i += 2;
|
||||
} else if ( word >= 0xDC00 && word <= 0xDFFF ) { // Low surrogate without high
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
auto isValidUtf32LE = []( const char* data, size_t len ) -> bool {
|
||||
const unsigned char* udata = reinterpret_cast<const unsigned char*>( data );
|
||||
if ( len < 4 )
|
||||
return true;
|
||||
len -= len % 4;
|
||||
for ( size_t i = 0; i < len; i += 4 ) {
|
||||
Uint32 code = static_cast<Uint32>( udata[i] ) |
|
||||
( static_cast<Uint32>( udata[i + 1] ) << 8 ) |
|
||||
( static_cast<Uint32>( udata[i + 2] ) << 16 ) |
|
||||
( static_cast<Uint32>( udata[i + 3] ) << 24 );
|
||||
if ( code > 0x10FFFF || ( code >= 0xD800 && code <= 0xDFFF ) ) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
auto isValidUtf32BE = []( const char* data, size_t len ) -> bool {
|
||||
const unsigned char* udata = reinterpret_cast<const unsigned char*>( data );
|
||||
if ( len < 4 )
|
||||
return true;
|
||||
len -= len % 4;
|
||||
for ( size_t i = 0; i < len; i += 4 ) {
|
||||
Uint32 code = static_cast<Uint32>( udata[i + 3] ) |
|
||||
( static_cast<Uint32>( udata[i + 2] ) << 8 ) |
|
||||
( static_cast<Uint32>( udata[i + 1] ) << 16 ) |
|
||||
( static_cast<Uint32>( udata[i] ) << 24 );
|
||||
if ( code > 0x10FFFF || ( code >= 0xD800 && code <= 0xDFFF ) ) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
// Calculate byte entropy to detect binary files
|
||||
auto calculateEntropy = []( const char* data, size_t len ) -> double {
|
||||
std::array<size_t, 256> freq = { 0 };
|
||||
for ( size_t i = 0; i < len; ++i ) {
|
||||
freq[static_cast<unsigned char>( data[i] )]++;
|
||||
}
|
||||
double entropy = 0.0;
|
||||
for ( size_t i = 0; i < 256; ++i ) {
|
||||
if ( freq[i] > 0 ) {
|
||||
double p = static_cast<double>( freq[i] ) / len;
|
||||
entropy -= p * std::log2( p );
|
||||
}
|
||||
}
|
||||
return entropy;
|
||||
};
|
||||
|
||||
bool isFileFormatSupported = SyntaxDefinitionManager::instance()->isFileFormatSupported(
|
||||
file, std::string_view{ buffer.data(), bytesRead } );
|
||||
|
||||
// Check proportion of non-printable characters
|
||||
size_t nonPrintableCount = 0;
|
||||
for ( size_t i = 0; i < bytesRead; ++i ) {
|
||||
if ( buffer[i] < 32 && buffer[i] != '\n' && buffer[i] != '\r' && buffer[i] != '\t' ) {
|
||||
unsigned char uch = static_cast<unsigned char>( buffer[i] );
|
||||
if ( uch < 32 && uch != '\n' && uch != '\r' && uch != '\t' )
|
||||
++nonPrintableCount;
|
||||
}
|
||||
}
|
||||
|
||||
// Consider file binary if >20% of characters are non-printable
|
||||
if ( nonPrintableCount > bytesRead * 0.2 ) {
|
||||
// Also white-list known extensions
|
||||
if ( !SyntaxDefinitionManager::instance()->isFileFormatSupported(
|
||||
file, std::string_view{ buffer.data(), buffer.size() } ) ) {
|
||||
return true;
|
||||
// Check if the buffer is valid in common text encodings
|
||||
bool validUtf8 = isValidUtf8( buffer.data(), bytesRead );
|
||||
if ( validUtf8 || isValidUtf16LE( buffer.data(), bytesRead ) ||
|
||||
isValidUtf16BE( buffer.data(), bytesRead ) || isValidUtf32LE( buffer.data(), bytesRead ) ||
|
||||
isValidUtf32BE( buffer.data(), bytesRead ) ) {
|
||||
// Even if valid text encoding, check non-printable characters
|
||||
if ( nonPrintableCount > bytesRead * 0.2 &&
|
||||
!isFileFormatSupported ) { // 20% threshold for text encodings
|
||||
return true; // Likely binary due to non-printable chars
|
||||
}
|
||||
// For valid UTF-8, check entropy to catch binary files with valid UTF-8 sequences
|
||||
if ( validUtf8 ) {
|
||||
double entropy = calculateEntropy( buffer.data(), bytesRead );
|
||||
// Binary files typically have higher entropy (>6.5 bits) than text
|
||||
if ( entropy > 6.5 && !isFileFormatSupported ) {
|
||||
return true; // Likely binary due to high entropy
|
||||
}
|
||||
}
|
||||
return false; // Valid text encoding, treat as text
|
||||
}
|
||||
|
||||
// For non-text encodings, check non-printable characters
|
||||
if ( nonPrintableCount > bytesRead * 0.1 &&
|
||||
!isFileFormatSupported ) { // 10% threshold for non-text encodings
|
||||
return true; // Likely binary due to extension and non-printable chars
|
||||
}
|
||||
|
||||
return false; // Likely a text file
|
||||
|
||||
@@ -74,3 +74,14 @@ UTEST( TextDocument, multicursor ) {
|
||||
doc.resetUndoRedo();
|
||||
doc.resetSelection( TextRange{ { 0, 0 }, { 0, 0 } } );
|
||||
}
|
||||
|
||||
UTEST( TextDocument, fileMightBeBinary ) {
|
||||
FileSystem::changeWorkingDirectory( Sys::getProcessPath() );
|
||||
auto files = FileSystem::filesInfoGetInPath( "assets/textfiles" );
|
||||
for ( const auto& file : files ) {
|
||||
EXPECT_FALSE_MSG(
|
||||
TextDocument::fileMightBeBinary( file.getFilepath() ),
|
||||
String::format( "File %s should be detected as text file", file.getFilepath() )
|
||||
.c_str() );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
#include <eepp/system/iostreamfile.hpp>
|
||||
#include <eepp/system/luapattern.hpp>
|
||||
#include <eepp/system/sys.hpp>
|
||||
#include <eepp/ui/doc/textdocument.hpp>
|
||||
#include <eepp/ui/doc/textformat.hpp>
|
||||
|
||||
using namespace EE::Graphics;
|
||||
@@ -82,11 +83,20 @@ UTEST( TextFormat, autodetectProject ) {
|
||||
continue;
|
||||
}
|
||||
auto extension = file.getExtension();
|
||||
bool fromSDLFolder = false;
|
||||
if ( "a" == extension || "zip" == extension || "dll" == extension ||
|
||||
"dat" == extension || "cur" == extension || "icns" == extension ||
|
||||
"wav" == extension || Image::isImageExtension( file.getFilepath() ) ||
|
||||
LuaPattern::hasMatches( file.getFilepath(), "SDL2%-%d+%.%d+%.%d+" ) )
|
||||
( fromSDLFolder =
|
||||
LuaPattern::hasMatches( file.getFilepath(), "SDL2%-%d+%.%d+%.%d+" ) ) ) {
|
||||
if ( !fromSDLFolder && "dat" != extension ) {
|
||||
EXPECT_TRUE_MSG( TextDocument::fileMightBeBinary( file.getFilepath() ),
|
||||
String::format( "File %s should be detected as binary file",
|
||||
file.getFilepath() )
|
||||
.c_str() );
|
||||
}
|
||||
continue;
|
||||
}
|
||||
IOStreamFile stream( file.getFilepath() );
|
||||
auto expectedEncoding = getEncoding( file.getFileName() );
|
||||
auto textFormat = TextFormat::autodetect( stream );
|
||||
@@ -95,6 +105,10 @@ UTEST( TextFormat, autodetectProject ) {
|
||||
TextFormat::encodingToString( textFormat.encoding ),
|
||||
TextFormat::encodingToString( expectedEncoding ) )
|
||||
.c_str() );
|
||||
EXPECT_FALSE_MSG(
|
||||
TextDocument::fileMightBeBinary( file.getFilepath() ),
|
||||
String::format( "File %s should be detected as text file", file.getFilepath() )
|
||||
.c_str() );
|
||||
}
|
||||
};
|
||||
checkFolder( projectRoot );
|
||||
|
||||
Reference in New Issue
Block a user