From 0df86b1d531e130faba32737b7c60df108f03922 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mart=C3=ADn=20Lucas=20Golini?= Date: Wed, 28 May 2025 03:08:17 -0300 Subject: [PATCH] RegEx fixes. TextMate grammars fixes. --- include/eepp/system/regex.hpp | 6 +- include/eepp/ui/doc/syntaxdefinition.hpp | 2 +- src/eepp/system/regex.cpp | 49 ++++--- src/eepp/ui/doc/syntaxdefinition.cpp | 2 +- src/eepp/ui/doc/syntaxdefinitionmanager.cpp | 24 +++- src/eepp/ui/doc/syntaxtokenizer.cpp | 128 ++++++++++-------- .../src/eepp/ui/doc/languages/fixscript.cpp | 2 +- .../src/eepp/ui/doc/languages/janet.cpp | 6 +- .../src/eepp/ui/doc/languages/rescript.cpp | 4 +- src/tests/unit_tests/regex.cpp | 14 ++ 10 files changed, 150 insertions(+), 87 deletions(-) diff --git a/include/eepp/system/regex.hpp b/include/eepp/system/regex.hpp index 04d5f2816..aec20d7d1 100644 --- a/include/eepp/system/regex.hpp +++ b/include/eepp/system/regex.hpp @@ -24,8 +24,8 @@ class EE_API RegExCache { protected: bool mEnabled{ true }; - UnorderedMap mCache; - UnorderedMap mCacheOpt; + std::unordered_map mCache; + std::unordered_map mCacheOpt; }; class EE_API RegEx : public PatternMatcher { @@ -65,7 +65,7 @@ class EE_API RegEx : public PatternMatcher { FilterOutCaptures = 0x08000000u, // It will filter out repeated captures and same range captures AllowFallback = 0x10000000u, - UseOnigmo = 0x20000000u, + UseOniguruma = 0x20000000u, }; RegEx( std::string_view pattern, Uint32 options = Options::Utf | Options::AllowFallback, diff --git a/include/eepp/ui/doc/syntaxdefinition.hpp b/include/eepp/ui/doc/syntaxdefinition.hpp index 8ce90be25..f3b209ed8 100644 --- a/include/eepp/ui/doc/syntaxdefinition.hpp +++ b/include/eepp/ui/doc/syntaxdefinition.hpp @@ -124,7 +124,7 @@ struct EE_API SyntaxPattern { } inline bool checkIsRootSelfInclude() const { - return checkIsIncludePattern() && patterns[1] == "$self"; + return checkIsIncludePattern() && ( patterns[1] == "$self" || patterns[1] == "$base" ); } inline bool checkIsRepositoryInclude() const { diff --git a/src/eepp/system/regex.cpp b/src/eepp/system/regex.cpp index ddd159ffa..dc7bede36 100644 --- a/src/eepp/system/regex.cpp +++ b/src/eepp/system/regex.cpp @@ -24,20 +24,20 @@ RegExCache::~RegExCache() { } void RegExCache::insert( std::string_view key, Uint32 options, void* cache ) { - auto hash = hashCombine( String::hash( key ), options ); + auto hash = hashCombine( std::hash()( key ), options ); mCache.insert( { hash, cache } ); mCacheOpt.insert( { hash, options } ); } void* RegExCache::find( const std::string_view& key, Uint32 options ) { - auto it = mCache.find( hashCombine( String::hash( key ), options ) ); + auto it = mCache.find( hashCombine( std::hash()( key ), options ) ); return ( it != mCache.end() ) ? it->second : nullptr; } void RegExCache::clear() { for ( auto& cache : mCache ) { auto opt = mCacheOpt.find( cache.first ); - if ( opt->second & RegEx::Options::UseOnigmo ) + if ( opt->second & RegEx::Options::UseOniguruma ) onig_free( static_cast( cache.second ) ); else pcre2_code_free( reinterpret_cast( cache.second ) ); @@ -65,7 +65,17 @@ RegEx::RegEx( std::string_view pattern, Uint32 options, bool useCache ) : return; } - if ( mOptions & Options::UseOnigmo ) { + if ( useCache && ( mOptions & Options::AllowFallback ) && + !( mOptions & Options::UseOniguruma ) && RegExCache::instance()->isEnabled() && + ( mCompiledPattern = + RegExCache::instance()->find( pattern, mOptions | Options::UseOniguruma ) ) ) { + mValid = true; + mCached = true; + mOptions |= Options::UseOniguruma; + return; + } + + if ( mOptions & Options::UseOniguruma ) { initWithOnigmo( pattern, useCache ); return; } @@ -76,8 +86,8 @@ RegEx::RegEx( std::string_view pattern, Uint32 options, bool useCache ) : if ( options & Options::AllowFallback ) options &= ~Options::AllowFallback; - if ( options & Options::UseOnigmo ) - options &= ~Options::UseOnigmo; + if ( options & Options::UseOniguruma ) + options &= ~Options::UseOniguruma; mCompiledPattern = pcre2_compile( pattern_sptr, // the pattern pattern.size(), // the length of the pattern @@ -128,7 +138,7 @@ bool RegEx::matches( const char* stringSearch, int stringStartOffset, return false; } - if ( mOptions & Options::UseOnigmo ) { + if ( mOptions & Options::UseOniguruma ) { OnigRegion* region = onig_region_new(); if ( !region ) { Log::error( "Onigmo: onig_region_new() failed." ); @@ -162,6 +172,8 @@ bool RegEx::matches( const char* stringSearch, int stringStartOffset, for ( int i = 0; i < region->num_regs; ++i ) { int start = static_cast( region->beg[i] ); int end = static_cast( region->end[i] ); + if ( start == -1 || end == -1 ) + continue; if ( !mFilterOutCaptures || ( !( start == 0 && end == 0 ) && start != end && ( curCap == 0 || !( matchList[curCap - 1].start == start && @@ -171,7 +183,8 @@ bool RegEx::matches( const char* stringSearch, int stringStartOffset, curCap++; } } - mMatchNum = curCap; + if ( mMatchNum > 1 ) + mMatchNum = curCap; } onig_region_free( region, 1 ); @@ -196,13 +209,13 @@ bool RegEx::matches( const char* stringSearch, int stringStartOffset, PCRE2_SPTR subject = reinterpret_cast( stringSearch ); - int rc = pcre2_match( compiledPattern, // the compiled pattern - subject + stringStartOffset, // the subject string - stringLength - stringStartOffset, // the length of the subject - 0, // start at offset in the subject - 0, // default options - match_data, // match data - NULL // match context + int rc = pcre2_match( compiledPattern, // the compiled pattern + subject, // the subject string + stringLength, // the length of the subject + stringStartOffset, // start at offset in the subject + 0, // default options + match_data, // match data + NULL // match context ); if ( rc < 0 ) { @@ -220,8 +233,8 @@ bool RegEx::matches( const char* stringSearch, int stringStartOffset, PCRE2_SIZE* ovector = pcre2_get_ovector_pointer( match_data ); int curCap = 0; for ( size_t i = 0; i < static_cast( rc ); ++i ) { - int start = stringStartOffset + static_cast( ovector[2 * i] ); - int end = stringStartOffset + static_cast( ovector[2 * i + 1] ); + int start = static_cast( ovector[2 * i] ); + int end = static_cast( ovector[2 * i + 1] ); if ( !mFilterOutCaptures || ( !( start == 0 && end == 0 ) && start != end && ( curCap == 0 || !( matchList[curCap - 1].start == start && @@ -279,7 +292,7 @@ bool RegEx::initWithOnigmo( std::string_view pattern, bool useCache ) { mCompiledPattern = regex; mValid = true; - mOptions |= Options::UseOnigmo; + mOptions |= Options::UseOniguruma; mCaptureCount = onig_number_of_captures( static_cast( mCompiledPattern ) ); if ( useCache && RegExCache::instance()->isEnabled() ) { diff --git a/src/eepp/ui/doc/syntaxdefinition.cpp b/src/eepp/ui/doc/syntaxdefinition.cpp index 0b445b099..0030a8323 100644 --- a/src/eepp/ui/doc/syntaxdefinition.cpp +++ b/src/eepp/ui/doc/syntaxdefinition.cpp @@ -55,7 +55,7 @@ template void updateCache( const SyntaxPattern& ptrn static void updatePatternRefs( const SyntaxDefinition& def, SyntaxPattern& ptrn ) { ptrn.def = &def; - if ( ptrn.syntax == "$self" ) + if ( ptrn.syntax == "$self" || ptrn.syntax == "$base" ) ptrn.syntax = def.getLanguageName(); } diff --git a/src/eepp/ui/doc/syntaxdefinitionmanager.cpp b/src/eepp/ui/doc/syntaxdefinitionmanager.cpp index 7ca5ac429..7e1b68640 100644 --- a/src/eepp/ui/doc/syntaxdefinitionmanager.cpp +++ b/src/eepp/ui/doc/syntaxdefinitionmanager.cpp @@ -41,7 +41,8 @@ class TextMateScopeMapper { { "variable.parameter", "keyword3" }, // Function parameters { "variable.language", "literal" }, // Language constants like 'this', 'self', 'null'? { "variable.identifier", "normal" }, - { "storage.type", "keyword2" }, // Class, struct, int, bool etc. (declaration) + { "variable.function", "function" }, + { "storage.type", "keyword" }, // Class, struct, int, bool etc. (declaration) { "entity.name.function", "function" }, // Function definition name { "entity.name.type", "keyword2" }, // Type name (class, struct, etc.) in definition { "entity.name.class", "keyword2" }, // Class name in definition @@ -655,10 +656,18 @@ static SyntaxPattern parsePattern( const nlohmann::json& pattern ) { for ( Uint64 i = 0; i < totalCaptures; i++ ) { auto capNumStr = String::toString( i ); if ( captures.contains( capNumStr ) && captures[capNumStr].contains( "name" ) ) { - type.emplace_back( - TextMateScopeMapper::scopeToType( captures[capNumStr].value( "name", "" ) ) ); + auto ctype = + TextMateScopeMapper::scopeToType( captures[capNumStr].value( "name", "" ) ); + if ( i < type.size() ) + type[i] = ctype; + else + type.emplace_back( ctype ); } else if ( parent.contains( "name" ) ) { - type.emplace_back( TextMateScopeMapper::scopeToType( parent.value( "name", "" ) ) ); + auto ctype = TextMateScopeMapper::scopeToType( parent.value( "name", "" ) ); + if ( i < type.size() ) + type[i] = ctype; + else + type.emplace_back( ctype ); } else { type.emplace_back( "normal" ); } @@ -810,6 +819,13 @@ static SyntaxDefinition loadTextMateLanguage( const nlohmann::json& json, Syntax auto ext( file.get() ); def.addFileType( ( !String::contains( ext, "." ) ? "%." : "" ) + ext + "$" ); } + } else if ( json.contains( "filetypes" ) && json["filetypes"].is_array() ) { + const auto& files = json["filetypes"]; + for ( const auto& file : files ) + if ( file.is_string() ) { + auto ext( file.get() ); + def.addFileType( ( !String::contains( ext, "." ) ? "%." : "" ) + ext + "$" ); + } } else if ( json.contains( "scopeName" ) && json["scopeName"].is_string() ) { const auto& scopeName = json.value( "scopeName", "" ); def.addFileType( "%." + FileSystem::fileExtension( scopeName ) + "$" ); diff --git a/src/eepp/ui/doc/syntaxtokenizer.cpp b/src/eepp/ui/doc/syntaxtokenizer.cpp index ec10b5f65..e16c6dbf8 100644 --- a/src/eepp/ui/doc/syntaxtokenizer.cpp +++ b/src/eepp/ui/doc/syntaxtokenizer.cpp @@ -13,6 +13,8 @@ using namespace EE::System; namespace EE { namespace UI { namespace Doc { +static constexpr auto REGEX_FLAGS = RegEx::Options::Utf | RegEx::Options::AllowFallback; + struct PatternStackItem { const std::vector* patterns{ nullptr }; size_t index = 0; @@ -49,6 +51,8 @@ static int isInMultiByteCodePoint( const char* text, const size_t& textSize, con template static void pushToken( std::vector& tokens, const SyntaxStyleType& type, const std::string_view& text ) { + if ( text.empty() ) + return; if ( !tokens.empty() && ( tokens[tokens.size() - 1].type == type ) ) { size_t tpos = tokens.size() - 1; tokens[tpos].type = type; @@ -129,7 +133,7 @@ static NonEscapedMatch findNonEscaped( const std::string& text, const std::strin ? std::variant( LuaPattern( pattern ) ) : ( matchType == SyntaxPatternMatchType::RegEx ? std::variant( - RegEx( pattern, RegEx::Options::Utf | RegEx::Options::AllowFallback ) ) + RegEx( pattern, REGEX_FLAGS ) ) : std::variant( ParserMatcher( pattern ) ) ); PatternMatcher& words = std::visit( []( auto& patternType ) -> PatternMatcher& { return patternType; }, wordsVar ); @@ -275,6 +279,9 @@ static inline void pushTokensToOpenCloseSubsyntax( int i, std::string_view textv start = rangeSubsyntax.matches[sidx].start; end = rangeSubsyntax.matches[sidx].end; + if ( start == -1 || end == -1 ) + continue; + if ( sidx == 1 && start > lastStart ) { pushToken( tokens, patternType, textv.substr( patternMatchStart, start - patternMatchStart ) ); @@ -333,9 +340,8 @@ _tokenize( const SyntaxDefinition& syntax, const std::string& text, const Syntax pattern.matchType == SyntaxPatternMatchType::LuaPattern ? std::variant( LuaPattern( patternStr ) ) : ( pattern.matchType == SyntaxPatternMatchType::RegEx - ? std::variant( RegEx( - patternStr, RegEx::Options::Utf | RegEx::Options::AllowFallback | - RegEx::Options::Anchored ) ) + ? std::variant( + RegEx( patternStr, REGEX_FLAGS | RegEx::Options::Anchored ) ) : std::variant( ParserMatcher( patternStr ) ) ); PatternMatcher& words = std::visit( @@ -443,69 +449,83 @@ _tokenize( const SyntaxDefinition& syntax, const std::string& text, const Syntax for ( size_t curMatch = 1; curMatch < numMatches; curMatch++ ) { start = matches[curMatch].start; end = matches[curMatch].end; - if ( start == end || start < 0 || end < 0 ) - continue; - if ( pattern.patterns.size() >= 3 && startIdx > 0 && - text[startIdx - 1] == pattern.patterns[2][0] ) - continue; - Uint8 lead = ( 0xff & ( text[start] ) ); - if ( !( lead < 0x80 ) ) { - char* strStart = const_cast( text.c_str() + start ); - char* strEnd = strStart; - String::utf8Next( strEnd ); - end = start + ( strEnd - strStart ); - } - if ( curMatch == 1 && start > lastStart ) { - pushToken( tokens, patternType, - textv.substr( fullMatchStart, start - fullMatchStart ) ); - } else if ( start > lastEnd ) { - pushToken( tokens, patternType, - textv.substr( lastEnd, start - lastEnd ) ); - } - patternText = textv.substr( start, end - start ); - SyntaxStyleType type = - curMatch < pattern.types.size() && - ( pattern.types[curMatch] == SyntaxStyleTypes::Symbol || - pattern.types[curMatch] == SyntaxStyleTypes::Normal ) - ? curState.currentSyntax->getSymbol( - ( patternTextStr = patternText ) ) - : SyntaxStyleEmpty(); + if ( !( start == end || start < 0 || end < 0 ) && + !( pattern.patterns.size() >= 3 && startIdx > 0 && + text[startIdx - 1] == pattern.patterns[2][0] ) ) { + Uint8 lead = ( 0xff & ( text[start] ) ); + if ( !( lead < 0x80 ) ) { + char* strStart = const_cast( text.c_str() + start ); + char* strEnd = strStart; + String::utf8Next( strEnd ); + end = start + ( strEnd - strStart ); + } + if ( curMatch == 1 && start > lastStart ) { + pushToken( tokens, patternType, + textv.substr( fullMatchStart, start - fullMatchStart ) ); + } else if ( start > lastEnd ) { + pushToken( tokens, patternType, + textv.substr( lastEnd, start - lastEnd ) ); + } - if ( !skipSubSyntaxSeparator || !pattern.hasSyntaxOrContentScope() ) { - pushToken( tokens, - type == SyntaxStyleEmpty() - ? ( curMatch < pattern.types.size() - ? pattern.types[curMatch] - : pattern.types[0] ) - : type, - patternText ); - } + patternText = textv.substr( start, end - start ); + SyntaxStyleType type = + curMatch < pattern.types.size() && + ( pattern.types[curMatch] == SyntaxStyleTypes::Symbol || + pattern.types[curMatch] == SyntaxStyleTypes::Normal ) + ? curState.currentSyntax->getSymbol( + ( patternTextStr = patternText ) ) + : SyntaxStyleEmpty(); - if ( pattern.isRangedMatch() && curMatch == numMatches - 1 && - end == fullMatchEnd ) { - pushStack( - curState, retState, pattern, patternIndex, - textv.substr( fullMatchStart, fullMatchEnd - fullMatchStart ) ); - } + if ( !skipSubSyntaxSeparator || !pattern.hasSyntaxOrContentScope() ) { + pushToken( tokens, + type == SyntaxStyleEmpty() + ? ( curMatch < pattern.types.size() + ? pattern.types[curMatch] + : pattern.types[0] ) + : type, + patternText ); + } - startIdx = end; - - if ( curMatch == numMatches - 1 && end < fullMatchEnd ) { - pushToken( tokens, patternType, - textv.substr( end, fullMatchEnd - end ) ); - startIdx = fullMatchEnd; - - if ( pattern.isRangedMatch() && curMatch == numMatches - 1 ) { + if ( pattern.isRangedMatch() && curMatch == numMatches - 1 && + end == fullMatchEnd ) { pushStack( curState, retState, pattern, patternIndex, textv.substr( fullMatchStart, fullMatchEnd - fullMatchStart ) ); } + + startIdx = end; + + if ( curMatch == numMatches - 1 && end < fullMatchEnd ) { + pushToken( tokens, patternType, + textv.substr( end, fullMatchEnd - end ) ); + startIdx = fullMatchEnd; + end = fullMatchEnd; + + if ( pattern.isRangedMatch() && curMatch == numMatches - 1 ) { + pushStack( curState, retState, pattern, patternIndex, + textv.substr( fullMatchStart, + fullMatchEnd - fullMatchStart ) ); + } + } } lastStart = start; lastEnd = end; } + + if ( lastEnd < fullMatchEnd ) { + pushToken( tokens, patternType, + textv.substr( lastEnd, fullMatchEnd - lastEnd ) ); + startIdx = fullMatchEnd; + if ( pattern.isRangedMatch() ) { + pushStack( + curState, retState, pattern, patternIndex, + textv.substr( fullMatchStart, fullMatchEnd - fullMatchStart ) ); + } + startIdx = fullMatchEnd; + } + return true; } } else { diff --git a/src/modules/languages-syntax-highlighting/src/eepp/ui/doc/languages/fixscript.cpp b/src/modules/languages-syntax-highlighting/src/eepp/ui/doc/languages/fixscript.cpp index 8efdcb739..3ccbb3355 100644 --- a/src/modules/languages-syntax-highlighting/src/eepp/ui/doc/languages/fixscript.cpp +++ b/src/modules/languages-syntax-highlighting/src/eepp/ui/doc/languages/fixscript.cpp @@ -32,7 +32,7 @@ void addFixScript() { { "else", "keyword" }, { "break", "keyword" }, { "const", "keyword" }, { "while", "keyword" }, { "import", "keyword" }, { "return", "keyword" }, { "switch", "keyword" }, { "default", "keyword" }, { "continue", "keyword" }, - { "function", "function" }, + { "function", "keyword" }, { "macro", "keyword" }, { "generate", "keyword" }, { "output", "keyword" }, diff --git a/src/modules/languages-syntax-highlighting/src/eepp/ui/doc/languages/janet.cpp b/src/modules/languages-syntax-highlighting/src/eepp/ui/doc/languages/janet.cpp index e1f64b200..51e641682 100644 --- a/src/modules/languages-syntax-highlighting/src/eepp/ui/doc/languages/janet.cpp +++ b/src/modules/languages-syntax-highlighting/src/eepp/ui/doc/languages/janet.cpp @@ -11,9 +11,9 @@ void addJanet() { { "Janet", { "%.janet$" }, { - { { "(@?)```", "```", "\\" }, "string" }, - { { "(@?)``", "``", "\\" }, "string" }, - { { "(@?)`", "`", "\\" }, "string" }, + { { "(@?)```", "```" }, "string" }, + { { "(@?)``", "``" }, "string" }, + { { "(@?)`", "`" }, "string" }, { { "\"", "\"", "\\" }, "string" }, { { "0x[%da-fA-F]+" }, "number" }, { { "-?%d+[%d%.eE]*f?" }, "number" }, diff --git a/src/modules/languages-syntax-highlighting/src/eepp/ui/doc/languages/rescript.cpp b/src/modules/languages-syntax-highlighting/src/eepp/ui/doc/languages/rescript.cpp index c5dd43cf3..90326fcc4 100644 --- a/src/modules/languages-syntax-highlighting/src/eepp/ui/doc/languages/rescript.cpp +++ b/src/modules/languages-syntax-highlighting/src/eepp/ui/doc/languages/rescript.cpp @@ -12,8 +12,8 @@ void addRescript() { { { { "//.-\n" }, "comment" }, { { "/%*", "%*/" }, "comment" }, - { { "\"", "\"", "\\" }, "string" }, - { { "`", "`", "\\" }, "string" }, + { { "\\?\"", "\"", "\\" }, "string" }, + { { "\\?`", "`", "\\" }, "string" }, { { "%f[^<]![%a_][%w%_%-]*" }, "keyword2" }, { { "%f[^<][%a_][%w%_%-]*" }, "function" }, diff --git a/src/tests/unit_tests/regex.cpp b/src/tests/unit_tests/regex.cpp index 8b6644f6a..6058a66ef 100644 --- a/src/tests/unit_tests/regex.cpp +++ b/src/tests/unit_tests/regex.cpp @@ -83,3 +83,17 @@ UTEST( LuaPattern, basicTest ) { EXPECT_EQ( end, 16 ); } } + +UTEST( RegExEngines, basicTest ) { + std::string testStr = " ,(render-posts a-blog)))))\n"; + RegEx regexPCRE2( "(?<=\\))", RegEx::Options::Utf ); + RegEx regexOniguruma( "(?<=\\))", RegEx::Options::Utf | RegEx::Options::UseOniguruma ); + PatternMatcher::Range matchesPCRE2[10]; + PatternMatcher::Range matchesOniguruma[10]; + regexPCRE2.matches( testStr, matchesPCRE2, 38 ); + regexOniguruma.matches( testStr, matchesOniguruma, 38 ); + EXPECT_EQ( 38, matchesPCRE2[0].start ); + EXPECT_EQ( 38, matchesPCRE2[0].end ); + EXPECT_EQ( 38, matchesOniguruma[0].start ); + EXPECT_EQ( 38, matchesOniguruma[0].end ); +}