From b84080f7ecbaee7ad0cf5be1f61ca2e5a4cdc834 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mart=C3=ADn=20Lucas=20Golini?= Date: Sun, 9 Mar 2025 01:57:54 -0300 Subject: [PATCH] SyntaxTokenizer improvements. Now it's possible to set captures to sub-syntaxes and ranges. Fix in RegEx that prevented ^ from working properly in the tokenizer. --- include/eepp/system/patternmatcher.hpp | 6 ++ src/eepp/system/patternmatcher.cpp | 19 ++++ src/eepp/system/regex.cpp | 18 ++-- src/eepp/ui/doc/languages/cpp.cpp | 8 +- src/eepp/ui/doc/syntaxtokenizer.cpp | 124 +++++++++++++++++++------ 5 files changed, 133 insertions(+), 42 deletions(-) diff --git a/include/eepp/system/patternmatcher.hpp b/include/eepp/system/patternmatcher.hpp index 3e778ae9a..4f19cae98 100644 --- a/include/eepp/system/patternmatcher.hpp +++ b/include/eepp/system/patternmatcher.hpp @@ -114,6 +114,12 @@ class EE_API PatternMatcher { bool find( const char* stringSearch, int& startMatch, int& endMatch, int stringStartOffset = 0, int stringLength = 0, int returnMatchIndex = 0 ) const; + bool find( const std::string& s, int& startMatch, int& endMatch, int offset, + int returnedMatchIndex, PatternMatcher::Range* matchesBuffer ) const; + + bool find( const char* stringSearch, int& startMatch, int& endMatch, int stringStartOffset, + int stringLength, int returnMatchIndex, PatternMatcher::Range* matchesBuffer ) const; + std::string gsub( const char* text, const char* replace ); std::string gsub( const std::string& text, const std::string& replace ); diff --git a/src/eepp/system/patternmatcher.cpp b/src/eepp/system/patternmatcher.cpp index daf2fcbef..e6dcc38a1 100644 --- a/src/eepp/system/patternmatcher.cpp +++ b/src/eepp/system/patternmatcher.cpp @@ -9,6 +9,19 @@ namespace EE { namespace System { #define MAX_DEFAULT_MATCHES 12 +bool PatternMatcher::find( const char* stringSearch, int& startMatch, int& endMatch, + int stringStartOffset, int stringLength, int returnMatchIndex, + PatternMatcher::Range* matchesBuffer ) const { + if ( 
matches( stringSearch, stringStartOffset, matchesBuffer, stringLength ) ) { + range( returnMatchIndex, startMatch, endMatch, matchesBuffer ); + return true; + } else { + startMatch = -1; + endMatch = -1; + return false; + } +} + bool PatternMatcher::find( const char* stringSearch, int& startMatch, int& endMatch, int stringStartOffset, int stringLength, int returnMatchIndex ) const { PatternMatcher::Range matchesBuffer[MAX_DEFAULT_MATCHES]; @@ -22,6 +35,12 @@ bool PatternMatcher::find( const char* stringSearch, int& startMatch, int& endMa } } +bool PatternMatcher::find( const std::string& s, int& startMatch, int& endMatch, int offset, + int returnedMatchIndex, PatternMatcher::Range* matchesBuffer ) const { + return find( s.c_str(), startMatch, endMatch, offset, s.size(), returnedMatchIndex, + matchesBuffer ); +} + bool PatternMatcher::find( const std::string& s, int& startMatch, int& endMatch, int offset, int returnedMatchIndex ) const { return find( s.c_str(), startMatch, endMatch, offset, s.size(), returnedMatchIndex ); diff --git a/src/eepp/system/regex.cpp b/src/eepp/system/regex.cpp index 9d1530189..1b2182483 100644 --- a/src/eepp/system/regex.cpp +++ b/src/eepp/system/regex.cpp @@ -88,13 +88,13 @@ bool RegEx::matches( const char* stringSearch, int stringStartOffset, PCRE2_SPTR subject = reinterpret_cast( stringSearch ); - int rc = pcre2_match( compiledPattern, // the compiled pattern - subject, // the subject string - stringLength, // the length of the subject - stringStartOffset, // start at offset in the subject - 0, // default options - match_data, // match data - NULL // match context + int rc = pcre2_match( compiledPattern, // the compiled pattern + subject + stringStartOffset, // the subject string + stringLength - stringStartOffset, // the length of the subject + 0, // start at offset in the subject + 0, // default options + match_data, // match data + NULL // match context ); if ( rc < 0 ) { @@ -111,8 +111,8 @@ bool RegEx::matches( const char* 
stringSearch, int stringStartOffset, if ( matchList != nullptr ) { PCRE2_SIZE* ovector = pcre2_get_ovector_pointer( match_data ); for ( size_t i = 0; i < static_cast( rc ); ++i ) { - matchList[i].start = static_cast( ovector[2 * i] ); - matchList[i].end = static_cast( ovector[2 * i + 1] ); + matchList[i].start = stringStartOffset + static_cast( ovector[2 * i] ); + matchList[i].end = stringStartOffset + static_cast( ovector[2 * i + 1] ); if ( matchList[i].start >= matchList[i].end ) { matchList[i].start = matchList[i].end = -1; mMatchNum--; diff --git a/src/eepp/ui/doc/languages/cpp.cpp b/src/eepp/ui/doc/languages/cpp.cpp index 112693821..bb5761e89 100644 --- a/src/eepp/ui/doc/languages/cpp.cpp +++ b/src/eepp/ui/doc/languages/cpp.cpp @@ -11,10 +11,10 @@ void addCPP() { { "%.cpp$", "%.cc$", "%.cxx$", "%.c++$", "%.hh$", "%.inl$", "%.hxx$", "%.hpp$", "%.h++$", "%.tcc$" }, { - { { "R%\"xml%(", "%)xml%\"" }, "function", "XML" }, - { { "R%\"css%(", "%)css%\"" }, "function", "CSS" }, - { { "R%\"html%(", "%)html%\"" }, "function", "HTML" }, - { { "R%\"json%(", "%)json%\"" }, "function", "JSON" }, + { { "R%\"(xml)%(", "%)(xml)%\"" }, { "string", "keyword2", "keyword2" }, "XML" }, + { { "R%\"(css)%(", "%)(css)%\"" }, { "string", "keyword2", "keyword2" }, "CSS" }, + { { "R%\"(html)%(", "%)(html)%\"" }, { "string", "keyword2", "keyword2" }, "HTML" }, + { { "R%\"(json)%(", "%)(json)%\"" }, { "string", "keyword2", "keyword2" }, "JSON" }, { { "R\"[%a-\"]+%(", "%)[%a-\"]+%\"" }, "string" }, { { "R\"%(", "%)\"" }, "string" }, { { "//.-\n" }, "comment" }, diff --git a/src/eepp/ui/doc/syntaxtokenizer.cpp b/src/eepp/ui/doc/syntaxtokenizer.cpp index 907807400..351d07166 100644 --- a/src/eepp/ui/doc/syntaxtokenizer.cpp +++ b/src/eepp/ui/doc/syntaxtokenizer.cpp @@ -98,26 +98,36 @@ static bool isScaped( const std::string& text, const size_t& startIndex, return count % 2 == 1; } -static std::pair findNonEscaped( const std::string& text, const std::string& pattern, - int offset, const 
std::string& escapeStr, - bool isRegEx ) { +struct NonEscapedMatch { + std::pair range{ -1, -1 }; + PatternMatcher::Range matches[6]; + int numMatches{ 0 }; +}; + +static NonEscapedMatch findNonEscaped( const std::string& text, const std::string& pattern, + int offset, const std::string& escapeStr, bool isRegEx ) { eeASSERT( !pattern.empty() ); if ( pattern.empty() ) - return std::make_pair( -1, -1 ); + return {}; std::variant wordsVar = isRegEx ? std::variant( RegEx( pattern ) ) : std::variant( LuaPattern( pattern ) ); PatternMatcher& words = std::visit( []( auto& patternType ) -> PatternMatcher& { return patternType; }, wordsVar ); int start, end; - while ( words.find( text, start, end, offset ) ) { + PatternMatcher::Range matches[6]; + while ( words.find( text, start, end, offset, 0, matches ) ) { if ( !escapeStr.empty() && isScaped( text, start, escapeStr ) ) { offset = end; } else { - return std::make_pair( start, end ); + NonEscapedMatch res; + res.range = { start, end }; + res.numMatches = words.getNumMatches(); + std::memcpy( res.matches, matches, sizeof( matches ) ); + return res; } } - return std::make_pair( -1, -1 ); + return {}; } SyntaxStateRestored SyntaxTokenizer::retrieveSyntaxState( const SyntaxDefinition& syntax, @@ -184,6 +194,55 @@ static inline void popSubsyntax( SyntaxStateRestored& curState, SyntaxState& ret curState = SyntaxTokenizer::retrieveSyntaxState( syntax, retState ); }; +template +static inline void +pushTokensToOpenCloseSubsyntax( int i, std::string_view textv, const SyntaxPattern* subsyntaxInfo, + const NonEscapedMatch& rangeSubsyntax, std::vector& tokens ) { + if ( rangeSubsyntax.numMatches > 1 ) { + int patternMatchStart = rangeSubsyntax.matches[0].start; + int patternMatchEnd = rangeSubsyntax.matches[0].end; + auto patternType = subsyntaxInfo->types[0]; + int lastStart = patternMatchStart; + int lastEnd = patternMatchEnd; + + if ( i < patternMatchStart ) + pushToken( tokens, patternType, textv.substr( i, patternMatchStart - i ) 
); + + int start; + int end; + + for ( int sidx = 1; sidx < rangeSubsyntax.numMatches; sidx++ ) { + start = rangeSubsyntax.matches[sidx].start; + end = rangeSubsyntax.matches[sidx].end; + + if ( sidx == 1 && start > lastStart ) { + pushToken( tokens, patternType, + textv.substr( patternMatchStart, start - patternMatchStart ) ); + } else if ( start > lastEnd ) { + pushToken( tokens, patternType, textv.substr( lastEnd, start - lastEnd ) ); + } + + auto ss{ textv.substr( start, end - start ) }; + + pushToken( tokens, + sidx < static_cast( subsyntaxInfo->types.size() ) + ? subsyntaxInfo->types[sidx] + : subsyntaxInfo->types[0], + ss ); + + if ( sidx == rangeSubsyntax.numMatches - 1 && end < patternMatchEnd ) { + pushToken( tokens, patternType, textv.substr( end, patternMatchEnd - end ) ); + } + + lastStart = start; + lastEnd = end; + } + } else { + pushToken( tokens, subsyntaxInfo->types[0], + textv.substr( i, rangeSubsyntax.range.second - i ) ); + } +} + template static inline std::pair, SyntaxState> _tokenize( const SyntaxDefinition& syntax, const std::string& text, const SyntaxState& state, @@ -211,28 +270,29 @@ _tokenize( const SyntaxDefinition& syntax, const std::string& text, const Syntax if ( curState.currentPatternIdx != SYNTAX_TOKENIZER_STATE_NONE ) { const SyntaxPattern& pattern = curState.currentSyntax->getPatterns()[curState.currentPatternIdx - 1]; - std::pair range = findNonEscaped( - text, pattern.patterns[1], i, - pattern.patterns.size() >= 3 ? pattern.patterns[2] : "", pattern.isRegEx ); + auto range = findNonEscaped( text, pattern.patterns[1], i, + pattern.patterns.size() >= 3 ? pattern.patterns[2] : "", + pattern.isRegEx ) + .range; bool skip = false; - if ( curState.subsyntaxInfo != nullptr ) { - std::pair rangeSubsyntax = - findNonEscaped( text, curState.subsyntaxInfo->patterns[1], i, - curState.subsyntaxInfo->patterns.size() >= 3 - ? 
curState.subsyntaxInfo->patterns[2] - : "", - pattern.isRegEx ); + if ( curState.subsyntaxInfo != nullptr && + curState.subsyntaxInfo->patterns.size() > 1 ) { + auto rangeSubsyntax = findNonEscaped( text, curState.subsyntaxInfo->patterns[1], i, + curState.subsyntaxInfo->patterns.size() >= 3 + ? curState.subsyntaxInfo->patterns[2] + : "", + pattern.isRegEx ); - if ( rangeSubsyntax.first != -1 && - ( range.first == -1 || rangeSubsyntax.first < range.first ) ) { + if ( rangeSubsyntax.range.first != -1 && + ( range.first == -1 || rangeSubsyntax.range.first < range.first ) ) { if ( !skipSubSyntaxSeparator ) { - pushToken( tokens, curState.subsyntaxInfo->types[0], - textv.substr( i, rangeSubsyntax.second - i ) ); + pushTokensToOpenCloseSubsyntax( i, textv, curState.subsyntaxInfo, + rangeSubsyntax, tokens ); } popSubsyntax( curState, retState, syntax ); - i = rangeSubsyntax.second; + i = rangeSubsyntax.range.second; skip = true; } } @@ -255,20 +315,20 @@ _tokenize( const SyntaxDefinition& syntax, const std::string& text, const Syntax } } - if ( curState.subsyntaxInfo != nullptr ) { - std::pair rangeSubsyntax = findNonEscaped( + if ( curState.subsyntaxInfo != nullptr && curState.subsyntaxInfo->patterns.size() > 1 ) { + auto rangeSubsyntax = findNonEscaped( text, "^" + curState.subsyntaxInfo->patterns[1], i, curState.subsyntaxInfo->patterns.size() >= 3 ? 
curState.subsyntaxInfo->patterns[2] : "", curState.subsyntaxInfo->isRegEx ); - if ( rangeSubsyntax.first != -1 ) { + if ( rangeSubsyntax.range.first != -1 ) { if ( !skipSubSyntaxSeparator ) { - pushToken( tokens, curState.subsyntaxInfo->types[0], - textv.substr( i, rangeSubsyntax.second - i ) ); + pushTokensToOpenCloseSubsyntax( i, textv, curState.subsyntaxInfo, + rangeSubsyntax, tokens ); } popSubsyntax( curState, retState, syntax ); - i = rangeSubsyntax.second; + i = rangeSubsyntax.range.second; } } @@ -334,7 +394,8 @@ _tokenize( const SyntaxDefinition& syntax, const std::string& text, const Syntax patternText ); } - if ( pattern.hasSyntax() ) { + if ( pattern.hasSyntax() && curMatch == numMatches - 1 && + end == patternMatchEnd ) { pushSubsyntax( curState, retState, pattern, patternIndex + 1, patternStr ); } else if ( pattern.patterns.size() > 1 ) { @@ -347,6 +408,11 @@ _tokenize( const SyntaxDefinition& syntax, const std::string& text, const Syntax pushToken( tokens, patternType, textv.substr( end, patternMatchEnd - end ) ); i = patternMatchEnd; + + if ( pattern.hasSyntax() && curMatch == numMatches - 1 ) { + pushSubsyntax( curState, retState, pattern, patternIndex + 1, + patternStr ); + } } matched = true;