SyntaxTokenizer improvements. Now it's possible to set captures to sub-syntaxes and ranges.

Fix in RegEx that prevented ^ from working properly in the tokenizer.
This commit is contained in:
Martín Lucas Golini
2025-03-09 01:57:54 -03:00
parent dfb0820d0f
commit b84080f7ec
5 changed files with 133 additions and 42 deletions

View File

@@ -114,6 +114,12 @@ class EE_API PatternMatcher {
// Searches stringSearch for the pattern and reports a range through startMatch / endMatch.
// The definition of this overload allocates its own local buffer of MAX_DEFAULT_MATCHES
// ranges instead of taking one from the caller.
bool find( const char* stringSearch, int& startMatch, int& endMatch, int stringStartOffset = 0,
int stringLength = 0, int returnMatchIndex = 0 ) const;
// std::string convenience overload with a caller-supplied range buffer: it forwards
// s.c_str() and s.size() to the const char* overload that takes matchesBuffer.
bool find( const std::string& s, int& startMatch, int& endMatch, int offset,
int returnedMatchIndex, PatternMatcher::Range* matchesBuffer ) const;
// Same search as the first overload, but the ranges are written into the caller-supplied
// matchesBuffer. When nothing matches, startMatch and endMatch are both set to -1 and
// false is returned.
// NOTE(review): matchesBuffer presumably has to be large enough for every capture of the
// pattern — confirm against the matches() implementations.
bool find( const char* stringSearch, int& startMatch, int& endMatch, int stringStartOffset,
int stringLength, int returnMatchIndex, PatternMatcher::Range* matchesBuffer ) const;
std::string gsub( const char* text, const char* replace );
std::string gsub( const std::string& text, const std::string& replace );

View File

@@ -9,6 +9,19 @@ namespace EE { namespace System {
#define MAX_DEFAULT_MATCHES 12
/**
 * Searches stringSearch for the pattern, using a caller-supplied buffer for the ranges.
 *
 * The search itself is delegated to matches(), which receives matchesBuffer. When it
 * succeeds, range() is asked to fill startMatch / endMatch for returnMatchIndex out of
 * that same buffer.
 *
 * @param stringSearch Text to search.
 * @param startMatch Receives the start of the reported range, or -1 when nothing matched.
 * @param endMatch Receives the end of the reported range, or -1 when nothing matched.
 * @param stringStartOffset Start offset forwarded to matches().
 * @param stringLength Length forwarded to matches().
 * @param returnMatchIndex Index of the range to report, forwarded to range().
 * @param matchesBuffer Caller-owned range storage forwarded to matches() and range().
 * @return true when matches() reports a match, false otherwise.
 */
bool PatternMatcher::find( const char* stringSearch, int& startMatch, int& endMatch,
						   int stringStartOffset, int stringLength, int returnMatchIndex,
						   PatternMatcher::Range* matchesBuffer ) const {
	const bool found = matches( stringSearch, stringStartOffset, matchesBuffer, stringLength );

	// No match: report the "not found" sentinel through both out-parameters.
	if ( !found ) {
		startMatch = -1;
		endMatch = -1;
		return false;
	}

	range( returnMatchIndex, startMatch, endMatch, matchesBuffer );
	return true;
}
bool PatternMatcher::find( const char* stringSearch, int& startMatch, int& endMatch,
int stringStartOffset, int stringLength, int returnMatchIndex ) const {
PatternMatcher::Range matchesBuffer[MAX_DEFAULT_MATCHES];
@@ -22,6 +35,12 @@ bool PatternMatcher::find( const char* stringSearch, int& startMatch, int& endMa
}
}
/**
 * std::string convenience overload of find() that uses a caller-supplied range buffer.
 *
 * Forwards the string's character data and size to the const char* overload that takes
 * matchesBuffer, and returns whatever that overload returns.
 */
bool PatternMatcher::find( const std::string& s, int& startMatch, int& endMatch, int offset,
						   int returnedMatchIndex, PatternMatcher::Range* matchesBuffer ) const {
	const char* text = s.c_str();
	// The const char* overload takes the length as int.
	const int textLength = static_cast<int>( s.size() );
	return find( text, startMatch, endMatch, offset, textLength, returnedMatchIndex,
				 matchesBuffer );
}
bool PatternMatcher::find( const std::string& s, int& startMatch, int& endMatch, int offset,
int returnedMatchIndex ) const {
return find( s.c_str(), startMatch, endMatch, offset, s.size(), returnedMatchIndex );

View File

@@ -88,13 +88,13 @@ bool RegEx::matches( const char* stringSearch, int stringStartOffset,
PCRE2_SPTR subject = reinterpret_cast<PCRE2_SPTR>( stringSearch );
int rc = pcre2_match( compiledPattern, // the compiled pattern
subject, // the subject string
stringLength, // the length of the subject
stringStartOffset, // start at offset in the subject
0, // default options
match_data, // match data
NULL // match context
int rc = pcre2_match( compiledPattern, // the compiled pattern
subject + stringStartOffset, // the subject string
stringLength - stringStartOffset, // the length of the subject
0, // start at offset in the subject
0, // default options
match_data, // match data
NULL // match context
);
if ( rc < 0 ) {
@@ -111,8 +111,8 @@ bool RegEx::matches( const char* stringSearch, int stringStartOffset,
if ( matchList != nullptr ) {
PCRE2_SIZE* ovector = pcre2_get_ovector_pointer( match_data );
for ( size_t i = 0; i < static_cast<size_t>( rc ); ++i ) {
matchList[i].start = static_cast<int>( ovector[2 * i] );
matchList[i].end = static_cast<int>( ovector[2 * i + 1] );
matchList[i].start = stringStartOffset + static_cast<int>( ovector[2 * i] );
matchList[i].end = stringStartOffset + static_cast<int>( ovector[2 * i + 1] );
if ( matchList[i].start >= matchList[i].end ) {
matchList[i].start = matchList[i].end = -1;
mMatchNum--;

View File

@@ -11,10 +11,10 @@ void addCPP() {
{ "%.cpp$", "%.cc$", "%.cxx$", "%.c++$", "%.hh$", "%.inl$", "%.hxx$", "%.hpp$", "%.h++$",
"%.tcc$" },
{
{ { "R%\"xml%(", "%)xml%\"" }, "function", "XML" },
{ { "R%\"css%(", "%)css%\"" }, "function", "CSS" },
{ { "R%\"html%(", "%)html%\"" }, "function", "HTML" },
{ { "R%\"json%(", "%)json%\"" }, "function", "JSON" },
{ { "R%\"(xml)%(", "%)(xml)%\"" }, { "string", "keyword2", "keyword2" }, "XML" },
{ { "R%\"(css)%(", "%)(css)%\"" }, { "string", "keyword2", "keyword2" }, "CSS" },
{ { "R%\"(html)%(", "%)(html)%\"" }, { "string", "keyword2", "keyword2" }, "HTML" },
{ { "R%\"(json)%(", "%)(json)%\"" }, { "string", "keyword2", "keyword2" }, "JSON" },
{ { "R\"[%a-\"]+%(", "%)[%a-\"]+%\"" }, "string" },
{ { "R\"%(", "%)\"" }, "string" },
{ { "//.-\n" }, "comment" },

View File

@@ -98,26 +98,36 @@ static bool isScaped( const std::string& text, const size_t& startIndex,
return count % 2 == 1;
}
static std::pair<int, int> findNonEscaped( const std::string& text, const std::string& pattern,
int offset, const std::string& escapeStr,
bool isRegEx ) {
/** Result of findNonEscaped(): the reported range plus the ranges copied from the matcher. */
struct NonEscapedMatch {
	/** Start / end of the reported match; { -1, -1 } means nothing was found. */
	std::pair<int, int> range = { -1, -1 };
	/** Copy of the range buffer that was handed to PatternMatcher::find(). */
	PatternMatcher::Range matches[6];
	/** Number of matches the matcher reported (from getNumMatches()). */
	int numMatches = 0;
};
static NonEscapedMatch findNonEscaped( const std::string& text, const std::string& pattern,
int offset, const std::string& escapeStr, bool isRegEx ) {
eeASSERT( !pattern.empty() );
if ( pattern.empty() )
return std::make_pair( -1, -1 );
return {};
std::variant<RegEx, LuaPattern> wordsVar =
isRegEx ? std::variant<RegEx, LuaPattern>( RegEx( pattern ) )
: std::variant<RegEx, LuaPattern>( LuaPattern( pattern ) );
PatternMatcher& words =
std::visit( []( auto& patternType ) -> PatternMatcher& { return patternType; }, wordsVar );
int start, end;
while ( words.find( text, start, end, offset ) ) {
PatternMatcher::Range matches[6];
while ( words.find( text, start, end, offset, 0, matches ) ) {
if ( !escapeStr.empty() && isScaped( text, start, escapeStr ) ) {
offset = end;
} else {
return std::make_pair( start, end );
NonEscapedMatch res;
res.range = { start, end };
res.numMatches = words.getNumMatches();
std::memcpy( res.matches, matches, sizeof( matches ) );
return res;
}
}
return std::make_pair( -1, -1 );
return {};
}
SyntaxStateRestored SyntaxTokenizer::retrieveSyntaxState( const SyntaxDefinition& syntax,
@@ -184,6 +194,55 @@ static inline void popSubsyntax( SyntaxStateRestored& curState, SyntaxState& ret
curState = SyntaxTokenizer::retrieveSyntaxState( syntax, retState );
};
template <typename T>
static inline void
pushTokensToOpenCloseSubsyntax( int i, std::string_view textv, const SyntaxPattern* subsyntaxInfo,
const NonEscapedMatch& rangeSubsyntax, std::vector<T>& tokens ) {
if ( rangeSubsyntax.numMatches > 1 ) {
int patternMatchStart = rangeSubsyntax.matches[0].start;
int patternMatchEnd = rangeSubsyntax.matches[0].end;
auto patternType = subsyntaxInfo->types[0];
int lastStart = patternMatchStart;
int lastEnd = patternMatchEnd;
if ( i < patternMatchStart )
pushToken( tokens, patternType, textv.substr( i, patternMatchStart - i ) );
int start;
int end;
for ( int sidx = 1; sidx < rangeSubsyntax.numMatches; sidx++ ) {
start = rangeSubsyntax.matches[sidx].start;
end = rangeSubsyntax.matches[sidx].end;
if ( sidx == 1 && start > lastStart ) {
pushToken( tokens, patternType,
textv.substr( patternMatchStart, start - patternMatchStart ) );
} else if ( start > lastEnd ) {
pushToken( tokens, patternType, textv.substr( lastEnd, start - lastEnd ) );
}
auto ss{ textv.substr( start, end - start ) };
pushToken( tokens,
sidx < static_cast<int>( subsyntaxInfo->types.size() )
? subsyntaxInfo->types[sidx]
: subsyntaxInfo->types[0],
ss );
if ( sidx == rangeSubsyntax.numMatches - 1 && end < patternMatchEnd ) {
pushToken( tokens, patternType, textv.substr( end, patternMatchEnd - end ) );
}
lastStart = start;
lastEnd = end;
}
} else {
pushToken( tokens, subsyntaxInfo->types[0],
textv.substr( i, rangeSubsyntax.range.second - i ) );
}
}
template <typename T>
static inline std::pair<std::vector<T>, SyntaxState>
_tokenize( const SyntaxDefinition& syntax, const std::string& text, const SyntaxState& state,
@@ -211,28 +270,29 @@ _tokenize( const SyntaxDefinition& syntax, const std::string& text, const Syntax
if ( curState.currentPatternIdx != SYNTAX_TOKENIZER_STATE_NONE ) {
const SyntaxPattern& pattern =
curState.currentSyntax->getPatterns()[curState.currentPatternIdx - 1];
std::pair<int, int> range = findNonEscaped(
text, pattern.patterns[1], i,
pattern.patterns.size() >= 3 ? pattern.patterns[2] : "", pattern.isRegEx );
auto range = findNonEscaped( text, pattern.patterns[1], i,
pattern.patterns.size() >= 3 ? pattern.patterns[2] : "",
pattern.isRegEx )
.range;
bool skip = false;
if ( curState.subsyntaxInfo != nullptr ) {
std::pair<int, int> rangeSubsyntax =
findNonEscaped( text, curState.subsyntaxInfo->patterns[1], i,
curState.subsyntaxInfo->patterns.size() >= 3
? curState.subsyntaxInfo->patterns[2]
: "",
pattern.isRegEx );
if ( curState.subsyntaxInfo != nullptr &&
curState.subsyntaxInfo->patterns.size() > 1 ) {
auto rangeSubsyntax = findNonEscaped( text, curState.subsyntaxInfo->patterns[1], i,
curState.subsyntaxInfo->patterns.size() >= 3
? curState.subsyntaxInfo->patterns[2]
: "",
pattern.isRegEx );
if ( rangeSubsyntax.first != -1 &&
( range.first == -1 || rangeSubsyntax.first < range.first ) ) {
if ( rangeSubsyntax.range.first != -1 &&
( range.first == -1 || rangeSubsyntax.range.first < range.first ) ) {
if ( !skipSubSyntaxSeparator ) {
pushToken( tokens, curState.subsyntaxInfo->types[0],
textv.substr( i, rangeSubsyntax.second - i ) );
pushTokensToOpenCloseSubsyntax( i, textv, curState.subsyntaxInfo,
rangeSubsyntax, tokens );
}
popSubsyntax( curState, retState, syntax );
i = rangeSubsyntax.second;
i = rangeSubsyntax.range.second;
skip = true;
}
}
@@ -255,20 +315,20 @@ _tokenize( const SyntaxDefinition& syntax, const std::string& text, const Syntax
}
}
if ( curState.subsyntaxInfo != nullptr ) {
std::pair<int, int> rangeSubsyntax = findNonEscaped(
if ( curState.subsyntaxInfo != nullptr && curState.subsyntaxInfo->patterns.size() > 1 ) {
auto rangeSubsyntax = findNonEscaped(
text, "^" + curState.subsyntaxInfo->patterns[1], i,
curState.subsyntaxInfo->patterns.size() >= 3 ? curState.subsyntaxInfo->patterns[2]
: "",
curState.subsyntaxInfo->isRegEx );
if ( rangeSubsyntax.first != -1 ) {
if ( rangeSubsyntax.range.first != -1 ) {
if ( !skipSubSyntaxSeparator ) {
pushToken( tokens, curState.subsyntaxInfo->types[0],
textv.substr( i, rangeSubsyntax.second - i ) );
pushTokensToOpenCloseSubsyntax( i, textv, curState.subsyntaxInfo,
rangeSubsyntax, tokens );
}
popSubsyntax( curState, retState, syntax );
i = rangeSubsyntax.second;
i = rangeSubsyntax.range.second;
}
}
@@ -334,7 +394,8 @@ _tokenize( const SyntaxDefinition& syntax, const std::string& text, const Syntax
patternText );
}
if ( pattern.hasSyntax() ) {
if ( pattern.hasSyntax() && curMatch == numMatches - 1 &&
end == patternMatchEnd ) {
pushSubsyntax( curState, retState, pattern, patternIndex + 1,
patternStr );
} else if ( pattern.patterns.size() > 1 ) {
@@ -347,6 +408,11 @@ _tokenize( const SyntaxDefinition& syntax, const std::string& text, const Syntax
pushToken( tokens, patternType,
textv.substr( end, patternMatchEnd - end ) );
i = patternMatchEnd;
if ( pattern.hasSyntax() && curMatch == numMatches - 1 ) {
pushSubsyntax( curState, retState, pattern, patternIndex + 1,
patternStr );
}
}
matched = true;