mirror of
https://github.com/SpartanJ/eepp.git
synced 2026-05-31 10:36:30 +03:00
SyntaxTokenizer improvements. Now it's possible to set captures to sub-syntaxes and ranges.
Fix in RegEx that prevented to ^ work properly in tokenizer.
This commit is contained in:
@@ -114,6 +114,12 @@ class EE_API PatternMatcher {
|
||||
bool find( const char* stringSearch, int& startMatch, int& endMatch, int stringStartOffset = 0,
|
||||
int stringLength = 0, int returnMatchIndex = 0 ) const;
|
||||
|
||||
bool find( const std::string& s, int& startMatch, int& endMatch, int offset,
|
||||
int returnedMatchIndex, PatternMatcher::Range* matchesBuffer ) const;
|
||||
|
||||
bool find( const char* stringSearch, int& startMatch, int& endMatch, int stringStartOffset,
|
||||
int stringLength, int returnMatchIndex, PatternMatcher::Range* matchesBuffer ) const;
|
||||
|
||||
std::string gsub( const char* text, const char* replace );
|
||||
|
||||
std::string gsub( const std::string& text, const std::string& replace );
|
||||
|
||||
@@ -9,6 +9,19 @@ namespace EE { namespace System {
|
||||
|
||||
#define MAX_DEFAULT_MATCHES 12
|
||||
|
||||
bool PatternMatcher::find( const char* stringSearch, int& startMatch, int& endMatch,
|
||||
int stringStartOffset, int stringLength, int returnMatchIndex,
|
||||
PatternMatcher::Range* matchesBuffer ) const {
|
||||
if ( matches( stringSearch, stringStartOffset, matchesBuffer, stringLength ) ) {
|
||||
range( returnMatchIndex, startMatch, endMatch, matchesBuffer );
|
||||
return true;
|
||||
} else {
|
||||
startMatch = -1;
|
||||
endMatch = -1;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool PatternMatcher::find( const char* stringSearch, int& startMatch, int& endMatch,
|
||||
int stringStartOffset, int stringLength, int returnMatchIndex ) const {
|
||||
PatternMatcher::Range matchesBuffer[MAX_DEFAULT_MATCHES];
|
||||
@@ -22,6 +35,12 @@ bool PatternMatcher::find( const char* stringSearch, int& startMatch, int& endMa
|
||||
}
|
||||
}
|
||||
|
||||
bool PatternMatcher::find( const std::string& s, int& startMatch, int& endMatch, int offset,
|
||||
int returnedMatchIndex, PatternMatcher::Range* matchesBuffer ) const {
|
||||
return find( s.c_str(), startMatch, endMatch, offset, s.size(), returnedMatchIndex,
|
||||
matchesBuffer );
|
||||
}
|
||||
|
||||
bool PatternMatcher::find( const std::string& s, int& startMatch, int& endMatch, int offset,
|
||||
int returnedMatchIndex ) const {
|
||||
return find( s.c_str(), startMatch, endMatch, offset, s.size(), returnedMatchIndex );
|
||||
|
||||
@@ -88,13 +88,13 @@ bool RegEx::matches( const char* stringSearch, int stringStartOffset,
|
||||
|
||||
PCRE2_SPTR subject = reinterpret_cast<PCRE2_SPTR>( stringSearch );
|
||||
|
||||
int rc = pcre2_match( compiledPattern, // the compiled pattern
|
||||
subject, // the subject string
|
||||
stringLength, // the length of the subject
|
||||
stringStartOffset, // start at offset in the subject
|
||||
0, // default options
|
||||
match_data, // match data
|
||||
NULL // match context
|
||||
int rc = pcre2_match( compiledPattern, // the compiled pattern
|
||||
subject + stringStartOffset, // the subject string
|
||||
stringLength - stringStartOffset, // the length of the subject
|
||||
0, // start at offset in the subject
|
||||
0, // default options
|
||||
match_data, // match data
|
||||
NULL // match context
|
||||
);
|
||||
|
||||
if ( rc < 0 ) {
|
||||
@@ -111,8 +111,8 @@ bool RegEx::matches( const char* stringSearch, int stringStartOffset,
|
||||
if ( matchList != nullptr ) {
|
||||
PCRE2_SIZE* ovector = pcre2_get_ovector_pointer( match_data );
|
||||
for ( size_t i = 0; i < static_cast<size_t>( rc ); ++i ) {
|
||||
matchList[i].start = static_cast<int>( ovector[2 * i] );
|
||||
matchList[i].end = static_cast<int>( ovector[2 * i + 1] );
|
||||
matchList[i].start = stringStartOffset + static_cast<int>( ovector[2 * i] );
|
||||
matchList[i].end = stringStartOffset + static_cast<int>( ovector[2 * i + 1] );
|
||||
if ( matchList[i].start >= matchList[i].end ) {
|
||||
matchList[i].start = matchList[i].end = -1;
|
||||
mMatchNum--;
|
||||
|
||||
@@ -11,10 +11,10 @@ void addCPP() {
|
||||
{ "%.cpp$", "%.cc$", "%.cxx$", "%.c++$", "%.hh$", "%.inl$", "%.hxx$", "%.hpp$", "%.h++$",
|
||||
"%.tcc$" },
|
||||
{
|
||||
{ { "R%\"xml%(", "%)xml%\"" }, "function", "XML" },
|
||||
{ { "R%\"css%(", "%)css%\"" }, "function", "CSS" },
|
||||
{ { "R%\"html%(", "%)html%\"" }, "function", "HTML" },
|
||||
{ { "R%\"json%(", "%)json%\"" }, "function", "JSON" },
|
||||
{ { "R%\"(xml)%(", "%)(xml)%\"" }, { "string", "keyword2", "keyword2" }, "XML" },
|
||||
{ { "R%\"(css)%(", "%)(css)%\"" }, { "string", "keyword2", "keyword2" }, "CSS" },
|
||||
{ { "R%\"(html)%(", "%)(html)%\"" }, { "string", "keyword2", "keyword2" }, "HTML" },
|
||||
{ { "R%\"(json)%(", "%)(json)%\"" }, { "string", "keyword2", "keyword2" }, "JSON" },
|
||||
{ { "R\"[%a-\"]+%(", "%)[%a-\"]+%\"" }, "string" },
|
||||
{ { "R\"%(", "%)\"" }, "string" },
|
||||
{ { "//.-\n" }, "comment" },
|
||||
|
||||
@@ -98,26 +98,36 @@ static bool isScaped( const std::string& text, const size_t& startIndex,
|
||||
return count % 2 == 1;
|
||||
}
|
||||
|
||||
static std::pair<int, int> findNonEscaped( const std::string& text, const std::string& pattern,
|
||||
int offset, const std::string& escapeStr,
|
||||
bool isRegEx ) {
|
||||
struct NonEscapedMatch {
|
||||
std::pair<int, int> range{ -1, -1 };
|
||||
PatternMatcher::Range matches[6];
|
||||
int numMatches{ 0 };
|
||||
};
|
||||
|
||||
static NonEscapedMatch findNonEscaped( const std::string& text, const std::string& pattern,
|
||||
int offset, const std::string& escapeStr, bool isRegEx ) {
|
||||
eeASSERT( !pattern.empty() );
|
||||
if ( pattern.empty() )
|
||||
return std::make_pair( -1, -1 );
|
||||
return {};
|
||||
std::variant<RegEx, LuaPattern> wordsVar =
|
||||
isRegEx ? std::variant<RegEx, LuaPattern>( RegEx( pattern ) )
|
||||
: std::variant<RegEx, LuaPattern>( LuaPattern( pattern ) );
|
||||
PatternMatcher& words =
|
||||
std::visit( []( auto& patternType ) -> PatternMatcher& { return patternType; }, wordsVar );
|
||||
int start, end;
|
||||
while ( words.find( text, start, end, offset ) ) {
|
||||
PatternMatcher::Range matches[6];
|
||||
while ( words.find( text, start, end, offset, 0, matches ) ) {
|
||||
if ( !escapeStr.empty() && isScaped( text, start, escapeStr ) ) {
|
||||
offset = end;
|
||||
} else {
|
||||
return std::make_pair( start, end );
|
||||
NonEscapedMatch res;
|
||||
res.range = { start, end };
|
||||
res.numMatches = words.getNumMatches();
|
||||
std::memcpy( res.matches, matches, sizeof( matches ) );
|
||||
return res;
|
||||
}
|
||||
}
|
||||
return std::make_pair( -1, -1 );
|
||||
return {};
|
||||
}
|
||||
|
||||
SyntaxStateRestored SyntaxTokenizer::retrieveSyntaxState( const SyntaxDefinition& syntax,
|
||||
@@ -184,6 +194,55 @@ static inline void popSubsyntax( SyntaxStateRestored& curState, SyntaxState& ret
|
||||
curState = SyntaxTokenizer::retrieveSyntaxState( syntax, retState );
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
static inline void
|
||||
pushTokensToOpenCloseSubsyntax( int i, std::string_view textv, const SyntaxPattern* subsyntaxInfo,
|
||||
const NonEscapedMatch& rangeSubsyntax, std::vector<T>& tokens ) {
|
||||
if ( rangeSubsyntax.numMatches > 1 ) {
|
||||
int patternMatchStart = rangeSubsyntax.matches[0].start;
|
||||
int patternMatchEnd = rangeSubsyntax.matches[0].end;
|
||||
auto patternType = subsyntaxInfo->types[0];
|
||||
int lastStart = patternMatchStart;
|
||||
int lastEnd = patternMatchEnd;
|
||||
|
||||
if ( i < patternMatchStart )
|
||||
pushToken( tokens, patternType, textv.substr( i, patternMatchStart - i ) );
|
||||
|
||||
int start;
|
||||
int end;
|
||||
|
||||
for ( int sidx = 1; sidx < rangeSubsyntax.numMatches; sidx++ ) {
|
||||
start = rangeSubsyntax.matches[sidx].start;
|
||||
end = rangeSubsyntax.matches[sidx].end;
|
||||
|
||||
if ( sidx == 1 && start > lastStart ) {
|
||||
pushToken( tokens, patternType,
|
||||
textv.substr( patternMatchStart, start - patternMatchStart ) );
|
||||
} else if ( start > lastEnd ) {
|
||||
pushToken( tokens, patternType, textv.substr( lastEnd, start - lastEnd ) );
|
||||
}
|
||||
|
||||
auto ss{ textv.substr( start, end - start ) };
|
||||
|
||||
pushToken( tokens,
|
||||
sidx < static_cast<int>( subsyntaxInfo->types.size() )
|
||||
? subsyntaxInfo->types[sidx]
|
||||
: subsyntaxInfo->types[0],
|
||||
ss );
|
||||
|
||||
if ( sidx == rangeSubsyntax.numMatches - 1 && end < patternMatchEnd ) {
|
||||
pushToken( tokens, patternType, textv.substr( end, patternMatchEnd - end ) );
|
||||
}
|
||||
|
||||
lastStart = start;
|
||||
lastEnd = end;
|
||||
}
|
||||
} else {
|
||||
pushToken( tokens, subsyntaxInfo->types[0],
|
||||
textv.substr( i, rangeSubsyntax.range.second - i ) );
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static inline std::pair<std::vector<T>, SyntaxState>
|
||||
_tokenize( const SyntaxDefinition& syntax, const std::string& text, const SyntaxState& state,
|
||||
@@ -211,28 +270,29 @@ _tokenize( const SyntaxDefinition& syntax, const std::string& text, const Syntax
|
||||
if ( curState.currentPatternIdx != SYNTAX_TOKENIZER_STATE_NONE ) {
|
||||
const SyntaxPattern& pattern =
|
||||
curState.currentSyntax->getPatterns()[curState.currentPatternIdx - 1];
|
||||
std::pair<int, int> range = findNonEscaped(
|
||||
text, pattern.patterns[1], i,
|
||||
pattern.patterns.size() >= 3 ? pattern.patterns[2] : "", pattern.isRegEx );
|
||||
auto range = findNonEscaped( text, pattern.patterns[1], i,
|
||||
pattern.patterns.size() >= 3 ? pattern.patterns[2] : "",
|
||||
pattern.isRegEx )
|
||||
.range;
|
||||
|
||||
bool skip = false;
|
||||
|
||||
if ( curState.subsyntaxInfo != nullptr ) {
|
||||
std::pair<int, int> rangeSubsyntax =
|
||||
findNonEscaped( text, curState.subsyntaxInfo->patterns[1], i,
|
||||
curState.subsyntaxInfo->patterns.size() >= 3
|
||||
? curState.subsyntaxInfo->patterns[2]
|
||||
: "",
|
||||
pattern.isRegEx );
|
||||
if ( curState.subsyntaxInfo != nullptr &&
|
||||
curState.subsyntaxInfo->patterns.size() > 1 ) {
|
||||
auto rangeSubsyntax = findNonEscaped( text, curState.subsyntaxInfo->patterns[1], i,
|
||||
curState.subsyntaxInfo->patterns.size() >= 3
|
||||
? curState.subsyntaxInfo->patterns[2]
|
||||
: "",
|
||||
pattern.isRegEx );
|
||||
|
||||
if ( rangeSubsyntax.first != -1 &&
|
||||
( range.first == -1 || rangeSubsyntax.first < range.first ) ) {
|
||||
if ( rangeSubsyntax.range.first != -1 &&
|
||||
( range.first == -1 || rangeSubsyntax.range.first < range.first ) ) {
|
||||
if ( !skipSubSyntaxSeparator ) {
|
||||
pushToken( tokens, curState.subsyntaxInfo->types[0],
|
||||
textv.substr( i, rangeSubsyntax.second - i ) );
|
||||
pushTokensToOpenCloseSubsyntax( i, textv, curState.subsyntaxInfo,
|
||||
rangeSubsyntax, tokens );
|
||||
}
|
||||
popSubsyntax( curState, retState, syntax );
|
||||
i = rangeSubsyntax.second;
|
||||
i = rangeSubsyntax.range.second;
|
||||
skip = true;
|
||||
}
|
||||
}
|
||||
@@ -255,20 +315,20 @@ _tokenize( const SyntaxDefinition& syntax, const std::string& text, const Syntax
|
||||
}
|
||||
}
|
||||
|
||||
if ( curState.subsyntaxInfo != nullptr ) {
|
||||
std::pair<int, int> rangeSubsyntax = findNonEscaped(
|
||||
if ( curState.subsyntaxInfo != nullptr && curState.subsyntaxInfo->patterns.size() > 1 ) {
|
||||
auto rangeSubsyntax = findNonEscaped(
|
||||
text, "^" + curState.subsyntaxInfo->patterns[1], i,
|
||||
curState.subsyntaxInfo->patterns.size() >= 3 ? curState.subsyntaxInfo->patterns[2]
|
||||
: "",
|
||||
curState.subsyntaxInfo->isRegEx );
|
||||
|
||||
if ( rangeSubsyntax.first != -1 ) {
|
||||
if ( rangeSubsyntax.range.first != -1 ) {
|
||||
if ( !skipSubSyntaxSeparator ) {
|
||||
pushToken( tokens, curState.subsyntaxInfo->types[0],
|
||||
textv.substr( i, rangeSubsyntax.second - i ) );
|
||||
pushTokensToOpenCloseSubsyntax( i, textv, curState.subsyntaxInfo,
|
||||
rangeSubsyntax, tokens );
|
||||
}
|
||||
popSubsyntax( curState, retState, syntax );
|
||||
i = rangeSubsyntax.second;
|
||||
i = rangeSubsyntax.range.second;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -334,7 +394,8 @@ _tokenize( const SyntaxDefinition& syntax, const std::string& text, const Syntax
|
||||
patternText );
|
||||
}
|
||||
|
||||
if ( pattern.hasSyntax() ) {
|
||||
if ( pattern.hasSyntax() && curMatch == numMatches - 1 &&
|
||||
end == patternMatchEnd ) {
|
||||
pushSubsyntax( curState, retState, pattern, patternIndex + 1,
|
||||
patternStr );
|
||||
} else if ( pattern.patterns.size() > 1 ) {
|
||||
@@ -347,6 +408,11 @@ _tokenize( const SyntaxDefinition& syntax, const std::string& text, const Syntax
|
||||
pushToken( tokens, patternType,
|
||||
textv.substr( end, patternMatchEnd - end ) );
|
||||
i = patternMatchEnd;
|
||||
|
||||
if ( pattern.hasSyntax() && curMatch == numMatches - 1 ) {
|
||||
pushSubsyntax( curState, retState, pattern, patternIndex + 1,
|
||||
patternStr );
|
||||
}
|
||||
}
|
||||
|
||||
matched = true;
|
||||
|
||||
Reference in New Issue
Block a user