RegEx fixes. TextMate grammars fixes.

This commit is contained in:
Martín Lucas Golini
2025-05-28 03:08:17 -03:00
parent e072649221
commit 0df86b1d53
10 changed files with 150 additions and 87 deletions

View File

@@ -24,8 +24,8 @@ class EE_API RegExCache {
protected:
bool mEnabled{ true };
UnorderedMap<String::HashType, void*> mCache;
UnorderedMap<String::HashType, Uint32> mCacheOpt;
std::unordered_map<std::size_t, void*> mCache;
std::unordered_map<std::size_t, Uint32> mCacheOpt;
};
class EE_API RegEx : public PatternMatcher {
@@ -65,7 +65,7 @@ class EE_API RegEx : public PatternMatcher {
FilterOutCaptures =
0x08000000u, // It will filter out repeated captures and same range captures
AllowFallback = 0x10000000u,
UseOnigmo = 0x20000000u,
UseOniguruma = 0x20000000u,
};
RegEx( std::string_view pattern, Uint32 options = Options::Utf | Options::AllowFallback,

View File

@@ -124,7 +124,7 @@ struct EE_API SyntaxPattern {
}
// Reports whether this pattern is an include pattern whose target, patterns[1],
// is one of the special root references compared against below.
// NOTE(review): this span is rendered from a diff without +/- markers; the two
// return statements look like the before/after versions of a single line (the
// later one additionally accepts "$base") — confirm against the real file.
inline bool checkIsRootSelfInclude() const {
return checkIsIncludePattern() && patterns[1] == "$self";
return checkIsIncludePattern() && ( patterns[1] == "$self" || patterns[1] == "$base" );
}
inline bool checkIsRepositoryInclude() const {

View File

@@ -24,20 +24,20 @@ RegExCache::~RegExCache() {
}
// Stores a compiled pattern handle in the cache.
// The map key is hashCombine( <hash of `key`>, `options` ), so the same pattern
// text with different options yields a separate entry.
//   key     - pattern text used to build the hash
//   options - option flags; hashed into the key and also stored in mCacheOpt
//   cache   - opaque compiled-pattern pointer stored in mCache
// NOTE(review): this span is rendered from a diff without +/- markers; the two
// `auto hash` lines look like the before/after versions of one statement
// (String::hash vs std::hash<std::string_view>) — confirm against the real file.
void RegExCache::insert( std::string_view key, Uint32 options, void* cache ) {
auto hash = hashCombine( String::hash( key ), options );
auto hash = hashCombine( std::hash<std::string_view>()( key ), options );
// Both maps are keyed by the same hash; clear() reads mCacheOpt by that key to
// decide how each cached handle has to be released.
mCache.insert( { hash, cache } );
mCacheOpt.insert( { hash, options } );
}
// Looks up a cached compiled pattern by the combined hash of `key` and `options`.
// Returns the pointer stored in mCache, or nullptr when there is no entry for
// that hash.
// NOTE(review): this span is rendered from a diff without +/- markers; the two
// `auto it` lines look like the before/after versions of one statement
// (String::hash vs std::hash<std::string_view>) — confirm against the real file.
void* RegExCache::find( const std::string_view& key, Uint32 options ) {
auto it = mCache.find( hashCombine( String::hash( key ), options ) );
auto it = mCache.find( hashCombine( std::hash<std::string_view>()( key ), options ) );
return ( it != mCache.end() ) ? it->second : nullptr;
}
void RegExCache::clear() {
for ( auto& cache : mCache ) {
auto opt = mCacheOpt.find( cache.first );
if ( opt->second & RegEx::Options::UseOnigmo )
if ( opt->second & RegEx::Options::UseOniguruma )
onig_free( static_cast<OnigRegex>( cache.second ) );
else
pcre2_code_free( reinterpret_cast<pcre2_code*>( cache.second ) );
@@ -65,7 +65,17 @@ RegEx::RegEx( std::string_view pattern, Uint32 options, bool useCache ) :
return;
}
if ( mOptions & Options::UseOnigmo ) {
if ( useCache && ( mOptions & Options::AllowFallback ) &&
!( mOptions & Options::UseOniguruma ) && RegExCache::instance()->isEnabled() &&
( mCompiledPattern =
RegExCache::instance()->find( pattern, mOptions | Options::UseOniguruma ) ) ) {
mValid = true;
mCached = true;
mOptions |= Options::UseOniguruma;
return;
}
if ( mOptions & Options::UseOniguruma ) {
initWithOnigmo( pattern, useCache );
return;
}
@@ -76,8 +86,8 @@ RegEx::RegEx( std::string_view pattern, Uint32 options, bool useCache ) :
if ( options & Options::AllowFallback )
options &= ~Options::AllowFallback;
if ( options & Options::UseOnigmo )
options &= ~Options::UseOnigmo;
if ( options & Options::UseOniguruma )
options &= ~Options::UseOniguruma;
mCompiledPattern = pcre2_compile( pattern_sptr, // the pattern
pattern.size(), // the length of the pattern
@@ -128,7 +138,7 @@ bool RegEx::matches( const char* stringSearch, int stringStartOffset,
return false;
}
if ( mOptions & Options::UseOnigmo ) {
if ( mOptions & Options::UseOniguruma ) {
OnigRegion* region = onig_region_new();
if ( !region ) {
Log::error( "Onigmo: onig_region_new() failed." );
@@ -162,6 +172,8 @@ bool RegEx::matches( const char* stringSearch, int stringStartOffset,
for ( int i = 0; i < region->num_regs; ++i ) {
int start = static_cast<int>( region->beg[i] );
int end = static_cast<int>( region->end[i] );
if ( start == -1 || end == -1 )
continue;
if ( !mFilterOutCaptures ||
( !( start == 0 && end == 0 ) && start != end &&
( curCap == 0 || !( matchList[curCap - 1].start == start &&
@@ -171,7 +183,8 @@ bool RegEx::matches( const char* stringSearch, int stringStartOffset,
curCap++;
}
}
mMatchNum = curCap;
if ( mMatchNum > 1 )
mMatchNum = curCap;
}
onig_region_free( region, 1 );
@@ -196,13 +209,13 @@ bool RegEx::matches( const char* stringSearch, int stringStartOffset,
PCRE2_SPTR subject = reinterpret_cast<PCRE2_SPTR>( stringSearch );
int rc = pcre2_match( compiledPattern, // the compiled pattern
subject + stringStartOffset, // the subject string
stringLength - stringStartOffset, // the length of the subject
0, // start at offset in the subject
0, // default options
match_data, // match data
NULL // match context
int rc = pcre2_match( compiledPattern, // the compiled pattern
subject, // the subject string
stringLength, // the length of the subject
stringStartOffset, // start at offset in the subject
0, // default options
match_data, // match data
NULL // match context
);
if ( rc < 0 ) {
@@ -220,8 +233,8 @@ bool RegEx::matches( const char* stringSearch, int stringStartOffset,
PCRE2_SIZE* ovector = pcre2_get_ovector_pointer( match_data );
int curCap = 0;
for ( size_t i = 0; i < static_cast<size_t>( rc ); ++i ) {
int start = stringStartOffset + static_cast<int>( ovector[2 * i] );
int end = stringStartOffset + static_cast<int>( ovector[2 * i + 1] );
int start = static_cast<int>( ovector[2 * i] );
int end = static_cast<int>( ovector[2 * i + 1] );
if ( !mFilterOutCaptures ||
( !( start == 0 && end == 0 ) && start != end &&
( curCap == 0 || !( matchList[curCap - 1].start == start &&
@@ -279,7 +292,7 @@ bool RegEx::initWithOnigmo( std::string_view pattern, bool useCache ) {
mCompiledPattern = regex;
mValid = true;
mOptions |= Options::UseOnigmo;
mOptions |= Options::UseOniguruma;
mCaptureCount = onig_number_of_captures( static_cast<OnigRegex>( mCompiledPattern ) );
if ( useCache && RegExCache::instance()->isEnabled() ) {

View File

@@ -55,7 +55,7 @@ template <typename SyntaxStyleType> void updateCache( const SyntaxPattern& ptrn
// Points `ptrn` at the definition that owns it and replaces a placeholder
// syntax name with that definition's language name.
//   def  - definition whose address is stored in ptrn.def
//   ptrn - pattern updated in place
static void updatePatternRefs( const SyntaxDefinition& def, SyntaxPattern& ptrn ) {
ptrn.def = &def;
// NOTE(review): this span is rendered from a diff without +/- markers; the two
// `if` lines look like the before/after versions of one condition (the later
// one also treats "$base" as a placeholder) — confirm against the real file.
if ( ptrn.syntax == "$self" )
if ( ptrn.syntax == "$self" || ptrn.syntax == "$base" )
ptrn.syntax = def.getLanguageName();
}

View File

@@ -41,7 +41,8 @@ class TextMateScopeMapper {
{ "variable.parameter", "keyword3" }, // Function parameters
{ "variable.language", "literal" }, // Language constants like 'this', 'self', 'null'?
{ "variable.identifier", "normal" },
{ "storage.type", "keyword2" }, // Class, struct, int, bool etc. (declaration)
{ "variable.function", "function" },
{ "storage.type", "keyword" }, // Class, struct, int, bool etc. (declaration)
{ "entity.name.function", "function" }, // Function definition name
{ "entity.name.type", "keyword2" }, // Type name (class, struct, etc.) in definition
{ "entity.name.class", "keyword2" }, // Class name in definition
@@ -655,10 +656,18 @@ static SyntaxPattern parsePattern( const nlohmann::json& pattern ) {
for ( Uint64 i = 0; i < totalCaptures; i++ ) {
auto capNumStr = String::toString( i );
if ( captures.contains( capNumStr ) && captures[capNumStr].contains( "name" ) ) {
type.emplace_back(
TextMateScopeMapper::scopeToType( captures[capNumStr].value( "name", "" ) ) );
auto ctype =
TextMateScopeMapper::scopeToType( captures[capNumStr].value( "name", "" ) );
if ( i < type.size() )
type[i] = ctype;
else
type.emplace_back( ctype );
} else if ( parent.contains( "name" ) ) {
type.emplace_back( TextMateScopeMapper::scopeToType( parent.value( "name", "" ) ) );
auto ctype = TextMateScopeMapper::scopeToType( parent.value( "name", "" ) );
if ( i < type.size() )
type[i] = ctype;
else
type.emplace_back( ctype );
} else {
type.emplace_back( "normal" );
}
@@ -810,6 +819,13 @@ static SyntaxDefinition loadTextMateLanguage( const nlohmann::json& json, Syntax
auto ext( file.get<std::string>() );
def.addFileType( ( !String::contains( ext, "." ) ? "%." : "" ) + ext + "$" );
}
} else if ( json.contains( "filetypes" ) && json["filetypes"].is_array() ) {
const auto& files = json["filetypes"];
for ( const auto& file : files )
if ( file.is_string() ) {
auto ext( file.get<std::string>() );
def.addFileType( ( !String::contains( ext, "." ) ? "%." : "" ) + ext + "$" );
}
} else if ( json.contains( "scopeName" ) && json["scopeName"].is_string() ) {
const auto& scopeName = json.value( "scopeName", "" );
def.addFileType( "%." + FileSystem::fileExtension( scopeName ) + "$" );

View File

@@ -13,6 +13,8 @@ using namespace EE::System;
namespace EE { namespace UI { namespace Doc {
// Base option set (Utf | AllowFallback) passed to the RegEx matchers built in
// this file; call sites OR in extra flags such as Anchored where needed.
static constexpr auto REGEX_FLAGS = RegEx::Options::Utf | RegEx::Options::AllowFallback;
struct PatternStackItem {
const std::vector<SyntaxPattern>* patterns{ nullptr };
size_t index = 0;
@@ -49,6 +51,8 @@ static int isInMultiByteCodePoint( const char* text, const size_t& textSize, con
template <typename T>
static void pushToken( std::vector<T>& tokens, const SyntaxStyleType& type,
const std::string_view& text ) {
if ( text.empty() )
return;
if ( !tokens.empty() && ( tokens[tokens.size() - 1].type == type ) ) {
size_t tpos = tokens.size() - 1;
tokens[tpos].type = type;
@@ -129,7 +133,7 @@ static NonEscapedMatch findNonEscaped( const std::string& text, const std::strin
? std::variant<RegEx, LuaPattern, ParserMatcher>( LuaPattern( pattern ) )
: ( matchType == SyntaxPatternMatchType::RegEx
? std::variant<RegEx, LuaPattern, ParserMatcher>(
RegEx( pattern, RegEx::Options::Utf | RegEx::Options::AllowFallback ) )
RegEx( pattern, REGEX_FLAGS ) )
: std::variant<RegEx, LuaPattern, ParserMatcher>( ParserMatcher( pattern ) ) );
PatternMatcher& words =
std::visit( []( auto& patternType ) -> PatternMatcher& { return patternType; }, wordsVar );
@@ -275,6 +279,9 @@ static inline void pushTokensToOpenCloseSubsyntax( int i, std::string_view textv
start = rangeSubsyntax.matches[sidx].start;
end = rangeSubsyntax.matches[sidx].end;
if ( start == -1 || end == -1 )
continue;
if ( sidx == 1 && start > lastStart ) {
pushToken( tokens, patternType,
textv.substr( patternMatchStart, start - patternMatchStart ) );
@@ -333,9 +340,8 @@ _tokenize( const SyntaxDefinition& syntax, const std::string& text, const Syntax
pattern.matchType == SyntaxPatternMatchType::LuaPattern
? std::variant<RegEx, LuaPattern, ParserMatcher>( LuaPattern( patternStr ) )
: ( pattern.matchType == SyntaxPatternMatchType::RegEx
? std::variant<RegEx, LuaPattern, ParserMatcher>( RegEx(
patternStr, RegEx::Options::Utf | RegEx::Options::AllowFallback |
RegEx::Options::Anchored ) )
? std::variant<RegEx, LuaPattern, ParserMatcher>(
RegEx( patternStr, REGEX_FLAGS | RegEx::Options::Anchored ) )
: std::variant<RegEx, LuaPattern, ParserMatcher>(
ParserMatcher( patternStr ) ) );
PatternMatcher& words = std::visit(
@@ -443,69 +449,83 @@ _tokenize( const SyntaxDefinition& syntax, const std::string& text, const Syntax
for ( size_t curMatch = 1; curMatch < numMatches; curMatch++ ) {
start = matches[curMatch].start;
end = matches[curMatch].end;
if ( start == end || start < 0 || end < 0 )
continue;
if ( pattern.patterns.size() >= 3 && startIdx > 0 &&
text[startIdx - 1] == pattern.patterns[2][0] )
continue;
Uint8 lead = ( 0xff & ( text[start] ) );
if ( !( lead < 0x80 ) ) {
char* strStart = const_cast<char*>( text.c_str() + start );
char* strEnd = strStart;
String::utf8Next( strEnd );
end = start + ( strEnd - strStart );
}
if ( curMatch == 1 && start > lastStart ) {
pushToken( tokens, patternType,
textv.substr( fullMatchStart, start - fullMatchStart ) );
} else if ( start > lastEnd ) {
pushToken( tokens, patternType,
textv.substr( lastEnd, start - lastEnd ) );
}
patternText = textv.substr( start, end - start );
SyntaxStyleType type =
curMatch < pattern.types.size() &&
( pattern.types[curMatch] == SyntaxStyleTypes::Symbol ||
pattern.types[curMatch] == SyntaxStyleTypes::Normal )
? curState.currentSyntax->getSymbol(
( patternTextStr = patternText ) )
: SyntaxStyleEmpty();
if ( !( start == end || start < 0 || end < 0 ) &&
!( pattern.patterns.size() >= 3 && startIdx > 0 &&
text[startIdx - 1] == pattern.patterns[2][0] ) ) {
Uint8 lead = ( 0xff & ( text[start] ) );
if ( !( lead < 0x80 ) ) {
char* strStart = const_cast<char*>( text.c_str() + start );
char* strEnd = strStart;
String::utf8Next( strEnd );
end = start + ( strEnd - strStart );
}
if ( curMatch == 1 && start > lastStart ) {
pushToken( tokens, patternType,
textv.substr( fullMatchStart, start - fullMatchStart ) );
} else if ( start > lastEnd ) {
pushToken( tokens, patternType,
textv.substr( lastEnd, start - lastEnd ) );
}
if ( !skipSubSyntaxSeparator || !pattern.hasSyntaxOrContentScope() ) {
pushToken( tokens,
type == SyntaxStyleEmpty()
? ( curMatch < pattern.types.size()
? pattern.types[curMatch]
: pattern.types[0] )
: type,
patternText );
}
patternText = textv.substr( start, end - start );
SyntaxStyleType type =
curMatch < pattern.types.size() &&
( pattern.types[curMatch] == SyntaxStyleTypes::Symbol ||
pattern.types[curMatch] == SyntaxStyleTypes::Normal )
? curState.currentSyntax->getSymbol(
( patternTextStr = patternText ) )
: SyntaxStyleEmpty();
if ( pattern.isRangedMatch() && curMatch == numMatches - 1 &&
end == fullMatchEnd ) {
pushStack(
curState, retState, pattern, patternIndex,
textv.substr( fullMatchStart, fullMatchEnd - fullMatchStart ) );
}
if ( !skipSubSyntaxSeparator || !pattern.hasSyntaxOrContentScope() ) {
pushToken( tokens,
type == SyntaxStyleEmpty()
? ( curMatch < pattern.types.size()
? pattern.types[curMatch]
: pattern.types[0] )
: type,
patternText );
}
startIdx = end;
if ( curMatch == numMatches - 1 && end < fullMatchEnd ) {
pushToken( tokens, patternType,
textv.substr( end, fullMatchEnd - end ) );
startIdx = fullMatchEnd;
if ( pattern.isRangedMatch() && curMatch == numMatches - 1 ) {
if ( pattern.isRangedMatch() && curMatch == numMatches - 1 &&
end == fullMatchEnd ) {
pushStack(
curState, retState, pattern, patternIndex,
textv.substr( fullMatchStart, fullMatchEnd - fullMatchStart ) );
}
startIdx = end;
if ( curMatch == numMatches - 1 && end < fullMatchEnd ) {
pushToken( tokens, patternType,
textv.substr( end, fullMatchEnd - end ) );
startIdx = fullMatchEnd;
end = fullMatchEnd;
if ( pattern.isRangedMatch() && curMatch == numMatches - 1 ) {
pushStack( curState, retState, pattern, patternIndex,
textv.substr( fullMatchStart,
fullMatchEnd - fullMatchStart ) );
}
}
}
lastStart = start;
lastEnd = end;
}
if ( lastEnd < fullMatchEnd ) {
pushToken( tokens, patternType,
textv.substr( lastEnd, fullMatchEnd - lastEnd ) );
startIdx = fullMatchEnd;
if ( pattern.isRangedMatch() ) {
pushStack(
curState, retState, pattern, patternIndex,
textv.substr( fullMatchStart, fullMatchEnd - fullMatchStart ) );
}
startIdx = fullMatchEnd;
}
return true;
}
} else {

View File

@@ -32,7 +32,7 @@ void addFixScript() {
{ "else", "keyword" }, { "break", "keyword" }, { "const", "keyword" },
{ "while", "keyword" }, { "import", "keyword" }, { "return", "keyword" },
{ "switch", "keyword" }, { "default", "keyword" }, { "continue", "keyword" },
{ "function", "function" },
{ "function", "keyword" },
{ "macro", "keyword" }, { "generate", "keyword" }, { "output", "keyword" },

View File

@@ -11,9 +11,9 @@ void addJanet() {
{ "Janet",
{ "%.janet$" },
{
{ { "(@?)```", "```", "\\" }, "string" },
{ { "(@?)``", "``", "\\" }, "string" },
{ { "(@?)`", "`", "\\" }, "string" },
{ { "(@?)```", "```" }, "string" },
{ { "(@?)``", "``" }, "string" },
{ { "(@?)`", "`" }, "string" },
{ { "\"", "\"", "\\" }, "string" },
{ { "0x[%da-fA-F]+" }, "number" },
{ { "-?%d+[%d%.eE]*f?" }, "number" },

View File

@@ -12,8 +12,8 @@ void addRescript() {
{
{ { "//.-\n" }, "comment" },
{ { "/%*", "%*/" }, "comment" },
{ { "\"", "\"", "\\" }, "string" },
{ { "`", "`", "\\" }, "string" },
{ { "\\?\"", "\"", "\\" }, "string" },
{ { "\\?`", "`", "\\" }, "string" },
{ { "%f[^<]![%a_][%w%_%-]*" }, "keyword2" },
{ { "%f[^<][%a_][%w%_%-]*" }, "function" },

View File

@@ -83,3 +83,17 @@ UTEST( LuaPattern, basicTest ) {
EXPECT_EQ( end, 16 );
}
}
// Runs the same zero-width look-behind pattern through the PCRE2 engine and the
// Oniguruma engine, starting the match at a non-zero offset, and expects both
// engines to report an empty match whose start and end equal that offset.
UTEST( RegExEngines, basicTest ) {
	const int kStartOffset = 38;
	const char* kLookBehindParen = "(?<=\\))";
	std::string subject = " ,(render-posts a-blog)))))\n";

	RegEx pcreEngine( kLookBehindParen, RegEx::Options::Utf );
	RegEx onigEngine( kLookBehindParen, RegEx::Options::Utf | RegEx::Options::UseOniguruma );

	PatternMatcher::Range pcreRanges[10];
	PatternMatcher::Range onigRanges[10];

	pcreEngine.matches( subject, pcreRanges, kStartOffset );
	onigEngine.matches( subject, onigRanges, kStartOffset );

	// Range 0 is the full match; for a look-behind it is empty and sits at the offset.
	EXPECT_EQ( kStartOffset, pcreRanges[0].start );
	EXPECT_EQ( kStartOffset, pcreRanges[0].end );
	EXPECT_EQ( kStartOffset, onigRanges[0].start );
	EXPECT_EQ( kStartOffset, onigRanges[0].end );
}