Added support for subsyntax languages (a language embedded inside a parent language, useful for HTML and Markdown files).

This commit is contained in:
Martín Lucas Golini
2022-03-14 02:05:25 -03:00
parent 775b65ffa6
commit 82f859fcd4
8 changed files with 216 additions and 44 deletions

View File

@@ -2,6 +2,7 @@
#define EE_UI_DOC_DEFINITION_HPP
#include <eepp/config.hpp>
#include <eepp/core/string.hpp>
#include <string>
#include <unordered_map>
#include <vector>
@@ -11,6 +12,7 @@ namespace EE { namespace UI { namespace Doc {
// A single highlighting rule of a syntax definition.
struct EE_API SyntaxPattern {
// patterns[0] is the pattern that starts a match. When more than one entry is present the
// tokenizer treats the rule as a region: patterns[1] is searched for to close it and the
// optional patterns[2] is handed to that search (findNonEscaped) as its last argument.
std::vector<std::string> patterns;
// Token type given to the text matched by this rule (e.g. "comment", "string").
std::string type;
// Name of the language definition used to tokenize the contents of the region; empty when
// the rule does not open a subsyntax.
std::string syntax{ "" };
};
class EE_API SyntaxDefinition {
@@ -26,6 +28,8 @@ class EE_API SyntaxDefinition {
const std::string& getLanguageName() const;
const String::HashType& getLanguageId() const;
const std::vector<std::string>& getFiles() const;
std::string getFileExtension() const;
@@ -63,6 +67,7 @@ class EE_API SyntaxDefinition {
protected:
std::string mLanguageName;
String::HashType mLanguageId;
std::vector<std::string> mFiles;
std::vector<SyntaxPattern> mPatterns;
std::unordered_map<std::string, std::string> mSymbols;

View File

@@ -25,6 +25,8 @@ class EE_API SyntaxDefinitionManager {
const SyntaxDefinition& getStyleByLanguageName( const std::string& name ) const;
const SyntaxDefinition& getStyleByLanguageId( const String::HashType& id) const;
SyntaxDefinition& getStyleByLanguageNameRef( const std::string& name );
std::vector<std::string> getLanguageNames() const;

View File

@@ -8,10 +8,10 @@
namespace EE { namespace UI { namespace Doc {
// Tokenization result kept for one document line: the tokenizer state the line was started
// with (initState), the hash of the line at the time it was tokenized, the tokens produced and
// the tokenizer state left after the line (state), which seeds the following line.
struct TokenizedLine {
int initState;
Uint64 initState;
String::HashType hash;
std::vector<SyntaxToken> tokens;
int state;
Uint64 state;
};
class EE_API SyntaxHighlighter {
@@ -37,7 +37,7 @@ class EE_API SyntaxHighlighter {
std::map<size_t, TokenizedLine> mLines;
Int64 mFirstInvalidLine;
Int64 mMaxWantedLine;
TokenizedLine tokenizeLine( const size_t& line, const int& state );
TokenizedLine tokenizeLine( const size_t& line, const Uint64& state );
};
}}} // namespace EE::UI::Doc

View File

@@ -13,14 +13,14 @@ struct EE_API SyntaxToken {
std::string text;
};
// Tokenizer state value meaning that no multi-line pattern is open.
#define SYNTAX_TOKENIZER_STATE_NONE ( -1 )
#define SYNTAX_TOKENIZER_STATE_NONE ( 0 )
// Exposes the static tokenize() entry point used by the syntax highlighter.
class EE_API SyntaxTokenizer {
public:
// Splits `text` into typed tokens using the rules of `syntax`, starting at `startIndex`.
// `state` is the tokenizer state carried over from the previous line
// (SYNTAX_TOKENIZER_STATE_NONE when there is none). The returned pair holds the tokens and
// the state to pass in when tokenizing the next line.
std::pair<std::vector<SyntaxToken>, int> static tokenize( const SyntaxDefinition& syntax,
const std::string& text,
const int& state,
const size_t& startIndex = 0 );
static std::pair<std::vector<SyntaxToken>, Uint64> tokenize( const SyntaxDefinition& syntax,
const std::string& text,
const Uint64& state,
const size_t& startIndex = 0 );
};
}}} // namespace EE::UI::Doc

View File

@@ -13,6 +13,7 @@ SyntaxDefinition::SyntaxDefinition( const std::string& languageName,
const std::string& comment,
const std::vector<std::string> headers ) :
mLanguageName( languageName ),
mLanguageId( String::hash( languageName ) ),
mFiles( files ),
mPatterns( patterns ),
mSymbols( symbols ),
@@ -67,7 +68,7 @@ SyntaxDefinition& SyntaxDefinition::addPatternToFront( const SyntaxPattern& patt
auto patterns = mPatterns;
mPatterns.clear();
mPatterns.push_back( pattern );
for ( auto pa : patterns )
for ( const auto& pa : patterns )
mPatterns.push_back( pa );
return *this;
}
@@ -112,4 +113,8 @@ const std::string& SyntaxDefinition::getLanguageName() const {
return mLanguageName;
}
// Returns the id of this definition: the hash of the language name, computed once in the
// constructor.
const String::HashType& SyntaxDefinition::getLanguageId() const {
return mLanguageId;
}
}}} // namespace EE::UI::Doc

View File

@@ -19,9 +19,9 @@ SyntaxDefinitionManager::SyntaxDefinitionManager() {
// Plain text
add( { "Plain Text", { "%.txt$" }, {} } );
// XML - HTML
// XML
add( { "XML",
{ "%.xml$", "%.html?$", "%.svg$" },
{ "%.xml$", "%.svg$" },
{
{ { "<!%-%-", "%-%->" }, "comment" },
{ { "%f[^>][^<]", "%f[<]" }, "normal" },
@@ -38,7 +38,39 @@ SyntaxDefinitionManager::SyntaxDefinitionManager() {
},
{},
"",
{ "<%?xml", "<![Dd][Oo][Cc][Tt][Yy][Pp][Ee]%s[Hh][Tt][Mm][Ll]>" } } );
{ "<%?xml" } } );
// HTML
// The first three rules name a subsyntax in their third field: the text between the opening
// tag and the closing tag pattern is tokenized with the "JavaScript" or "CSS" definition,
// while the tags themselves get the "function" type. The remaining rules are the plain
// markup rules.
add( { "HTML",
{ "%.html?$" },
{
{ { "<%s*[sS][cC][rR][iI][pP][tT]%s+[tT][yY][pP][eE]%s*=%s*['\"]%a+/"
"[jJ][aA][vV][aA][sS][cC][rR][iI][pP][tT]['\"]%s*>",
"<%s*/[sS][cC][rR][iI][pP][tT]>" },
"function",
"JavaScript" },
{ { "<%s*[sS][cC][rR][iI][pP][tT]%s*>", "<%s*/%s*[sS][cC][rR][iI][pP][tT]>" },
"function",
"JavaScript" },
{ { "<%s*[sS][tT][yY][lL][eE][^>]*>", "<%s*/%s*[sS][tT][yY][lL][eE]%s*>" },
"function",
"CSS" },
{ { "<!%-%-", "%-%->" }, "comment" },
{ { "%f[^>][^<]", "%f[<]" }, "normal" },
{ { "\"", "\"", "\\" }, "string" },
{ { "'", "'", "\\" }, "string" },
{ { "0x[%da-fA-F]+" }, "number" },
{ { "-?%d+[%d%.]*f?" }, "number" },
{ { "-?%.?%d+f?" }, "number" },
{ { "%f[^<]![%a_][%w_]*" }, "keyword2" },
{ { "%f[^<][%a_][%w_]*" }, "function" },
{ { "%f[^<]/[%a_][%w_]*" }, "function" },
{ { "[%a_][%w_]*" }, "keyword" },
{ { "[/<>=]" }, "operator" },
},
{},
"",
// NOTE(review): last argument is the `headers` list of the SyntaxDefinition constructor;
// presumably matched against the beginning of a file to detect HTML - confirm.
{ "<html", "<![Dd][Oo][Cc][Tt][Yy][Pp][Ee]%s[Hh][Tt][Mm][Ll]>" } } );
// CSS
add( { "CSS",
@@ -55,9 +87,9 @@ SyntaxDefinitionManager::SyntaxDefinitionManager() {
{ { "-?%d+[%d%.]*deg" }, "number" },
{ { "-?%d+[%d%.]*" }, "number" },
{ { "[%a_][%w_]*" }, "symbol" },
// Id and class selectors may start with a letter, '_' or '-'. Inside the set the %a class goes
// first and '-' last: written as "[_-%a]" the '-' forms a range between '_' and '%', and the
// interaction between ranges and classes is undefined in Lua patterns, so letters would not
// be matched reliably.
{ { "#[%a][%w_-]*" }, "keyword2" },
{ { "#[%a_-][%w_-]*" }, "keyword2" },
{ { "@[%a][%w_-]*" }, "keyword2" },
{ { "%.[%a][%w_-]*" }, "keyword2" },
{ { "%.[%a_-][%w_-]*" }, "keyword2" },
{ { "[{}:]" }, "operator" },
} } )
.addSymbols( UIWidgetCreator::getWidgetNames(), "keyword2" );
@@ -67,6 +99,35 @@ SyntaxDefinitionManager::SyntaxDefinitionManager() {
{ "%.md$", "%.markdown$" },
{
{ { "\\." }, "normal" },
// Fenced code blocks: after an opening "```<language>" the text is tokenized with the
// definition named in the third field until the closing "```". Rules are tried in order, so
// a fence name that is a prefix of another one (C, D, Java) has to come after the longer
// names. The third field must match the registered language name of the target definition.
{ { "```[Xx][Mm][Ll]", "```" }, "function", "XML" },
{ { "```[Hh][Tt][Mm][Ll]", "```" }, "function", "HTML" },
// '+' is a quantifier in these patterns, so the literal "++" is escaped.
{ { "```[Cc]%+%+", "```" }, "function", "C++" },
{ { "```[Cc][Pp][Pp]", "```" }, "function", "C++" },
{ { "```[Cc]%#", "```" }, "function", "C#" },
{ { "```[Cc][Ss][Ss]", "```" }, "function", "CSS" },
{ { "```[Cc]", "```" }, "function", "C" },
// Dart must be tested before the single-letter D rule, which would otherwise shadow it.
{ { "```[Dd]art", "```" }, "function", "Dart" },
{ { "```[Dd]", "```" }, "function", "D" },
{ { "```[Ll]ua", "```" }, "function", "Lua" },
{ { "```[Jj]ava[Ss]cript", "```" }, "function", "JavaScript" },
{ { "```[Tt]ype[Ss]cript", "```" }, "function", "TypeScript" },
{ { "```[Pp]ython", "```" }, "function", "Python" },
{ { "```[Bb]ash", "```" }, "function", "Bash" },
{ { "```[Pp][Hh][Pp]", "```" }, "function", "PHP" },
{ { "```[Ss][Qq][Ll]", "```" }, "function", "SQL" },
{ { "```[Gg][Ll][Ss][Ll]", "```" }, "function", "GLSL" },
{ { "```[Ii][Nn][Ii]", "```" }, "function", "Config File" },
{ { "```[Mm]akefile", "```" }, "function", "Makefile" },
{ { "```[Gg][Oo]", "```" }, "function", "Go" },
{ { "```[Rr]ust", "```" }, "function", "Rust" },
{ { "```[Gg][Dd][Ss]cript", "```" }, "function", "GDScript" },
{ { "```[Jj]ava", "```" }, "function", "Java" },
{ { "```[Ss]wift", "```" }, "function", "Swift" },
{ { "```[Oo]bjective[Cc]", "```" }, "function", "Objective-C" },
{ { "```[Yy][Aa][Mm][Ll]", "```" }, "function", "YAML" },
{ { "```[Kk]otlin", "```" }, "function", "Kotlin" },
{ { "```[Ss]olidity", "```" }, "function", "Solidity" },
{ { "```[Hh]askell", "```" }, "function", "Haskell" },
{ { "<!%-%-", "%-%->" }, "comment" },
{ { "```", "```" }, "string" },
{ { "``", "``" }, "string" },
@@ -2261,6 +2322,15 @@ SyntaxDefinitionManager::getStyleByLanguageName( const std::string& name ) const
return mStyles[0];
}
// Finds the registered syntax definition whose language id equals `id`.
// When no definition carries that id the first registered definition is returned instead.
const SyntaxDefinition&
SyntaxDefinitionManager::getStyleByLanguageId( const String::HashType& id ) const {
	const SyntaxDefinition* match = nullptr;
	for ( const auto& definition : mStyles ) {
		if ( definition.getLanguageId() == id ) {
			match = &definition;
			break;
		}
	}
	return nullptr != match ? *match : mStyles[0];
}
// Mutable variant of getStyleByLanguageName(): forwards to the const lookup and casts away
// the constness of the returned reference so the caller can modify the definition.
SyntaxDefinition& SyntaxDefinitionManager::getStyleByLanguageNameRef( const std::string& name ) {
return const_cast<SyntaxDefinition&>( getStyleByLanguageName( name ) );
}

View File

@@ -25,11 +25,11 @@ void SyntaxHighlighter::invalidate( Int64 lineIndex ) {
mMaxWantedLine = eemin<Int64>( mMaxWantedLine, (Int64)mDoc->linesCount() - 1 );
}
TokenizedLine SyntaxHighlighter::tokenizeLine( const size_t& line, const int& state ) {
TokenizedLine SyntaxHighlighter::tokenizeLine( const size_t& line, const Uint64& state ) {
TokenizedLine tokenizedLine;
tokenizedLine.initState = state;
tokenizedLine.hash = mDoc->line( line ).getHash();
std::pair<std::vector<SyntaxToken>, int> res = SyntaxTokenizer::tokenize(
std::pair<std::vector<SyntaxToken>, Uint64> res = SyntaxTokenizer::tokenize(
mDoc->getSyntaxDefinition(), mDoc->line( line ).toUtf8(), state );
tokenizedLine.tokens = std::move( res.first );
tokenizedLine.state = std::move( res.second );
@@ -70,7 +70,7 @@ bool SyntaxHighlighter::updateDirty( int visibleLinesCount ) {
Int64 max = eemax( 0LL, eemin( mFirstInvalidLine + visibleLinesCount, mMaxWantedLine ) );
for ( Int64 index = mFirstInvalidLine; index <= max; index++ ) {
int state = SYNTAX_TOKENIZER_STATE_NONE;
Uint64 state = SYNTAX_TOKENIZER_STATE_NONE;
if ( index > 0 ) {
auto prevIt = mLines.find( index - 1 );
if ( prevIt != mLines.end() ) {

View File

@@ -10,19 +10,11 @@ namespace EE { namespace UI { namespace Doc {
// tokenizer. This allows eepp to support the same color schemes and syntax definitions from
// lite. Making much easier to implement a complete code editor.
/*static bool allSpaces( const std::string& str ) {
for ( auto& chr : str )
if ( ' ' != chr )
return false;
return true;
}*/
#define MAX_TOKEN_SIZE ( 512 )
static void pushToken( std::vector<SyntaxToken>& tokens, const std::string& type,
const std::string& text ) {
if ( !tokens.empty() && ( tokens[tokens.size() - 1].type == type /*||
allSpaces( tokens[tokens.size() - 1].text )*/ ) ) {
if ( !tokens.empty() && ( tokens[tokens.size() - 1].type == type ) ) {
tokens[tokens.size() - 1].type = type;
tokens[tokens.size() - 1].text += text;
} else {
@@ -69,10 +61,45 @@ std::pair<int, int> findNonEscaped( const std::string& text, const std::string&
}
}
std::pair<std::vector<SyntaxToken>, int> SyntaxTokenizer::tokenize( const SyntaxDefinition& syntax,
const std::string& text,
const int& state,
const size_t& startIndex ) {
// Decoded view of a packed tokenizer state.
struct SyntaxState {
// Definition whose patterns are currently applied: the document syntax or a subsyntax.
const SyntaxDefinition* currentSyntax{ nullptr };
// Pattern of the parent syntax that opened the current subsyntax; nullptr outside of one.
const SyntaxPattern* subsyntaxInfo{ nullptr };
// 1-based index into currentSyntax's patterns of the region pattern that is still open, or
// SYNTAX_TOKENIZER_STATE_NONE.
Uint64 currentPatternIdx{ 0 };
// Subsyntax nesting depth; also selects which byte of the packed state belongs to this level.
Uint64 currentLevel{ 0 };
};
// Decodes a packed tokenizer state into the definition, parent subsyntax pattern, open pattern
// index and nesting level it describes.
//
// The state stores one byte per nesting level, lowest byte first. Each byte is a 1-based index
// into the patterns of the definition active at that level, SYNTAX_TOKENIZER_STATE_NONE
// meaning that nothing is open there. A byte that points to a pattern with a non-empty
// `syntax` switches to that language for the next level; any other non-zero byte is the
// region pattern currently open at that level.
//
// @param syntax Definition of the document (level 0).
// @param state Packed state as returned by SyntaxTokenizer::tokenize.
// @return The decoded state. When `state` does not encode a subsyntax the result refers to
// `syntax` with `state` itself as the open pattern index.
SyntaxState retrieveSyntaxState( const SyntaxDefinition& syntax, const Uint64& state ) {
	SyntaxState syntaxState{ &syntax, nullptr, state, 0 };

	// Indices are 1-based, so the last pattern is addressed by an index equal to size():
	// the bounds checks below use <= before reading element [index - 1].
	if ( state > 0 &&
		 ( state > 255 ||
		   ( state <= syntaxState.currentSyntax->getPatterns().size() &&
			 !syntaxState.currentSyntax->getPatterns()[state - 1].syntax.empty() ) ) ) {
		for ( size_t i = 0; i <= 2; ++i ) {
			Uint64 target = ( state >> ( 8 * i ) ) & 0xFF;

			if ( target != SYNTAX_TOKENIZER_STATE_NONE ) {
				if ( target <= syntaxState.currentSyntax->getPatterns().size() &&
					 !syntaxState.currentSyntax->getPatterns()[target - 1].syntax.empty() ) {
					// This level opened a subsyntax: descend into the named language.
					syntaxState.subsyntaxInfo =
						&syntaxState.currentSyntax->getPatterns()[target - 1];
					syntaxState.currentSyntax =
						&SyntaxDefinitionManager::instance()->getStyleByLanguageName(
							syntaxState.subsyntaxInfo->syntax );
					syntaxState.currentPatternIdx = SYNTAX_TOKENIZER_STATE_NONE;
					syntaxState.currentLevel++;
				} else {
					// Plain region pattern left open at this level.
					syntaxState.currentPatternIdx = target;
				}
			} else {
				// Nothing stored for this level, so there are no deeper levels either.
				break;
			}
		}
	}

	return syntaxState;
}
std::pair<std::vector<SyntaxToken>, Uint64>
SyntaxTokenizer::tokenize( const SyntaxDefinition& syntax, const std::string& text,
const Uint64& state, const size_t& startIndex ) {
std::vector<SyntaxToken> tokens;
if ( syntax.getPatterns().empty() ) {
pushToken( tokens, "normal", text );
@@ -81,28 +108,89 @@ std::pair<std::vector<SyntaxToken>, int> SyntaxTokenizer::tokenize( const Syntax
size_t i = startIndex;
int retState = state;
SyntaxState curState = retrieveSyntaxState( syntax, state );
auto setSubsyntaxPatternIdx = [&curState, &retState]( const Uint64& patternIndex ) {
curState.currentPatternIdx = patternIndex;
retState &= ~( 0xFF << ( curState.currentLevel * 8 ) );
retState |= ( patternIndex << ( curState.currentLevel * 8 ) );
};
auto pushSubsyntax = [&setSubsyntaxPatternIdx, &curState](
const SyntaxPattern& enteringSubsyntax, const Uint64& patternIndex ) {
setSubsyntaxPatternIdx( patternIndex );
curState.currentLevel++;
curState.subsyntaxInfo = &enteringSubsyntax;
curState.currentSyntax = &SyntaxDefinitionManager::instance()->getStyleByLanguageName(
curState.subsyntaxInfo->syntax );
setSubsyntaxPatternIdx( SYNTAX_TOKENIZER_STATE_NONE );
};
auto popSubsyntax = [&setSubsyntaxPatternIdx, &curState, &syntax, &retState]() {
setSubsyntaxPatternIdx( SYNTAX_TOKENIZER_STATE_NONE );
curState.currentLevel--;
setSubsyntaxPatternIdx( SYNTAX_TOKENIZER_STATE_NONE );
curState = retrieveSyntaxState( syntax, retState );
};
while ( i < text.size() ) {
if ( retState != SYNTAX_TOKENIZER_STATE_NONE ) {
const SyntaxPattern& pattern = syntax.getPatterns()[retState];
if ( curState.currentPatternIdx != SYNTAX_TOKENIZER_STATE_NONE ) {
const SyntaxPattern& pattern =
curState.currentSyntax->getPatterns()[curState.currentPatternIdx - 1];
std::pair<int, int> range =
findNonEscaped( text, pattern.patterns[1], i,
pattern.patterns.size() >= 3 ? pattern.patterns[2] : "" );
if ( range.first != -1 ) {
pushToken( tokens, pattern.type, text.substr( i, range.second - i ) );
retState = SYNTAX_TOKENIZER_STATE_NONE;
i = range.second;
} else {
pushToken( tokens, pattern.type, text.substr( i ) );
break;
bool skip = false;
if ( curState.subsyntaxInfo != nullptr ) {
std::pair<int, int> rangeSubsyntax =
findNonEscaped( text, curState.subsyntaxInfo->patterns[1], i,
curState.subsyntaxInfo->patterns.size() >= 3
? curState.subsyntaxInfo->patterns[2]
: "" );
if ( rangeSubsyntax.first != -1 &&
( range.first == -1 || rangeSubsyntax.first < range.first ) ) {
pushToken( tokens, curState.subsyntaxInfo->type,
text.substr( i, rangeSubsyntax.second - i ) );
popSubsyntax();
i = rangeSubsyntax.second;
skip = true;
}
}
if ( !skip ) {
if ( range.first != -1 ) {
pushToken( tokens, pattern.type, text.substr( i, range.second - i ) );
setSubsyntaxPatternIdx( SYNTAX_TOKENIZER_STATE_NONE );
i = range.second;
} else {
pushToken( tokens, pattern.type, text.substr( i ) );
break;
}
}
}
if ( curState.subsyntaxInfo != nullptr ) {
std::pair<int, int> rangeSubsyntax = findNonEscaped(
text, "^" + curState.subsyntaxInfo->patterns[1], i,
curState.subsyntaxInfo->patterns.size() >= 3 ? curState.subsyntaxInfo->patterns[2]
: "" );
if ( rangeSubsyntax.first != -1 ) {
pushToken( tokens, curState.subsyntaxInfo->type,
text.substr( i, rangeSubsyntax.second - i ) );
popSubsyntax();
i = rangeSubsyntax.second;
}
}
bool matched = false;
for ( size_t patternIndex = 0; patternIndex < syntax.getPatterns().size();
for ( size_t patternIndex = 0; patternIndex < curState.currentSyntax->getPatterns().size();
patternIndex++ ) {
const SyntaxPattern& pattern = syntax.getPatterns()[patternIndex];
const SyntaxPattern& pattern = curState.currentSyntax->getPatterns()[patternIndex];
if ( i != 0 && pattern.patterns[0][0] == '^' )
continue;
const std::string& patternStr(
@@ -114,10 +202,12 @@ std::pair<std::vector<SyntaxToken>, int> SyntaxTokenizer::tokenize( const Syntax
text[i - 1] == pattern.patterns[2][0] )
continue;
std::string patternText( text.substr( start, end - start ) );
std::string type = syntax.getSymbol( patternText );
std::string type = curState.currentSyntax->getSymbol( patternText );
pushToken( tokens, type.empty() ? pattern.type : type, patternText );
if ( pattern.patterns.size() > 1 ) {
retState = patternIndex;
if ( !pattern.syntax.empty() ) {
pushSubsyntax( pattern, patternIndex + 1 );
} else if ( pattern.patterns.size() > 1 ) {
setSubsyntaxPatternIdx( patternIndex + 1 );
}
i = end;
matched = true;