From f8d466de22becd2bb4fbd356a0983cec0e704025 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mart=C3=ADn=20Lucas=20Golini?= Date: Fri, 10 May 2024 02:06:15 -0300 Subject: [PATCH] Text encodings WIP. --- include/eepp/ui/doc/textformat.hpp | 1 + include/eepp/window/platformhelper.hpp | 5 ++ src/eepp/ui/doc/textdocument.cpp | 29 ++++++- src/eepp/ui/doc/textformat.cpp | 82 ++++++++++++++++++- .../backend/SDL2/platformhelpersdl2.cpp | 34 +++++--- .../backend/SDL2/platformhelpersdl2.hpp | 4 + 6 files changed, 135 insertions(+), 20 deletions(-) diff --git a/include/eepp/ui/doc/textformat.hpp b/include/eepp/ui/doc/textformat.hpp index d80bd370b..729297912 100644 --- a/include/eepp/ui/doc/textformat.hpp +++ b/include/eepp/ui/doc/textformat.hpp @@ -17,6 +17,7 @@ class EE_API TextFormat { UTF16LE = String::hash( "UTF-16 LE" ), UTF16BE = String::hash( "UTF-16 BE" ), Latin1 = String::hash( "ISO-8859-1" ), + Shift_JIS = String::hash( "Shift_JIS" ), }; enum class LineEnding { LF, CRLF, CR }; diff --git a/include/eepp/window/platformhelper.hpp b/include/eepp/window/platformhelper.hpp index 44f094be1..2fdcf0d92 100644 --- a/include/eepp/window/platformhelper.hpp +++ b/include/eepp/window/platformhelper.hpp @@ -15,6 +15,11 @@ class EE_API PlatformHelper { */ virtual bool openURL( const std::string& url ) = 0; + virtual char* iconv( const char* tocode, const char* fromcode, const char* inbuf, + size_t inbytesleft ) = 0; + + virtual void iconvFree( char* buf ) = 0; + #if EE_PLATFORM == EE_PLATFORM_ANDROID /** @return The Activity object for the application */ virtual void* getActivity() = 0; diff --git a/src/eepp/ui/doc/textdocument.cpp b/src/eepp/ui/doc/textdocument.cpp index d6b014b6f..596749bc3 100644 --- a/src/eepp/ui/doc/textdocument.cpp +++ b/src/eepp/ui/doc/textdocument.cpp @@ -1,5 +1,4 @@ -#include -#include +#include #include #include #include @@ -12,6 +11,7 @@ #include #include #include +#include #include using namespace std::literals; @@ -104,6 +104,17 @@ void TextDocument::resetCursor() { notifySelectionChanged(); } +String shiftJISToUTF32( const std::string_view& shiftJISString ) { + String string; + auto* ret = Window::Engine::instance()->getPlatformHelper()->iconv( + "UTF-32LE", "SHIFT-JIS", shiftJISString.data(), shiftJISString.size() ); + if ( ret ) { + string = String( reinterpret_cast( ret ) ); + Window::Engine::instance()->getPlatformHelper()->iconvFree( ret ); + } + return string; +} + static constexpr int codepointSize( TextFormat::Encoding enc ) { switch ( enc ) { case TextFormat::Encoding::UTF16LE: @@ -167,7 +178,9 @@ static String ptrGetLine( char* data, const size_t& size, size_t& position, position++; } - if ( enc == TextFormat::Encoding::Latin1 ) + if ( enc == TextFormat::Encoding::Shift_JIS ) + return shiftJISToUTF32( std::string_view{ data, position } ); + else if ( enc == TextFormat::Encoding::Latin1 ) return String::fromLatin1( data, position ); return String( data, position ); @@ -687,6 +700,7 @@ bool TextDocument::save( IOStream& stream, bool keepUndoRedoStatus ) { MD5::update( md5Ctx, bom, sizeof( bom ) ); break; } + case TextFormat::Encoding::Shift_JIS: case TextFormat::Encoding::Latin1: break; } @@ -764,6 +778,15 @@ bool TextDocument::save( IOStream& stream, bool keepUndoRedoStatus ) { MD5::update( md5Ctx, latin1.data(), latin1.size() ); break; } + case TextFormat::Encoding::Shift_JIS: { + auto* ret = Window::Engine::instance()->getPlatformHelper()->iconv( + "SHIFT-JIS", "UTF-8", text.c_str(), text.size() ); + auto len = strlen( ret ); + stream.write( ret, len ); + MD5::update( md5Ctx, ret, len ); + Window::Engine::instance()->getPlatformHelper()->iconvFree( ret ); + break; + } case TextFormat::Encoding::UTF8: { stream.write( text.c_str(), text.size() ); MD5::update( md5Ctx, text.data(), text.size() ); diff --git a/src/eepp/ui/doc/textformat.cpp b/src/eepp/ui/doc/textformat.cpp index eb7eefcb6..f75be661a 100644 --- a/src/eepp/ui/doc/textformat.cpp +++ b/src/eepp/ui/doc/textformat.cpp @@ -91,6 +91,47 @@ template struct UTF16 { using UTF16_LE = UTF16; using UTF16_BE = UTF16; +//------------------------------------------------------------------- +// Shift JIS +//------------------------------------------------------------------- +struct ShiftJIS { + static inline Uint16 getUnit( const char* src ) { + return Uint8( src[0] ) | ( Uint16( Uint8( src[1] ) ) << 8 ); + } + + static inline TextDecodeResult decodePoint( std::string_view view ) { + // Shift JIS ranges for single-byte and double-byte characters + static constexpr std::pair firstByteRange1( 0x81, 0x9F ); + static constexpr std::pair firstByteRange2( 0xE0, 0xEF ); + static constexpr std::pair secondByteRange1( 0x40, 0x7E ); + static constexpr std::pair secondByteRange2( 0x80, 0xFC ); + + if ( view.size() == 0 ) + return {}; + + Uint8 first = view[0]; + if ( first < 0x7F ) + return { first, TextDecodeResult::Status::Valid, 1 }; + + if ( view.size() < 2 && + ( ( first >= secondByteRange1.first && first <= secondByteRange1.second ) || + ( first >= secondByteRange2.first && first <= secondByteRange2.second ) ) ) { + return { first, TextDecodeResult::Status::Valid, 1 }; + } + + Uint8 second = view[1]; + + if ( ( ( first >= firstByteRange1.first && first <= firstByteRange1.second ) || + ( first >= firstByteRange2.first && first <= firstByteRange2.second ) ) && + ( ( second >= secondByteRange1.first && second <= secondByteRange1.second ) || + ( second >= secondByteRange2.first && second <= secondByteRange2.second ) ) ) { + return { getUnit( view.data() ), TextDecodeResult::Status::Valid, 2 }; + } + + return { first, TextDecodeResult::Status::Invalid, 1 }; + } +}; + //------------------------------------------------------------------- // UTF8 //------------------------------------------------------------------- @@ -193,6 +234,9 @@ template <> struct TextEncoding::Wrapper { template <> struct TextEncoding::Wrapper { static TextEncoding Instance; }; +template <> struct TextEncoding::Wrapper { + static TextEncoding Instance; +}; //------------------------------------------------------------------- // TextEncoding (indirect through function vectors) @@ -217,6 +261,11 @@ TextEncoding TextEncoding::Wrapper::Instance = { 2, }; +TextEncoding TextEncoding::Wrapper::Instance = { + &ShiftJIS::decodePoint, + 1, +}; + const TextEncoding* encodingFromEnum( TextFormat::Encoding enc ) { switch ( enc ) { default: @@ -243,7 +292,10 @@ struct TextFileStats { Uint32 numPlainAscii = 0; // includes whitespace, excludes control characters < 32 Uint32 numWhitespace = 0; Uint32 numExtended = 0; + Uint32 num16bytes = 0; float ooNumPoints = 0.f; + float score = 0.f; + bool count16b{ false }; Uint32 numInvalidPoints() const { return numPoints - numValidPoints; } @@ -256,11 +308,15 @@ struct TextFileStats { } } - float getScore() const { - return ( 2.5f * numWhitespace + numPlainAscii - 100.f * numInvalidPoints() - - 50.f * numControl + 5.f * numExtended ) * - ooNumPoints; + void calcScore() { + if ( !score ) { + score = ( 2.5f * numWhitespace + numPlainAscii - 100.f * numInvalidPoints() - + 50.f * numControl + 5.f * numExtended + 2.5f * num16bytes ) * + ooNumPoints; + } } + + float getScore() const { return score; } }; static Uint32 scanTextFile( TextFileStats& stats, IOStream& ins, const TextEncoding* encoding, @@ -314,6 +370,8 @@ static Uint32 scanTextFile( TextFileStats& stats, IOStream& ins, const TextEncod } } else if ( decoded.point >= 65536 ) { stats.numExtended++; + } else if ( stats.count16b && decoded.point >= 0x8140 ) { + stats.num16bytes++; } } prevWasCR = ( decoded.point == '\r' ); @@ -321,6 +379,7 @@ static Uint32 scanTextFile( TextFileStats& stats, IOStream& ins, const TextEncod if ( stats.numPoints > 0 ) { stats.ooNumPoints = 1.f / stats.numPoints; } + stats.calcScore(); return numBytes; } @@ -371,6 +430,16 @@ TextFormat guessFileEncoding( IOStream& ins ) { encoding = TextFormat::Encoding::UTF16BE; } + TextFileStats statsShiftJIS; + statsShiftJIS.count16b = true; + scanTextFile( statsShiftJIS, ins, TextEncoding::get(), NumBytesForAutodetect ); + ins.seek( 0 ); + + if ( statsShiftJIS.getScore() > stats->getScore() ) { + stats = &statsShiftJIS; + encoding = TextFormat::Encoding::Shift_JIS; + } + // Choose between the UTF16 and 8-bit encoding: if ( stats8.getScore() >= stats->getScore() ) { stats = &stats8; @@ -448,6 +517,8 @@ TextFormat::Encoding TextFormat::encodingFromString( const std::string_view& str return TextFormat::Encoding::UTF16BE; case static_cast( TextFormat::Encoding::Latin1 ): return TextFormat::Encoding::Latin1; + case static_cast( TextFormat::Encoding::Shift_JIS ): + return TextFormat::Encoding::Shift_JIS; case static_cast( TextFormat::Encoding::UTF8 ): default: return TextFormat::Encoding::UTF8; @@ -462,6 +533,8 @@ std::string TextFormat::encodingToString( TextFormat::Encoding enc ) { return "UTF-16 BE"; case TextFormat::Encoding::Latin1: return "ISO-8859-1"; + case TextFormat::Encoding::Shift_JIS: + return "Shift_JIS"; case TextFormat::Encoding::UTF8: default: break; @@ -475,6 +548,7 @@ std::vector> TextFormat::encodings( encs.emplace_back( Encoding::UTF16BE, encodingToString( Encoding::UTF16BE ) ); encs.emplace_back( Encoding::UTF16LE, encodingToString( Encoding::UTF16LE ) ); encs.emplace_back( Encoding::Latin1, encodingToString( Encoding::Latin1 ) ); + encs.emplace_back( Encoding::Shift_JIS, encodingToString( Encoding::Shift_JIS ) ); return encs; } diff --git a/src/eepp/window/backend/SDL2/platformhelpersdl2.cpp b/src/eepp/window/backend/SDL2/platformhelpersdl2.cpp index a384bd41e..93e30437c 100644 --- a/src/eepp/window/backend/SDL2/platformhelpersdl2.cpp +++ b/src/eepp/window/backend/SDL2/platformhelpersdl2.cpp @@ -1,14 +1,13 @@ +#include #include #include -#include using namespace EE::System; #if EE_PLATFORM == EE_PLATFORM_EMSCRIPTEN #include -EM_JS(void, emscripten_open_url, (const char *msg), { - window.open(UTF8ToString(msg), 'blank'); -}); +EM_JS( void, emscripten_open_url, ( const char* msg ), + { window.open( UTF8ToString( msg ), 'blank' ); } ); #endif #if EE_PLATFORM == EE_PLATFORM_ANDROID @@ -21,18 +20,27 @@ PlatformHelperSDL2::PlatformHelperSDL2() {} bool PlatformHelperSDL2::openURL( const std::string& url ) { #if EE_PLATFORM == EE_PLATFORM_EMSCRIPTEN - emscripten_open_url(url.c_str()); + emscripten_open_url( url.c_str() ); return true; #else - #if SDL_VERSION_ATLEAST(2,0,14) - int res = SDL_OpenURL( url.c_str() ); - if ( res != 0 ) - Log::error( "PlatformHelperSDL2::openURL: Failed with error - %s", SDL_GetError() ); - return res == 0; - #else - return false; - #endif +#if SDL_VERSION_ATLEAST( 2, 0, 14 ) + int res = SDL_OpenURL( url.c_str() ); + if ( res != 0 ) + Log::error( "PlatformHelperSDL2::openURL: Failed with error - %s", SDL_GetError() ); + return res == 0; +#else + return false; #endif +#endif +} + +char* PlatformHelperSDL2::iconv( const char* tocode, const char* fromcode, const char* inbuf, + size_t inbytesleft ) { + return SDL_iconv_string( tocode, fromcode, inbuf, inbytesleft ); +} + +void PlatformHelperSDL2::iconvFree( char* buf ) { + SDL_free( buf ); } #if EE_PLATFORM == EE_PLATFORM_ANDROID diff --git a/src/eepp/window/backend/SDL2/platformhelpersdl2.hpp b/src/eepp/window/backend/SDL2/platformhelpersdl2.hpp index 72b06089d..5c070f903 100644 --- a/src/eepp/window/backend/SDL2/platformhelpersdl2.hpp +++ b/src/eepp/window/backend/SDL2/platformhelpersdl2.hpp @@ -12,6 +12,10 @@ class EE_API PlatformHelperSDL2 : public PlatformHelper { bool openURL( const std::string& url ); + char* iconv( const char* tocode, const char* fromcode, const char* inbuf, size_t inbytesleft ); + + void iconvFree( char* buf ); + #if EE_PLATFORM == EE_PLATFORM_ANDROID void* getActivity();