Text encodings WIP.

This commit is contained in:
Martín Lucas Golini
2024-05-10 02:06:15 -03:00
parent 6b8da507d3
commit f8d466de22
6 changed files with 135 additions and 20 deletions

View File

@@ -17,6 +17,7 @@ class EE_API TextFormat {
UTF16LE = String::hash( "UTF-16 LE" ),
UTF16BE = String::hash( "UTF-16 BE" ),
Latin1 = String::hash( "ISO-8859-1" ),
Shift_JIS = String::hash( "Shift_JIS" ),
};
/** Line-ending conventions: LF, CRLF or CR. */
enum class LineEnding { LF, CRLF, CR };

View File

@@ -15,6 +15,11 @@ class EE_API PlatformHelper {
*/
virtual bool openURL( const std::string& url ) = 0;
/** Converts a byte buffer from one character encoding to another.
 * @param tocode Name of the destination encoding (callers pass names such as "UTF-32LE")
 * @param fromcode Name of the source encoding (callers pass names such as "SHIFT-JIS")
 * @param inbuf Pointer to the bytes to convert
 * @param inbytesleft Number of bytes available in inbuf
 * @return The converted buffer; callers treat a null pointer as "conversion failed" and
 * hand any non-null result back to iconvFree() once they are done with it.
 */
virtual char* iconv( const char* tocode, const char* fromcode, const char* inbuf,
size_t inbytesleft ) = 0;
/** Releases a buffer previously returned by iconv().
 * @param buf Buffer to release
 */
virtual void iconvFree( char* buf ) = 0;
#if EE_PLATFORM == EE_PLATFORM_ANDROID
/** @return The Activity object for the application */
virtual void* getActivity() = 0;

View File

@@ -1,5 +1,4 @@
#include <cstdio>
#include <eepp/core/debug.hpp>
#include <eepp/core/debug.hpp>
#include <eepp/network/uri.hpp>
#include <eepp/system/filesystem.hpp>
#include <eepp/system/iostreamfile.hpp>
@@ -12,6 +11,7 @@
#include <eepp/ui/doc/syntaxdefinitionmanager.hpp>
#include <eepp/ui/doc/syntaxhighlighter.hpp>
#include <eepp/ui/doc/textdocument.hpp>
#include <eepp/window/engine.hpp>
#include <string>
using namespace std::literals;
@@ -104,6 +104,17 @@ void TextDocument::resetCursor() {
notifySelectionChanged();
}
/** Converts Shift JIS encoded bytes into a String through the platform iconv wrapper.
 * @param shiftJISString View over the Shift JIS encoded bytes
 * @return The converted string, or an empty String when the conversion returns null
 */
String shiftJISToUTF32( const std::string_view& shiftJISString ) {
	auto* platformHelper = Window::Engine::instance()->getPlatformHelper();
	char* converted = platformHelper->iconv( "UTF-32LE", "SHIFT-JIS", shiftJISString.data(),
											 shiftJISString.size() );
	// Nothing to release when the conversion produced no buffer.
	if ( !converted )
		return String();
	// The String is built from the converted buffer before that buffer is released.
	String result( reinterpret_cast<String::StringBaseType*>( converted ) );
	platformHelper->iconvFree( converted );
	return result;
}
static constexpr int codepointSize( TextFormat::Encoding enc ) {
switch ( enc ) {
case TextFormat::Encoding::UTF16LE:
@@ -167,7 +178,9 @@ static String ptrGetLine( char* data, const size_t& size, size_t& position,
position++;
}
if ( enc == TextFormat::Encoding::Latin1 )
if ( enc == TextFormat::Encoding::Shift_JIS )
return shiftJISToUTF32( std::string_view{ data, position } );
else if ( enc == TextFormat::Encoding::Latin1 )
return String::fromLatin1( data, position );
return String( data, position );
@@ -687,6 +700,7 @@ bool TextDocument::save( IOStream& stream, bool keepUndoRedoStatus ) {
MD5::update( md5Ctx, bom, sizeof( bom ) );
break;
}
case TextFormat::Encoding::Shift_JIS:
case TextFormat::Encoding::Latin1:
break;
}
@@ -764,6 +778,15 @@ bool TextDocument::save( IOStream& stream, bool keepUndoRedoStatus ) {
MD5::update( md5Ctx, latin1.data(), latin1.size() );
break;
}
case TextFormat::Encoding::Shift_JIS: {
	// Convert the UTF-8 text to Shift JIS through the platform iconv wrapper.
	auto* platformHelper = Window::Engine::instance()->getPlatformHelper();
	auto* ret = platformHelper->iconv( "SHIFT-JIS", "UTF-8", text.c_str(), text.size() );
	if ( ret ) {
		auto len = strlen( ret );
		stream.write( ret, len );
		MD5::update( md5Ctx, ret, len );
		platformHelper->iconvFree( ret );
	} else {
		// The conversion can fail and return null; passing that to strlen() would crash.
		// Write the unconverted UTF-8 bytes instead so the text is not lost and the
		// hash still matches what was written.
		stream.write( text.c_str(), text.size() );
		MD5::update( md5Ctx, text.data(), text.size() );
	}
	break;
}
case TextFormat::Encoding::UTF8: {
stream.write( text.c_str(), text.size() );
MD5::update( md5Ctx, text.data(), text.size() );

View File

@@ -91,6 +91,47 @@ template <bool BigEndian> struct UTF16 {
using UTF16_LE = UTF16<false>;
using UTF16_BE = UTF16<true>;
//-------------------------------------------------------------------
// Shift JIS
//-------------------------------------------------------------------
struct ShiftJIS {
static inline Uint16 getUnit( const char* src ) {
return Uint8( src[0] ) | ( Uint16( Uint8( src[1] ) ) << 8 );
}
static inline TextDecodeResult decodePoint( std::string_view view ) {
// Shift JIS ranges for single-byte and double-byte characters
static constexpr std::pair<unsigned char, unsigned char> firstByteRange1( 0x81, 0x9F );
static constexpr std::pair<unsigned char, unsigned char> firstByteRange2( 0xE0, 0xEF );
static constexpr std::pair<unsigned char, unsigned char> secondByteRange1( 0x40, 0x7E );
static constexpr std::pair<unsigned char, unsigned char> secondByteRange2( 0x80, 0xFC );
if ( view.size() == 0 )
return {};
Uint8 first = view[0];
if ( first < 0x7F )
return { first, TextDecodeResult::Status::Valid, 1 };
if ( view.size() < 2 &&
( ( first >= secondByteRange1.first && first <= secondByteRange1.second ) ||
( first >= secondByteRange2.first && first <= secondByteRange2.second ) ) ) {
return { first, TextDecodeResult::Status::Valid, 1 };
}
Uint8 second = view[1];
if ( ( ( first >= firstByteRange1.first && first <= firstByteRange1.second ) ||
( first >= firstByteRange2.first && first <= firstByteRange2.second ) ) &&
( ( second >= secondByteRange1.first && second <= secondByteRange1.second ) ||
( second >= secondByteRange2.first && second <= secondByteRange2.second ) ) ) {
return { getUnit( view.data() ), TextDecodeResult::Status::Valid, 2 };
}
return { first, TextDecodeResult::Status::Invalid, 1 };
}
};
//-------------------------------------------------------------------
// UTF8
//-------------------------------------------------------------------
@@ -193,6 +234,9 @@ template <> struct TextEncoding::Wrapper<UTF16_LE> {
// Specialization declaring the TextEncoding instance for UTF-16 big endian.
template <> struct TextEncoding::Wrapper<UTF16_BE> {
static TextEncoding Instance;
};
// Specialization declaring the TextEncoding instance for Shift JIS.
template <> struct TextEncoding::Wrapper<ShiftJIS> {
static TextEncoding Instance;
};
//-------------------------------------------------------------------
// TextEncoding (indirect through function vectors)
@@ -217,6 +261,11 @@ TextEncoding TextEncoding::Wrapper<UTF16_BE>::Instance = {
2,
};
// Shift JIS encoding instance: decodes through ShiftJIS::decodePoint.
// NOTE(review): the second field is 1 here and 2 for the UTF-16 instance; presumably
// the code unit size in bytes. Confirm against the TextEncoding definition.
TextEncoding TextEncoding::Wrapper<ShiftJIS>::Instance = {
&ShiftJIS::decodePoint,
1,
};
const TextEncoding* encodingFromEnum( TextFormat::Encoding enc ) {
switch ( enc ) {
default:
@@ -243,7 +292,10 @@ struct TextFileStats {
Uint32 numPlainAscii = 0; // includes whitespace, excludes control characters < 32
Uint32 numWhitespace = 0;
Uint32 numExtended = 0;
Uint32 num16bytes = 0;
float ooNumPoints = 0.f;
float score = 0.f;
bool count16b{ false };
Uint32 numInvalidPoints() const { return numPoints - numValidPoints; }
@@ -256,11 +308,15 @@ struct TextFileStats {
}
}
float getScore() const {
return ( 2.5f * numWhitespace + numPlainAscii - 100.f * numInvalidPoints() -
50.f * numControl + 5.f * numExtended ) *
ooNumPoints;
/** Computes the heuristic score from the gathered counters and stores it in `score`.
 * Does nothing when `score` already holds a non-zero value.
 */
void calcScore() {
	// A non-zero score is kept as is.
	if ( score )
		return;
	// Whitespace, plain ASCII, extended points and 16-bit points add to the score;
	// invalid points and control characters subtract from it.
	const float weighted = 2.5f * numWhitespace + numPlainAscii - 100.f * numInvalidPoints() -
						   50.f * numControl + 5.f * numExtended + 2.5f * num16bytes;
	score = weighted * ooNumPoints;
}
/** @return The value stored in `score`; it is 0 until calcScore() stores a result. */
float getScore() const { return score; }
};
static Uint32 scanTextFile( TextFileStats& stats, IOStream& ins, const TextEncoding* encoding,
@@ -314,6 +370,8 @@ static Uint32 scanTextFile( TextFileStats& stats, IOStream& ins, const TextEncod
}
} else if ( decoded.point >= 65536 ) {
stats.numExtended++;
} else if ( stats.count16b && decoded.point >= 0x8140 ) {
stats.num16bytes++;
}
}
prevWasCR = ( decoded.point == '\r' );
@@ -321,6 +379,7 @@ static Uint32 scanTextFile( TextFileStats& stats, IOStream& ins, const TextEncod
if ( stats.numPoints > 0 ) {
stats.ooNumPoints = 1.f / stats.numPoints;
}
stats.calcScore();
return numBytes;
}
@@ -371,6 +430,16 @@ TextFormat guessFileEncoding( IOStream& ins ) {
encoding = TextFormat::Encoding::UTF16BE;
}
TextFileStats statsShiftJIS;
statsShiftJIS.count16b = true;
scanTextFile( statsShiftJIS, ins, TextEncoding::get<ShiftJIS>(), NumBytesForAutodetect );
ins.seek( 0 );
if ( statsShiftJIS.getScore() > stats->getScore() ) {
stats = &statsShiftJIS;
encoding = TextFormat::Encoding::Shift_JIS;
}
// Choose between the UTF16 and 8-bit encoding:
if ( stats8.getScore() >= stats->getScore() ) {
stats = &stats8;
@@ -448,6 +517,8 @@ TextFormat::Encoding TextFormat::encodingFromString( const std::string_view& str
return TextFormat::Encoding::UTF16BE;
case static_cast<String::HashType>( TextFormat::Encoding::Latin1 ):
return TextFormat::Encoding::Latin1;
case static_cast<String::HashType>( TextFormat::Encoding::Shift_JIS ):
return TextFormat::Encoding::Shift_JIS;
case static_cast<String::HashType>( TextFormat::Encoding::UTF8 ):
default:
return TextFormat::Encoding::UTF8;
@@ -462,6 +533,8 @@ std::string TextFormat::encodingToString( TextFormat::Encoding enc ) {
return "UTF-16 BE";
case TextFormat::Encoding::Latin1:
return "ISO-8859-1";
case TextFormat::Encoding::Shift_JIS:
return "Shift_JIS";
case TextFormat::Encoding::UTF8:
default:
break;
@@ -475,6 +548,7 @@ std::vector<std::pair<TextFormat::Encoding, std::string>> TextFormat::encodings(
encs.emplace_back( Encoding::UTF16BE, encodingToString( Encoding::UTF16BE ) );
encs.emplace_back( Encoding::UTF16LE, encodingToString( Encoding::UTF16LE ) );
encs.emplace_back( Encoding::Latin1, encodingToString( Encoding::Latin1 ) );
encs.emplace_back( Encoding::Shift_JIS, encodingToString( Encoding::Shift_JIS ) );
return encs;
}

View File

@@ -1,14 +1,13 @@
#include <eepp/system/log.hpp>
#include <eepp/window/backend/SDL2/base.hpp>
#include <eepp/window/backend/SDL2/platformhelpersdl2.hpp>
#include <eepp/system/log.hpp>
using namespace EE::System;
#if EE_PLATFORM == EE_PLATFORM_EMSCRIPTEN
#include <emscripten.h>
EM_JS(void, emscripten_open_url, (const char *msg), {
window.open(UTF8ToString(msg), 'blank');
});
EM_JS( void, emscripten_open_url, ( const char* msg ),
{ window.open( UTF8ToString( msg ), 'blank' ); } );
#endif
#if EE_PLATFORM == EE_PLATFORM_ANDROID
@@ -21,18 +20,27 @@ PlatformHelperSDL2::PlatformHelperSDL2() {}
bool PlatformHelperSDL2::openURL( const std::string& url ) {
#if EE_PLATFORM == EE_PLATFORM_EMSCRIPTEN
emscripten_open_url(url.c_str());
emscripten_open_url( url.c_str() );
return true;
#else
#if SDL_VERSION_ATLEAST(2,0,14)
int res = SDL_OpenURL( url.c_str() );
if ( res != 0 )
Log::error( "PlatformHelperSDL2::openURL: Failed with error - %s", SDL_GetError() );
return res == 0;
#else
return false;
#endif
#if SDL_VERSION_ATLEAST( 2, 0, 14 )
int res = SDL_OpenURL( url.c_str() );
if ( res != 0 )
Log::error( "PlatformHelperSDL2::openURL: Failed with error - %s", SDL_GetError() );
return res == 0;
#else
return false;
#endif
#endif
}
/** Converts a buffer between encodings by forwarding to SDL_iconv_string().
 * @param tocode Name of the destination encoding
 * @param fromcode Name of the source encoding
 * @param inbuf Bytes to convert
 * @param inbytesleft Number of bytes in inbuf
 * @return Whatever SDL_iconv_string() returns; release it with iconvFree().
 */
char* PlatformHelperSDL2::iconv( const char* tocode, const char* fromcode, const char* inbuf,
								 size_t inbytesleft ) {
	char* converted = SDL_iconv_string( tocode, fromcode, inbuf, inbytesleft );
	return converted;
}
/** Releases a buffer with SDL_free(); meant for pointers returned by
 * PlatformHelperSDL2::iconv().
 * @param buf Buffer to release
 */
void PlatformHelperSDL2::iconvFree( char* buf ) {
SDL_free( buf );
}
#if EE_PLATFORM == EE_PLATFORM_ANDROID

View File

@@ -12,6 +12,10 @@ class EE_API PlatformHelperSDL2 : public PlatformHelper {
bool openURL( const std::string& url );
/** SDL2 implementation of PlatformHelper::iconv(). */
char* iconv( const char* tocode, const char* fromcode, const char* inbuf, size_t inbytesleft );
/** SDL2 implementation of PlatformHelper::iconvFree(). */
void iconvFree( char* buf );
#if EE_PLATFORM == EE_PLATFORM_ANDROID
void* getActivity();