250 lines
9.7 KiB
C++
250 lines
9.7 KiB
C++
#pragma once
|
||
#ifndef CATA_SRC_CATACHARSET_H
|
||
#define CATA_SRC_CATACHARSET_H
|
||
|
||
#include <cstddef>
#include <cstdint>
#include <iosfwd>
#include <iterator>
#include <string>
#include <vector>
|
||
|
||
// NOTE(review): the meaning of ANY_LENGTH is not evident from this header;
// confirm against its users before relying on the value.
constexpr int ANY_LENGTH = 5;
// U+0000, the NUL code point.
constexpr int NULL_UNICODE = 0x0000;
// U+0025, the '%' character.
constexpr int PERCENT_SIGN_UNICODE = 0x0025;
// U+FFFD, the Unicode REPLACEMENT CHARACTER.
// Presumably what decoders in this module report for undecodable input - TODO confirm.
constexpr int UNKNOWN_UNICODE = 0xFFFD;
|
||
|
||
class utf8_wrapper;
|
||
|
||
// Get a Unicode character from a UTF-8 string.
// Takes the read position and the remaining byte count through pointers.
// NOTE(review): utf8_view::iterator in this header treats *src / *srclen as
// updated to point past the decoded character after the call; the definition
// lives outside this header - confirm there.
uint32_t UTF8_getch( const char **src, int *srclen );

// Convenience overload: decodes the character at the start of @p str.
// Works on local copies of the data pointer and length, so @p str itself is
// never modified and only the decoded code point is reported back.
inline uint32_t UTF8_getch( const std::string &str )
{
    const char *utf8str = str.c_str();
    // The pointer overload takes an int byte count; make the narrowing explicit.
    int len = static_cast<int>( str.length() );
    return UTF8_getch( &utf8str, &len );
}
|
||
// convert cursorx value to byte position
|
||
int cursorx_to_position( const char *line, int cursorx, int *prevpos = nullptr, int maxlen = -1 );
|
||
int utf8_width( const char *s, bool ignore_tags = false );
|
||
int utf8_width( const std::string &str, bool ignore_tags = false );
|
||
int utf8_width( const utf8_wrapper &str, bool ignore_tags = false );
|
||
|
||
// Justification helpers returning a new string built from str and a target width.
// NOTE(review): padding/truncation rules, the unit of width, and how
// utf8_justify chooses a direction are not visible in this header - confirm
// against the implementation.
std::string left_justify( const std::string &str, int width, bool ignore_tags = false );
std::string right_justify( const std::string &str, int width, bool ignore_tags = false );
std::string utf8_justify( const std::string &str, int width, bool ignore_tags = false );
|
||
|
||
/**
|
||
* Center text inside whole line.
|
||
* @param text to be centered.
|
||
* @param start_pos printable position on line.
|
||
* @param end_pos printable position on line.
|
||
* @return First char position of centered text or start_pos if text is too big.
|
||
*/
|
||
int center_text_pos( const char *text, int start_pos, int end_pos );
|
||
int center_text_pos( const std::string &text, int start_pos, int end_pos );
|
||
int center_text_pos( const utf8_wrapper &text, int start_pos, int end_pos );
|
||
// Encode a single UTF-32 code point as a UTF-8 string.
std::string utf32_to_utf8( uint32_t ch );
// NOTE(review): the unit of `length` (bytes, code points or display cells)
// is not visible in this header - confirm against the implementation.
std::string utf8_truncate( const std::string &s, size_t length );

// Base64 encoding / decoding of a string.
std::string base64_encode( const std::string &str );
std::string base64_decode( const std::string &str );

// Conversions between UTF-8 std::string and std::wstring.
std::wstring utf8_to_wstr( const std::string &str );
std::string wstr_to_utf8( const std::wstring &wstr );

// NOTE(review): "native" presumably refers to the platform's narrow
// encoding - confirm against the implementation.
std::string wstr_to_native( const std::wstring &wstr );

// Conversions between UTF-8 std::string and UTF-32 std::u32string.
std::string utf32_to_utf8( const std::u32string &str );
std::u32string utf8_to_utf32( const std::string &str );
|
||
|
||
// Split the given string into displayed characters. Each element of the returned vector
// contains one 'regular' codepoint and all subsequent combining characters.
std::vector<std::string> utf8_display_split( const std::string & );
|
||
|
||
/**
 * UTF8-Wrapper over std::string.
 * It looks and feels like a std::string, but uses code point counts
 * as index, not bytes.
 * A multi-byte Unicode character might be represented
 * as 3 bytes in UTF8, this class will see these 3 bytes as 1 character.
 * It will never separate them. It will however split between code points,
 * which might be problematic when the string contains combination characters.
 * In this case use the *_display functions. They operate on the display width.
 * Code points with a zero width are considered to belong to the previous code
 * point and are not split from that.
 * Having a string like [letter0][letter1][combination-mark][letter2]
 * (assuming each letter has a width of 1) will return a display_width of 3.
 * substr_display(1, 2) returns [letter1][combination-mark][letter2],
 * substr_display(1, 1) returns [letter1][combination-mark]
 * substr_display(2, 1) returns [letter2]
 *
 * Note: functions use code points, not bytes, for counting/indexing!
 * Functions with the _display suffix use display width for counting/indexing!
 * Protected functions might behave differently.
 *
 * For function documentation see std::string, the functions here
 * mimic the behavior of the equally named std::string function.
 */
class utf8_wrapper
{
    public:
        // Creates an empty string: no code points, zero display width.
        utf8_wrapper() : _length( 0 ), _display_width( 0 ) { }
        explicit utf8_wrapper( const std::string &d );
        explicit utf8_wrapper( const char *d );

        void insert( size_t start, const utf8_wrapper &other );
        utf8_wrapper substr( size_t start, size_t length ) const;
        // Substring from code point index start up to the end of the string.
        utf8_wrapper substr( size_t start ) const {
            return substr( start, _length - start );
        }
        void erase( size_t start, size_t length );
        // Erases everything from code point index start up to the end of the string.
        void erase( size_t start ) {
            erase( start, _length - start );
        }
        void append( const utf8_wrapper &other );
        utf8_wrapper &replace_all( const utf8_wrapper &search, const utf8_wrapper &replace );
        /**
         * Returns a substring based on the display width, not the number of
         * code points (as the other substr function does).
         * @param start Start the returned substring with the character that is
         * at that position when this string would have been printed (rounded down).
         * E.g. a string "a´a´" (where a is a normal character, and ´ is a combination
         * code point) would be displayed as two cells: "áá".
         * substr_display(0,2) would return the whole string, substr_display(0,1)
         * would return the first two code points, substr_display(1,1) would return the
         * last two code points.
         * @param length Display length of the returned string, the returned string can
         * have a shorter display length (especially if the last character is a multi-cell
         * character and including it would exceed the length parameter).
         */
        utf8_wrapper substr_display( size_t start, size_t length ) const;
        // Substring from display cell start up to the end of the string.
        utf8_wrapper substr_display( size_t start ) const {
            // Both arguments of the two-argument overload are display cells, so the
            // remaining length has to come from _display_width. Using the code point
            // count here would cut strings containing multi-cell characters short.
            // The guard keeps the unsigned subtraction from wrapping around.
            const size_t remaining = start < _display_width ? _display_width - start : 0;
            return substr_display( start, remaining );
        }

        // Replaces the content with d by constructing a fresh wrapper from it,
        // so length and display width always match the new data.
        utf8_wrapper &operator=( const std::string &d ) {
            *this = utf8_wrapper( d );
            return *this;
        }
        // The underlying UTF-8 encoded bytes.
        const std::string &str() const {
            return _data;
        }

        // Returns Unicode character at position start
        uint32_t at( size_t start ) const;

        // Returns number of Unicode characters
        size_t size() const {
            return _length;
        }
        size_t length() const {
            return size();
        }
        bool empty() const {
            return size() == 0;
        }
        // Display size might be different from length, as some characters
        // are displayed as 2 chars in a terminal
        size_t display_width() const {
            return _display_width;
        }
        const char *c_str() const {
            return _data.c_str();
        }
        /**
         * Return a substring at most maxlength width (display width).
         * If the string had to be shortened, an ellipsis (...) is added. The
         * string with the ellipsis will be exactly maxlength displayed
         * characters.
         */
        std::string shorten( size_t maxlength ) const;
    protected:
        // UTF-8 encoded bytes.
        std::string _data;
        // Number of code points in _data.
        size_t _length;
        // Display width of _data.
        size_t _display_width;
        // Byte offset into @ref _data for codepoint at index start.
        // bstart is an initial offset (in bytes!). The function operates on
        // _data.substr(bstart), it ignores everything before bstart.
        size_t byte_start( size_t bstart, size_t start ) const;
        // Byte offset into @ref _data for the codepoint starting at displayed cell start,
        // if the first character occupies two cells, then byte_start_display(2)
        // would return the byte offset of the second codepoint
        // byte_start_display(1) and byte_start_display(0) would return 0
        size_t byte_start_display( size_t bstart, size_t start ) const;
        // Same as @ref substr, but with a byte index as start
        utf8_wrapper substr_byte( size_t bytestart, size_t length, bool use_display_width ) const;
        void init_utf8_wrapper();
};
|
||
|
||
/* A range that iterates through Unicode code points in a UTF-8 encoded string
|
||
* without incurring dynamic memory allocation.
|
||
*
|
||
* Example:
|
||
* for( char32_t c : utf8_view( "..." ) ) {
|
||
* do_something_with( c );
|
||
* }
|
||
*/
|
||
class utf8_view
|
||
{
|
||
private:
|
||
const char *buffer;
|
||
std::size_t length;
|
||
|
||
class iterator
|
||
{
|
||
public:
|
||
using iterator_category = std::input_iterator_tag;
|
||
using difference_type = std::ptrdiff_t;
|
||
using value_type = char32_t;
|
||
using pointer = value_type*;
|
||
using reference = value_type&;
|
||
private:
|
||
const char *ptr;
|
||
const char *next_ptr;
|
||
int remaining;
|
||
int next_remaining;
|
||
char32_t unicode;
|
||
|
||
void decode() {
|
||
if( remaining > 0 ) {
|
||
unicode = UTF8_getch( &next_ptr, &next_remaining );
|
||
} else {
|
||
next_ptr = nullptr;
|
||
next_remaining = 0;
|
||
unicode = 0;
|
||
}
|
||
}
|
||
public:
|
||
explicit iterator( const char *ptr, int remaining ) : ptr( ptr ), remaining( remaining ) {
|
||
next_ptr = ptr;
|
||
next_remaining = remaining;
|
||
decode();
|
||
}
|
||
bool operator != ( const iterator &rhs ) const noexcept {
|
||
return this->ptr != rhs.ptr;
|
||
}
|
||
const iterator &operator++() noexcept {
|
||
ptr = next_ptr;
|
||
remaining = next_remaining;
|
||
decode();
|
||
return *this;
|
||
}
|
||
char32_t operator*() const noexcept {
|
||
return unicode;
|
||
}
|
||
};
|
||
|
||
public:
|
||
explicit utf8_view( const std::string &str ) : buffer( str.c_str() ), length( str.length() ) {}
|
||
|
||
iterator begin() const noexcept {
|
||
return iterator( buffer, length );
|
||
}
|
||
|
||
iterator end() const noexcept {
|
||
return iterator( buffer + length, 0 );
|
||
}
|
||
};
|
||
|
||
#endif // CATA_SRC_CATACHARSET_H
|