Cataclysm-DDA/src/catacharset.h

250 lines
9.7 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#pragma once
#ifndef CATA_SRC_CATACHARSET_H
#define CATA_SRC_CATACHARSET_H
#include <cstddef>
#include <cstdint>
#include <iosfwd>
#include <string>
#include <vector>
constexpr int ANY_LENGTH = 5;
constexpr int NULL_UNICODE = 0x0000;
constexpr int PERCENT_SIGN_UNICODE = 0x0025;
constexpr int UNKNOWN_UNICODE = 0xFFFD;
class utf8_wrapper;
// get a Unicode character from a utf8 string
uint32_t UTF8_getch( const char **src, int *srclen );
inline uint32_t UTF8_getch( const std::string &str )
{
const char *utf8str = str.c_str();
int len = str.length();
return UTF8_getch( &utf8str, &len );
}
// convert cursorx value to byte position
int cursorx_to_position( const char *line, int cursorx, int *prevpos = nullptr, int maxlen = -1 );
int utf8_width( const char *s, bool ignore_tags = false );
int utf8_width( const std::string &str, bool ignore_tags = false );
int utf8_width( const utf8_wrapper &str, bool ignore_tags = false );
std::string left_justify( const std::string &str, int width, bool ignore_tags = false );
std::string right_justify( const std::string &str, int width, bool ignore_tags = false );
std::string utf8_justify( const std::string &str, int width, bool ignore_tags = false );
/**
* Center text inside whole line.
* @param text to be centered.
* @param start_pos printable position on line.
* @param end_pos printable position on line.
* @return First char position of centered text or start_pos if text is too big.
*/
int center_text_pos( const char *text, int start_pos, int end_pos );
int center_text_pos( const std::string &text, int start_pos, int end_pos );
int center_text_pos( const utf8_wrapper &text, int start_pos, int end_pos );
std::string utf32_to_utf8( uint32_t ch );
std::string utf8_truncate( const std::string &s, size_t length );
std::string base64_encode( const std::string &str );
std::string base64_decode( const std::string &str );
std::wstring utf8_to_wstr( const std::string &str );
std::string wstr_to_utf8( const std::wstring &wstr );
std::string wstr_to_native( const std::wstring &wstr );
std::string utf32_to_utf8( const std::u32string &str );
std::u32string utf8_to_utf32( const std::string &str );
// Split the given string into displayed characters. Each element of the returned vector
// contains one 'regular' codepoint and all subsequent combining characters.
std::vector<std::string> utf8_display_split( const std::string & );
/**
* UTF8-Wrapper over std::string.
* It looks and feels like a std::string, but uses code points counts
* as index, not bytes.
* A multi-byte Unicode character might be represented
* as 3 bytes in UTF8, this class will see these 3 bytes as 1 character.
* It will never separate them. It will however split between code points
* which might be problematic when containing combination characters.
* In this case use the *_display functions. They operate on the display width.
* Code points with a zero width are considered to belong to the previous code
* point and are not split from that.
* Having a string with like [letter0][letter1][combination-mark][letter2]
* (assuming each letter has a width of 1) will return a display_width of 3.
* substr_display(1, 2) returns [letter1][combination-mark][letter2],
* substr_display(1, 1) returns [letter1][combination-mark]
* substr_display(2, 1) returns [letter2]
*
* Note: functions use code points, not bytes, for counting/indexing!
* Functions with the _display suffix use display width for counting/indexing!
* Protected functions might behave different
*
* For function documentation see std::string, the functions here
* mimic the behavior of the equally named std::string function.
*/
class utf8_wrapper
{
public:
utf8_wrapper() : _length( 0 ), _display_width( 0 ) { }
explicit utf8_wrapper( const std::string &d );
explicit utf8_wrapper( const char *d );
void insert( size_t start, const utf8_wrapper &other );
utf8_wrapper substr( size_t start, size_t length ) const;
utf8_wrapper substr( size_t start ) const {
return substr( start, _length - start );
}
void erase( size_t start, size_t length );
void erase( size_t start ) {
erase( start, _length - start );
}
void append( const utf8_wrapper &other );
utf8_wrapper &replace_all( const utf8_wrapper &search, const utf8_wrapper &replace );
/**
* Returns a substring based on the display width, not the number of
* code points (as the other substr function does).
* @param start Start the returned substring with the character that is
* at that position when this string would be have been printed (rounded down).
* E.g. a string "a´a´" (where a is a normal character, and ` is a combination
* code point) would be displayed as two cells: "áá".
* substr_display(0,2) would return the whole string, substr_display(0,1)
* would return the first two code points, substr_display(1,1) would return the
* last two code points.
* @param length Display length of the returned string, the returned string can
* have a shorter display length (especially if the last character is a multi-cell
* character and including it would exceed the length parameter).
*/
utf8_wrapper substr_display( size_t start, size_t length ) const;
utf8_wrapper substr_display( size_t start ) const {
return substr_display( start, _length - start );
}
utf8_wrapper &operator=( const std::string &d ) {
*this = utf8_wrapper( d );
return *this;
}
const std::string &str() const {
return _data;
}
// Returns Unicode character at position start
uint32_t at( size_t start ) const;
// Returns number of Unicode characters
size_t size() const {
return _length;
}
size_t length() const {
return size();
}
bool empty() const {
return size() == 0;
}
// Display size might be different from length, as some characters
// are displayed as 2 chars in a terminal
size_t display_width() const {
return _display_width;
}
const char *c_str() const {
return _data.c_str();
}
/**
* Return a substring at most maxlength width (display width).
* If the string had to shortened, an ellipsis (...) is added. The
* string with the ellipsis will be exactly maxlength displayed
* characters.
*/
std::string shorten( size_t maxlength ) const;
protected:
std::string _data;
size_t _length;
size_t _display_width;
// Byte offset into @ref _data for codepoint at index start.
// bstart is a initial offset (in bytes!). The function operates on
// _data.substr(bstart), it ignores everything before bstart.
size_t byte_start( size_t bstart, size_t start ) const;
// Byte offset into @ref _date for the codepoint starting at displayed cell start,
// if the first character occupies two cells, than byte_start_display(2)
// would return the byte offset of the second codepoint
// byte_start_display(1) and byte_start_display(0) would return 0
size_t byte_start_display( size_t bstart, size_t start ) const;
// Same as @ref substr, but with a byte index as start
utf8_wrapper substr_byte( size_t bytestart, size_t length, bool use_display_width ) const;
void init_utf8_wrapper();
};
/* A range that iterates through Unicode code points in a UTF-8 encoded string
* without incurring dynamic memory allocation.
*
* Example:
* for( char32_t c : utf8_view( "..." ) ) {
* do_something_with( c );
* }
*/
class utf8_view
{
private:
const char *buffer;
std::size_t length;
class iterator
{
public:
using iterator_category = std::input_iterator_tag;
using difference_type = std::ptrdiff_t;
using value_type = char32_t;
using pointer = value_type*;
using reference = value_type&;
private:
const char *ptr;
const char *next_ptr;
int remaining;
int next_remaining;
char32_t unicode;
void decode() {
if( remaining > 0 ) {
unicode = UTF8_getch( &next_ptr, &next_remaining );
} else {
next_ptr = nullptr;
next_remaining = 0;
unicode = 0;
}
}
public:
explicit iterator( const char *ptr, int remaining ) : ptr( ptr ), remaining( remaining ) {
next_ptr = ptr;
next_remaining = remaining;
decode();
}
bool operator != ( const iterator &rhs ) const noexcept {
return this->ptr != rhs.ptr;
}
const iterator &operator++() noexcept {
ptr = next_ptr;
remaining = next_remaining;
decode();
return *this;
}
char32_t operator*() const noexcept {
return unicode;
}
};
public:
explicit utf8_view( const std::string &str ) : buffer( str.c_str() ), length( str.length() ) {}
iterator begin() const noexcept {
return iterator( buffer, length );
}
iterator end() const noexcept {
return iterator( buffer + length, 0 );
}
};
#endif // CATA_SRC_CATACHARSET_H