utfcpp下载utfcpp源代码下载

UTF8-CPP：以可移植的方式在C ++中使用UTF-8

介绍

C ++开发人员仍然缺少一种简单且可移植的方式来处理Unicode编码的字符串。最初的C ++标准（即C ++ 98或C ++ 03）对Unicode一无所知。后续的标准版本取得了一些进展，但仅靠标准设施来处理Unicode仍然很困难。

我编写了一个兼容C ++ 98的小型通用库，用于处理UTF-8编码的字符串。对于习惯使用STL算法和迭代器的人来说，它应该很容易上手。该代码可免费用于任何目的——详见许可证。自2006年首次发布以来，该库已在商业和开源项目中得到大量使用，并被证明是稳定且有用的。

UTF8-CPP：带C ++的UTF-8以便携式方式
- 介绍
- 安装
- 使用示例
  - 入门样本
  - 检查文件是否包含有效的UTF-8文本
  - 确保字符串包含有效的UTF-8文本
- 兴趣点
  - 设计目标和决策
  - 替代方案
- 参考
  - 来自UTF8名称空间的功能
    - UTF8 ::附录
      - Octet_iterator附录（UTFCHAR32_T CP，OCTET_ITERATOR结果）
      - void append（utfchar32_t cp，std :: string＆s）;
    - UTF8 ::附录16
      - Word_iterator附录16（utfchar32_t CP，Word_iterator结果）
      - void Append（UTFCHAR32_T CP，STD :: U16STRING＆S）
    - UTF8 :: Next
    - UTF8 :: Next16
    - utf8 :: peek_next
    - UTF8 ::先验
    - UTF8 :: Advance
    - UTF8 ::距离
    - UTF8 :: UTF16TO8
      - Octet_iterator UTF16TO8（U16BIT_ITERATOR启动，U16BIT_ITERATOR END，OCTET_ITERATOR结果）
      - std :: String utf16to8（const std :: u16String＆s）
      - std :: String utf16to8（std :: u16String_view s）
    - UTF8 :: UTF16TOU8
      - std :: u8string utf16tou8（const std :: u16string＆s）
      - std :: u8string utf16tou8（const std :: u16string_view＆s）
    - UTF8 :: UTF8TO16
      - U16BIT_ITERATOR UTF8TO16（octet_iterator start，Octet_iterator end，u16bit_iterator结果）
      - std :: u16String utf8to16（const std :: string＆s）
      - std :: u16String utf8to16（std :: string_view s）
      - std :: u16String utf8to16（std :: u8string＆s）
      - std :: u16String utf8to16（std :: u8string_view＆s）
    - UTF8 :: UTF32TO8
      - octet_iterator utf32to8（u32bit_iterator start，u32bit_iterator end，octet_iterator结果）
      - std :: String utf32to8（const std :: u32String＆s）
      - STD :: U8STRING UTF32TO8（CONST STD :: U32STRING＆S）
      - std :: u8string utf32to8（const std :: u32string_view＆s）
      - std :: String utf32to8（const std :: u32String＆s）
      - std :: String utf32to8（std :: U32String_view S）
    - UTF8 :: UTF8TO32
      - U32BIT_ITERATOR UTF8TO32（octet_iterator start，Octet_iterator end，u32bit_iterator结果）
      - STD :: U32STRING UTF8TO32（const std :: U8String＆S）
      - std :: u32String utf8to32（const std :: u8string_view＆s）
      - std :: u32String utf8to32（const std :: string＆s）
      - std :: u32String utf8to32（std :: string_view s）
    - utf8 :: find_invalid
      - octet_iterator find_invalid（octet_iterator start，octet_iterator end）
      - const char* find_invalid（const char* str）
      - std :: size_t find_invalid（const std :: string＆s）
      - std :: size_t find_invalid（std :: string_view s）
    - utf8 :: is_valid
      - bool is_valid（octet_iterator start，octet_iterator end）
      - bool is_valid（const char* str）
      - bool is_valid（const std :: string＆s）
      - bool is_valid（std :: string_view s）
    - utf8 :: replace_invalid
      - output_iterator replace_invalid（octet_iterator start，octet_iterator end，output_iterator out，utfchar32_t替换）
      - std :: String repent_invalid（const std :: string＆s，utfchar32_t替换）
      - std :: string repent_invalid（std :: string_view s，char32_t替换）
    - utf8 :: start_with_bom
      - bool start_with_bom（octet_iterator it，octet_iterator end）
      - bool start_with_bom（const std :: string＆s）
      - bool start_with_bom（std :: string_view s）
  - UTF8名称空间的类型
    - UTF8 ::异常
    - UTF8 :: Invalid_code_point
    - UTF8 :: Invalid_utf8
    - UTF8 :: Invalid_UTF16
    - utf8 :: not_enough_room
    - UTF8 :: iterator
      - 成员功能
  - utf8::unchecked 命名空间中的函数
    - utf8::unchecked::append
    - utf8::unchecked::append16
    - utf8::unchecked::next
    - utf8::unchecked::next16
    - utf8::unchecked::peek_next
    - utf8::unchecked::prior
    - utf8::unchecked::advance
    - utf8::unchecked::distance
    - utf8::unchecked::utf16to8
    - utf8::unchecked::utf8to16
    - utf8::unchecked::utf32to8
    - utf8::unchecked::utf8to32
    - utf8::unchecked::replace_invalid
  - utf8::unchecked 命名空间中的类型
    - utf8::iterator
      - 成员函数

安装

这是一个仅标题库，部署它的支持方式是：

从https://github.com/nemtrif/utfcpp/releases下载发行版
解开发行版
将utfcpp/source目录中的内容复制到存放项目头文件（include文件）的目录中

CMakeLists.txt文件最初仅用于测试目的，但不幸的是，随着时间的推移，我接受了一些添加安装目标的贡献。这并不是安装utfcpp库的受支持方式，我正在考虑在以后的版本中删除CMakeLists.txt。

使用示例

入门样本

为了说明库的用法，让我们从一个小而完整的程序开始：该程序打开一个包含UTF-8编码文本的文件，逐行读取，检查每一行是否含有无效的UTF-8字节序列，然后将其转换为UTF-16编码，再转换回UTF-8：

# include < fstream >
# include < iostream >
# include < string >
# include < vector >
# include " utf8.h "
using namespace std ;
int main ( int argc, char ** argv)
{
    if (argc != 2 ) {
        cout << " n Usage: docsample filename n " ;
        return 0 ;
    }
    const char * test_file_path = argv[ 1 ];
    // Open the test file (must be UTF-8 encoded)
    ifstream fs8 (test_file_path);
    if (!fs8. is_open ()) {
        cout << " Could not open " << test_file_path << endl;
        return 0 ;
    }

    unsigned line_count = 1 ;
    string line;
    // Play with all the lines in the file
    while ( getline (fs8, line)) {
        // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function)
# if __cplusplus >= 201103L // C++ 11 or later
        auto end_it = utf8::find_invalid (line. begin (), line. end ());
# else
        string::iterator end_it = utf8::find_invalid (line. begin (), line. end ());
# endif // C++ 11
        if (end_it != line. end ()) {
            cout << " Invalid UTF-8 encoding detected at line " << line_count << " n " ;
            cout << " This part is fine: " << string (line. begin (), end_it) << " n " ;
        }
        // Get the line length (at least for the valid part)
        int length = utf8::distance (line. begin (), end_it);
        cout << " Length of line " << line_count << " is " << length <<  " n " ;

        // Convert it to utf-16
# if __cplusplus >= 201103L // C++ 11 or later
        u16string utf16line = utf8::utf8to16 (line);
# else
        vector< unsigned short > utf16line;
        utf8::utf8to16 (line. begin (), end_it, back_inserter (utf16line));
# endif // C++ 11
        // And back to utf-8;
# if __cplusplus >= 201103L // C++ 11 or later
        string utf8line = utf8::utf16to8 (utf16line);
# else
        string utf8line; 
        utf8::utf16to8 (utf16line. begin (), utf16line. end (), back_inserter (utf8line));
# endif // C++ 11
        // Confirm that the conversion went OK:
        if (utf8line != string (line. begin (), end_it))
            cout << " Error in UTF-16 conversion at line: " << line_count << " n " ;        

        line_count++;
    } 

    return 0 ;
}

在上一个代码样本中，对于每行，我们用find_invalid进行了无效的UTF-8序列的检测；通过使用utf8::distance确定字符的数量（更准确地说 - UNICODE代码点的数量，包括线的结束，甚至是BOM的结尾）。最后，我们将每行转换为使用utf8to16编码的UTF-16，并使用utf16to8转换为UTF-8。

请注意旧编译器的不同使用模式。例如，这就是我们将UTF-8编码的字符串转换为使用PRE-C ++ 11编译器编码的UTF-16的字符串：

    vector< unsigned short > utf16line;
    utf8::utf8to16 (line.begin(), end_it, back_inserter(utf16line));

使用更现代的编译器，相同的操作看起来像：

    u16string utf16line = utf8::utf8to16(line);

如果__cplusplus宏指向C ++ 11或更高版本，则该库会提供支持C ++标准Unicode字符串和移动语义的API。使用较旧的编译器时，仍然可以使用相同的功能，只是使用起来稍微不那么方便。

如果您不信任__cplusplus宏，或者例如即使使用现代编译器也不想包含C ++ 11的辅助函数，请在包含utf8.h之前定义UTF_CPP_CPLUSPLUS宏，并将其设置为您要使用的标准所对应的值——取值与__cplusplus宏相同。对于那些已经很好地支持较新标准、却在设置__cplusplus宏时比较保守的编译器，这一点也很有用——Microsoft的Visual C ++就是一个例子。

检查文件是否包含有效的UTF-8文本

这是一个函数，可以检查文件的内容是否有效UTF-8编码文本，而无需将内容读取到内存：

 bool valid_utf8_file(const char* file_name)
{
    // Checks whether the file named file_name holds valid UTF-8 text.
    // The content is streamed through utf8::is_valid; it is never
    // loaded into memory as a whole.
    ifstream ifs(file_name);
    if (ifs.fail())
        return false; // even better, throw here

    // A default-constructed istreambuf_iterator acts as the end-of-stream marker.
    return utf8::is_valid(istreambuf_iterator<char>(ifs.rdbuf()),
                          istreambuf_iterator<char>());
}

由于函数utf8::is_valid()可与输入迭代器一起使用，因此我们能够将istreambuf_iterator传递给it ，并直接读取文件的内容而无需将其加载到内存。

请注意，其他采用输入迭代参数的功能可以以类似的方式使用。例如，要读取UTF-8编码的文本文件的内容并将文本转换为UTF-16，请执行以下操作：

    utf8::utf8to16 (it, eos, back_inserter(u16string));

确保字符串包含有效的UTF-8文本

如果我们有一些“可能”包含UTF-8编码文本的文本，并且我们希望用替换字符替换任何无效的UTF-8序列，则可以使用以下功能：

 void fix_utf8_string(std::string& str)
{
    // Rewrites str so that it holds the output of utf8::replace_invalid.
    // replace_invalid writes to a separate output range, so the repaired
    // text is built in a scratch string and then assigned back.
    std::string repaired;
    utf8::replace_invalid(str.begin(), str.end(), std::back_inserter(repaired));
    str = repaired;
}

该函数将用Unicode替换字符替换任何无效的UTF-8序列。有一个超载功能，使呼叫者能够提供自己的替换字符。

兴趣点

设计目标和决策

该库的设计目标是：

通用：无论好坏，现有的C ++字符串类有很多，该库应当能与其中尽可能多的类配合使用。
可移植：库应当能在不同的平台和编译器之间移植。唯一不可移植的代码是一小段声明不同大小无符号整数的代码：三个typedef。如果它们与用户的平台不匹配，库的用户可以修改它们。默认设置应适用于Windows（32位和64位）以及大多数32位和64位的UNIX衍生系统。对C ++ 03之后语言特性的支持仅在API层面针对现代编译器提供，因此即使使用相当旧的编译器，库也应该能够工作。
轻量级：遵循“只为你使用的东西付出代价”的准则。
非侵入式：避免把任何特定的设计甚至编程风格强加给用户。这是一个库，而不是框架。

替代方案

对于替代方案和比较，我建议以下文章：Jeanheyd Meneide撰写的C和C ++编码API的奇妙世界。在文章中，该库与以下内容进行了比较：

Simdutf
ICONV
boost.text
ICU
encoding_rs
Windows API函数用于在编码之间转换文本
ZTD.TEXT

本文介绍了作者对API设计质量的看法，但也介绍了一些速度基准。

参考

来自UTF8名称空间的功能

UTF8 ::附录

Octet_iterator附录（UTFCHAR32_T CP，OCTET_ITERATOR结果）

可在1.0版和更高版本中提供。

将32位代码点编码为UTF-8序列序列，并将序列附加到UTF-8字符串上。

 template < typename octet_iterator>
octet_iterator append ( utfchar32_t cp, octet_iterator result);

octet_iterator ：输出迭代器。
cp ：一个32位整数，代表代码点以附加到序列。
result ：输出迭代器到序列中的位置附加代码点的位置。
返回值：迭代器指向新附加序列之后的位置。

使用的示例：

 unsigned char u[ 5 ] = { 0 , 0 , 0 , 0 , 0 };
unsigned char * end = append( 0x0448 , u);
assert (u[ 0 ] == 0xd1 && u[ 1 ] == 0x88 && u[ 2 ] == 0 && u[ 3 ] == 0 && u[ 4 ] == 0 );

请注意， append不会分配任何内存——确保为该操作分配了足够的内存是调用者的责任。更有意思的是， append可能向序列追加1到4个八位字节。在实践中，您通常会使用std::back_inserter来确保分配了必要的内存。

如果是无效的代码点，则抛出utf8::invalid_code_point 。

void append（utfchar32_t cp，std :: string＆s）;

在版本3.0和更高版本中可用。在4.0之前，它需要C ++ 11编译器；用4.0提起要求。

将32位代码点编码为UTF-8序列序列，并将序列附加到UTF-8字符串上。

 void append ( utfchar32_t cp, std::string& s);

cp ：一个代码点，用于附加到字符串。
s ：一个UTF-8编码的字符串，以将代码点附加到。

使用的示例：

std::string u;
append ( 0x0448 , u);
assert (u[ 0 ] == char ( 0xd1 ) && u[1] == char( 0x88 ) && u.length() == 2);

如果是无效的代码点，则抛出utf8::invalid_code_point 。

UTF8 ::附录16

Word_iterator附录16（utfchar32_t CP，Word_iterator结果）

在4.0版和更高版本中可用。

编码一个32位代码点作为UTF-16单词序列，并将序列附加到UTF-16字符串上。

 template < typename word_iterator>
word_iterator append16 ( utfchar32_t cp, word_iterator result);

word_iterator ：输出迭代器。
cp ：一个32位整数，代表代码点以附加到序列。
result ：输出迭代器到序列中的位置附加代码点的位置。
返回值：迭代器指向新附加序列之后的位置。

使用的示例：

 unsigned short u[ 2 ] = { 0 , 0 };
unsigned short * end = append16( 0x0448 , u);
assert (u[ 0 ] == 0x0448 && u[ 1 ] == 0 );

请注意， append16不会分配任何内存 - 确保分配足够的内存用于操作是呼叫者的负担。为了使事情变得更有趣， append16可以在序列中添加一个或两个单词。在实践中，您通常需要使用std::back_inserter来确保分配必要的内存。

如果是无效的代码点，则抛出utf8::invalid_code_point 。

void Append（UTFCHAR32_T CP，STD :: U16STRING＆S）

在4.0版和更高版本中可用。需要C ++ 11兼容的编译器。

编码一个32位代码点作为UTF-16单词序列，并将序列附加到UTF-16字符串上。

 void append ( utfchar32_t cp, std::u16string& s);

cp ：一个代码点，用于附加到字符串。
s ：一个UTF-16编码的字符串，以将代码点附加到。

使用的示例：

std::u16string u;
append ( 0x0448 , u);
assert (u[ 0 ] == 0x0448 && u.length() == 1);

如果是无效的代码点，则抛出utf8::invalid_code_point 。

UTF8 :: Next

可在1.0版和更高版本中提供。

给定迭代器到UTF-8序列的开头，它返回代码点并将迭代器移至下一个位置。

 template < typename octet_iterator> 
utfchar32_t next (octet_iterator& it, octet_iterator end);

octet_iterator ：输入迭代器。
it ：指向UTF-8编码代码点开始的迭代器的引用。函数返回后，它会增加以指向下一个代码点的开始。
end ：要处理的UTF-8序列的结尾。如果在提取代码点的过程中it变得等于end ，则会抛出utf8::not_enough_room异常。
返回值：处理后的UTF-8代码点的32位表示。

使用的示例：

 char * twochars = " xe6x97xa5xd1x88 " ;
char * w = twochars;
int cp = next(w, twochars + 6 );
assert (cp == 0x65e5 );
assert (w == twochars + 3 );

此功能通常用于通过UTF-8编码的字符串迭代。

如果是无效的UTF-8序列，则会抛出utf8::invalid_utf8异常。

UTF8 :: Next16

在4.0版和更高版本中可用。

给定迭代器到UTF-16序列的开头，它返回代码点并将迭代器移至下一个位置。

 template < typename word_iterator>
utfchar32_t next16 (word_iterator& it, word_iterator end);

word_iterator ：输入迭代器。
it ：指向UTF-16编码代码点开始的迭代器的引用。函数返回后，它会增加以指向下一个代码点的开始。
end ：要处理的UTF-16序列的结尾。如果在提取代码点的过程中it变得等于end ，则会抛出utf8::not_enough_room异常。
返回值：处理后的UTF-16代码点的32位表示。

使用的示例：

 const unsigned short u[ 3 ] = { 0x65e5 , 0xd800 , 0xdf46 };
const unsigned short * w = u;
int cp = next16(w, w + 3 );
assert (cp, 0x65e5 );
assert (w, u + 1 );

此功能通常用于通过UTF-16编码的字符串迭代。

如果是无效的UTF-16序列，则会引发utf8::invalid_utf8异常。

utf8 :: peek_next

可在2.1版和更高版本中提供。

给定迭代器到UTF-8序列的开头，它返回以下序列的代码点，而无需更改迭代器的值。

 template < typename octet_iterator> 
utfchar32_t peek_next (octet_iterator it, octet_iterator end);

octet_iterator ：输入迭代器。
it ：指向UTF-8编码代码点开始的迭代器。
end ：要处理的UTF-8序列的结尾。如果在提取代码点的过程中it变得等于end ，则会抛出utf8::not_enough_room异常。
返回值：处理后的UTF-8代码点的32位表示。

使用的示例：

 char * twochars = " xe6x97xa5xd1x88 " ;
char * w = twochars;
int cp = peek_next(w, twochars + 6 );
assert (cp == 0x65e5 );
assert (w == twochars);

如果是无效的UTF-8序列，则会抛出utf8::invalid_utf8异常。

UTF8 ::先验

可在版本1.02和更高版本中提供。

给定对指向UTF-8序列中八位字的迭代器的引用，它降低了迭代器，直到达到了先前的UTF-8编码代码点的开始并返回代码点的32位表示。

 template < typename octet_iterator> 
utfchar32_t prior (octet_iterator& it, octet_iterator start);

octet_iterator ：双向迭代器。
it ：指向UTF-8编码字符串中八位位的引用。函数返回后，将其降低至指向上一个代码点的开始。
start ：一个迭代器到搜索代码点开始的序列的开头。这是一种安全措施，以防止在搜索UTF-8铅八位位时通过字符串的开头。
返回值：上一个代码点的32位表示。

使用的示例：

 char * twochars = " xe6x97xa5xd1x88 " ;
unsigned char * w = twochars + 3 ;
int cp = prior (w, twochars);
assert (cp == 0x65e5 );
assert (w == twochars);

该函数有两个用途：一是在UTF-8编码的字符串中向后迭代。请注意，通常更好的做法是向前迭代，因为utf8::next速度更快。二是当我们处于字符串中的任意位置时，找到UTF-8序列的开头。请注意，在这种情况下， utf8::prior在某些场景中可能检测不到无效的UTF-8序列：例如，如果存在多余的尾随八位字节，它会直接跳过它们。

it通常会指向代码点的开始，并且start将指向字符串的开头，以确保我们不会向后走得太远。 it会降低，直到指向铅UTF-8八位位，然后以该八位位开始为32位表示并返回的UTF-8序列。

如果在击中UTF-8 LEAD OCTAT之前就可以到达start ，或者如果铅OCTET启动了无效的UTF-8序列，则会抛出invalid_utf8异常。

如果start等于it ，则会抛出一个not_enough_room异常。

UTF8 :: Advance

可在1.0版和更高版本中提供。

通过UTF-8序列中指定数量的代码点来推进迭代器。

 template < typename octet_iterator, typename distance_type> 
void advance (octet_iterator& it, distance_type n, octet_iterator end);

octet_iterator ：输入迭代器。
distance_type ：一种积分类型转换为octet_iterator的差异类型。
it ：指向UTF-8编码代码点开始的迭代器的引用。函数返回后，它会增加以指向以下代码点的第n个。
n ：代码点的数量it提高。负值意味着减少。
end ：要处理的UTF-8序列的界限。如果n为正，并且在提取代码点的过程中it变得等于end ，则会抛出utf8::not_enough_room异常。如果n为负，并且it在指向UTF-8序列的尾随字节时到达end ，则会抛出utf8::invalid_code_point异常。

使用的示例：

 char * twochars = " xe6x97xa5xd1x88 " ;
unsigned char * w = twochars;
advance (w, 2 , twochars + 6 );
assert (w == twochars + 5 );
advance (w, - 2 , twochars);
assert (w == twochars);

如果是无效的代码点，则抛出utf8::invalid_code_point 。

UTF8 ::距离

可在1.0版和更高版本中提供。

给定指向序列中两个UTF-8编码代码点的迭代器，返回它们之间的代码点数量。

 template < typename octet_iterator> 
typename std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator first, octet_iterator last);

octet_iterator ：输入迭代器。
first ：迭代器到UTF-8编码代码点的开始。
last ：在我们试图确定长度的序列中，最后一个UTF-8编码代码点的“后端”迭代器。它可以是新代码点的开始。
返回值：两个迭代器之间以代码点计的距离。

使用的示例：

 char * twochars = " xe6x97xa5xd1x88 " ;
size_t dist = utf8::distance(twochars, twochars + 5 );
assert (dist == 2 );

此函数用于求UTF-8编码字符串的长度（以代码点计）。之所以叫distance而不是length，主要是因为开发人员习惯于认为length是O（1）的函数。计算UTF-8字符串的长度是线性操作，因此仿照std::distance算法来命名看起来更合适。

如果是无效的UTF-8序列，则会抛出utf8::invalid_utf8异常。如果last没有指出UTF-8序列的过去，则会抛出utf8::not_enough_room Exception。

UTF8 :: UTF16TO8

Octet_iterator UTF16TO8（U16BIT_ITERATOR启动，U16BIT_ITERATOR END，OCTET_ITERATOR结果）

可在1.0版和更高版本中提供。

将UTF-16编码的字符串转换为UTF-8。

 template < typename u16bit_iterator, typename octet_iterator>
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result);

u16bit_iterator ：输入迭代器。
octet_iterator ：输出迭代器。
start ：指向UTF-16编码字符串的开始的迭代器要转换。
end ：指向通过UTF-16编码字符串的末端的迭代器进行转换。
result ：输出迭代器到UTF-8字符串中的位置，以附加转换结果。
返回值：迭代器指向附加的UTF-8字符串之后的位置。

使用的示例：

 unsigned short utf16string[] = { 0x41 , 0x0448 , 0x65e5 , 0xd834 , 0xdd1e };
vector< unsigned char > utf8result;
utf16to8 (utf16string, utf16string + 5 , back_inserter(utf8result));
assert (utf8result.size() == 10);

在无效的UTF-16序列的情况下，抛出了utf8::invalid_utf16异常。

std :: String utf16to8（const std :: u16String＆s）

在版本3.0和更高版本中可用。需要C ++ 11兼容的编译器。

将UTF-16编码的字符串转换为UTF-8。

std::string utf16to8 ( const std::u16string& s);

s ：UTF-16编码字符串。返回值：UTF-8编码字符串。

使用的示例：

    u16string utf16string = { 0x41 , 0x0448 , 0x65e5 , 0xd834 , 0xdd1e };
    string u = utf16to8(utf16string);
    assert (u.size() == 10);

在无效的UTF-16序列的情况下，抛出了utf8::invalid_utf16异常。

std :: String utf16to8（std :: u16String_view s）

可在版本3.2及以后提供。需要C ++ 17兼容的编译器。

将UTF-16编码的字符串转换为UTF-8。

std::string utf16to8 (std::u16string_view s);

s ：UTF-16编码字符串。返回值：UTF-8编码字符串。

使用的示例：

    u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
    // Build the view from the string object (not the type name) and pass
    // the view itself, so the string_view overload is the one exercised.
    u16string_view utf16stringview(utf16string);
    string u = utf16to8(utf16stringview);
    assert(u.size() == 10);

在无效的UTF-16序列的情况下，抛出了utf8::invalid_utf16异常。

UTF8 :: UTF16TOU8

std :: u8string utf16tou8（const std :: u16string＆s）

在4.0版和更高版本中可用。需要C ++ 20兼容的编译器。

将UTF-16编码的字符串转换为UTF-8。

std::u8string utf16tou8 ( const std::u16string& s);

s ：UTF-16编码字符串。返回值：UTF-8编码字符串。

使用的示例：

    u16string utf16string = { 0x41 , 0x0448 , 0x65e5 , 0xd834 , 0xdd1e };
    u8string u = utf16tou8(utf16string);
    assert (u.size() == 10);

在无效的UTF-16序列的情况下，抛出了utf8::invalid_utf16异常。

std :: u8string utf16tou8（const std :: u16string_view＆s）

在4.0版和更高版本中可用。需要C ++ 20兼容的编译器。

将UTF-16编码的字符串转换为UTF-8。

std::u8string utf16tou8 ( const std::u16string_view& s);

s ：UTF-16编码字符串。返回值：UTF-8编码字符串。

使用的示例：

    u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
    // Build the view from the string object (not the type name) and pass
    // the view itself, so the u16string_view overload is the one exercised.
    u16string_view utf16stringview(utf16string);
    u8string u = utf16tou8(utf16stringview);
    assert(u.size() == 10);

在无效的UTF-16序列的情况下，抛出了utf8::invalid_utf16异常。

UTF8 :: UTF8TO16

U16BIT_ITERATOR UTF8TO16（octet_iterator start，Octet_iterator end，u16bit_iterator结果）

可在1.0版和更高版本中提供。

将UTF-8编码字符串转换为UTF-16

 template < typename u16bit_iterator, typename octet_iterator>
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result);

octet_iterator ：输入迭代器。
u16bit_iterator ：输出迭代器。
start ：指向UTF-8编码字符串的开头的迭代器要转换。 end ：迭代器指向传递UTF-8编码字符串的末端进行转换。
result ：输出迭代器到UTF-16字符串中的位置，以附加转换结果。
返回值：迭代器指向附加的UTF-16字符串之后的位置。

使用的示例：

 char utf8_with_surrogates[] = " xe6x97xa5xd1x88xf0x9dx84x9e " ;
vector < unsigned short > utf16result;
utf8to16 (utf8_with_surrogates, utf8_with_surrogates + 9 , back_inserter(utf16result));
assert (utf16result.size() == 4);
assert (utf16result[ 2 ] == 0xd834 );
assert (utf16result[ 3 ] == 0xdd1e );

如果是无效的UTF-8序列，则会抛出utf8::invalid_utf8异常。如果end没有指向UTF-8序列的过去，则会抛出utf8::not_enough_room Exception。

std :: u16String utf8to16（const std :: string＆s）

在版本3.0和更高版本中可用。需要C ++ 11兼容的编译器。

将UTF-8编码的字符串转换为UTF-16。

std::u16string utf8to16 ( const std::string& s);

s ：一个用于转换的UTF-8编码字符串。
返回值：UTF-16编码字符串

使用的示例：

string utf8_with_surrogates = " xe6x97xa5xd1x88xf0x9dx84x9e " ;
u16string utf16result = utf8to16(utf8_with_surrogates);
assert (utf16result.length() == 4);
assert (utf16result[ 2 ] == 0xd834 );
assert (utf16result[ 3 ] == 0xdd1e );

如果是无效的UTF-8序列，则会抛出utf8::invalid_utf8异常。

std :: u16String utf8to16（std :: string_view s）

可在版本3.2及以后提供。需要C ++ 17兼容的编译器。

将UTF-8编码的字符串转换为UTF-16。

std::u16string utf8to16 (std::string_view s);

s ：一个用于转换的UTF-8编码字符串。
返回值：UTF-16编码字符串

使用的示例：

string_view utf8_with_surrogates = " xe6x97xa5xd1x88xf0x9dx84x9e " ;
u16string utf16result = utf8to16(utf8_with_surrogates);
assert (utf16result.length() == 4);
assert (utf16result[ 2 ] == 0xd834 );
assert (utf16result[ 3 ] == 0xdd1e );

如果是无效的UTF-8序列，则会抛出utf8::invalid_utf8异常。

std :: u16String utf8to16（std :: u8string＆s）

在4.0版和更高版本中可用。需要C ++ 20兼容的编译器。

将UTF-8编码的字符串转换为UTF-16。

std::u16string utf8to16 (std::u8string& s);

s ：一个用于转换的UTF-8编码字符串。
返回值：UTF-16编码字符串

使用的示例：

std::u8string utf8_with_surrogates = " xe6x97xa5xd1x88xf0x9dx84x9e " ;
std::u16string utf16result = utf8to16(utf8_with_surrogates);
assert (utf16result.length() == 4);
assert (utf16result[ 2 ] == 0xd834 );
assert (utf16result[ 3 ] == 0xdd1e );

如果是无效的UTF-8序列，则会抛出utf8::invalid_utf8异常。

std :: u16String utf8to16（std :: u8string_view＆s）

在4.0版和更高版本中可用。需要C ++ 20兼容的编译器。

将UTF-8编码的字符串转换为UTF-16。

std::u16string utf8to16 (std::u8string_view& s);

s ：一个用于转换的UTF-8编码字符串。
返回值：UTF-16编码字符串

使用的示例：

// std::u8string holds char8_t, so the literal needs the u8 prefix.
std::u8string utf8_with_surrogates = u8"\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
std::u8string_view utf8stringview{utf8_with_surrogates};
std::u16string utf16result = utf8to16(utf8stringview);
assert(utf16result.length() == 4);
assert(utf16result[2] == 0xd834);
assert(utf16result[3] == 0xdd1e);

如果是无效的UTF-8序列，则会抛出utf8::invalid_utf8异常。

UTF8 :: UTF32TO8

octet_iterator utf32to8（u32bit_iterator start，u32bit_iterator end，octet_iterator结果）

可在1.0版和更高版本中提供。

将UTF-32编码的字符串转换为UTF-8。

 template < typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result);

octet_iterator ：输出迭代器。
u32bit_iterator ：输入迭代器。
start ：指向UTF-32编码字符串的开始的迭代器要转换。
end ：指向将UTF-32的字符串传递到末端的迭代器要转换。
result ：输出迭代器到UTF-8字符串中的位置，以附加转换结果。
返回值：迭代器指向附加的UTF-8字符串之后的位置。

使用的示例：

 int utf32string[] = { 0x448 , 0x65E5 , 0x10346 , 0 };
vector< unsigned char > utf8result;
utf32to8 (utf32string, utf32string + 3 , back_inserter(utf8result));
assert (utf8result.size() == 9);

在无效UTF-32字符串的情况下，抛出了utf8::invalid_code_point 。

std :: String utf32to8（const std :: u32String＆s）

在版本3.0和更高版本中可用。需要C ++ 11兼容的编译器。

将UTF-32编码的字符串转换为UTF-8。

std::string utf32to8 ( const std::u32string& s);

s ：UTF-32编码字符串。
返回值：UTF-8编码字符串。

使用的示例：

u32string utf32string = { 0x448 , 0x65E5 , 0x10346 };
string utf8result = utf32to8(utf32string);
assert (utf8result.size() == 9);

在无效UTF-32字符串的情况下，抛出了utf8::invalid_code_point 。

STD :: U8STRING UTF32TO8（CONST STD :: U32STRING＆S）

在4.0版和更高版本中可用。需要C ++ 20兼容的编译器。

将UTF-32编码的字符串转换为UTF-8。

std::u8string utf32to8 ( const std::u32string& s);

s ：UTF-32编码字符串。
返回值：UTF-8编码字符串。

使用的示例：

u32string utf32string = { 0x448 , 0x65E5 , 0x10346 };
u8string utf8result = utf32to8(utf32string);
assert (utf8result.size() == 9);

在无效UTF-32字符串的情况下，抛出了utf8::invalid_code_point 。

std :: u8string utf32to8（const std :: u32string_view＆s）

在4.0版和更高版本中可用。需要C ++ 20兼容的编译器。

将UTF-32编码的字符串转换为UTF-8。

std::u8string utf32to8 ( const std::u32string_view& s);

s ：UTF-32编码字符串。
返回值：UTF-8编码字符串。

使用的示例：

u32string utf32string = { 0x448 , 0x65E5 , 0x10346 };
u32string_view utf32stringview (utf32string);
u8string utf8result = utf32to8(utf32stringview);
assert (utf8result.size() == 9);

在无效UTF-32字符串的情况下，抛出了utf8::invalid_code_point 。

std :: String utf32to8（const std :: u32String＆s）

在版本3.0和更高版本中可用。需要C ++ 11兼容的编译器。

将UTF-32编码的字符串转换为UTF-8。

std::string utf32to8 ( const std::u32string& s);

s ：UTF-32编码字符串。
返回值：UTF-8编码字符串。

使用的示例：

u32string utf32string = { 0x448 , 0x65E5 , 0x10346 };
string utf8result = utf32to8(utf32string);
assert (utf8result.size() == 9);

在无效UTF-32字符串的情况下，抛出了utf8::invalid_code_point 。

std :: String utf32to8（std :: U32String_view S）

可在版本3.2及以后提供。需要C ++ 17兼容的编译器。

将UTF-32编码的字符串转换为UTF-8。

std::string utf32to8 (std::u32string_view s);

s ：UTF-32编码字符串。
返回值：UTF-8编码字符串。

使用的示例：

u32string utf32string = { 0x448 , 0x65E5 , 0x10346 };
u32string_view utf32stringview (utf32string);
string utf8result = utf32to8(utf32stringview);
assert (utf8result.size() == 9);

在无效UTF-32字符串的情况下，抛出了utf8::invalid_code_point 。

UTF8 :: UTF8TO32

U32BIT_ITERATOR UTF8TO32（octet_iterator start，Octet_iterator end，u32bit_iterator结果）

可在1.0版和更高版本中提供。

将UTF-8编码的字符串转换为UTF-32。

 template < typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result);

octet_iterator ：输入迭代器。
u32bit_iterator ：输出迭代器。
start ：指向UTF-8编码字符串的开头的迭代器要转换。
end ：迭代器指向传递UTF-8编码字符串的末端进行转换。
result ：输出迭代器到UTF-32字符串中的位置，以附加转换结果。
返回值：迭代器指向附加的UTF-32字符串之后的位置。

使用的示例：

 char * twochars = " xe6x97xa5xd1x88 " ;
vector< int > utf32result;
utf8to32 (twochars, twochars + 5 , back_inserter(utf32result));
assert (utf32result.size() == 2);

如果是无效的UTF-8序列，则会抛出utf8::invalid_utf8异常。如果end没有指向UTF-8序列的过去，则会抛出utf8::not_enough_room Exception。

STD :: U32STRING UTF8TO32（const std :: U8String＆S）

在4.0版和更高版本中可用。需要C ++ 20兼容的编译器。

将UTF-8编码的字符串转换为UTF-32。

std::u32string utf8to32 ( const std::u8string& s);

s ：UTF-8编码字符串。返回值：UTF-32编码字符串。

使用的示例：

 // The overload takes a std::u8string by const reference, so declare an
// object rather than a pointer.
const std::u8string twochars = u8"\xe6\x97\xa5\xd1\x88";
u32string utf32result = utf8to32(twochars);
assert(utf32result.size() == 2);

如果是无效的UTF-8序列，则会抛出utf8::invalid_utf8异常。

std :: u32String utf8to32（const std :: u8string_view＆s）

在4.0版和更高版本中可用。需要C ++ 20兼容的编译器。

将UTF-8编码的字符串转换为UTF-32。

std::u32string utf8to32 ( const std::u8string_view& s);

s ：UTF-8编码字符串。返回值：UTF-32编码字符串。

使用的示例：

 // Declare the string as an object (not a pointer) so a view can be built over it.
const u8string twochars = u8"\xe6\x97\xa5\xd1\x88";
const u8string_view stringview{twochars};
u32string utf32result = utf8to32(stringview);
assert(utf32result.size() == 2);

如果是无效的UTF-8序列，则会抛出utf8::invalid_utf8异常。

std :: u32String utf8to32（const std :: string＆s）

在版本3.0和更高版本中可用。需要C ++ 11兼容的编译器。

将UTF-8编码的字符串转换为UTF-32。

std::u32string utf8to32 ( const std::string& s);

s ：UTF-8编码字符串。返回值：UTF-32编码字符串。

使用的示例：

 const char * twochars = " xe6x97xa5xd1x88 " ;
u32string utf32result = utf8to32(twochars);
assert (utf32result.size() == 2);

如果是无效的UTF-8序列，则会抛出utf8::invalid_utf8异常。

std :: u32String utf8to32（std :: string_view s）

可在版本3.2及以后提供。需要C ++ 17兼容的编译器。

将UTF-8编码的字符串转换为UTF-32。

std::u32string utf8to32 (std::string_view s);

s ：UTF-8编码字符串。返回值：UTF-32编码字符串。

使用的示例：

string_view twochars = " xe6x97xa5xd1x88 " ;
u32string utf32result = utf8to32(twochars);
assert (utf32result.size() == 2);

如果是无效的UTF-8序列，则会抛出utf8::invalid_utf8异常。

utf8 :: find_invalid

octet_iterator find_invalid（octet_iterator start，octet_iterator end）

可在1.0版和更高版本中提供。

检测UTF-8字符串中无效的序列。

 template < typename octet_iterator> 
octet_iterator find_invalid (octet_iterator start, octet_iterator end);

octet_iterator ：输入迭代器。
start ：指向UTF-8字符串开始测试有效性的迭代器。
end ：指向通过UTF-8字符串的末端的迭代器来测试有效性。
返回值：指向UTF-8字符串中第一个无效八位位的迭代器。如果没有发现，则等于end 。

使用的示例：

 char utf_invalid[] = " xe6x97xa5xd1x88xfa " ;
char * invalid = find_invalid(utf_invalid, utf_invalid + 6 );
assert (invalid == utf_invalid + 5 );

此函数通常用于在使用其他函数处理UTF-8字符串之前确保其有效。在执行任何unchecked（不做检查的）操作之前，这样做尤为重要。

const char* find_invalid（const char* str）

在4.0版和更高版本中可用。

检测C风格UTF-8字符串中的无效序列。

 const char * find_invalid ( const char * str);

str ：UTF-8编码字符串。返回值：UTF-8字符串中第一个无效八位位的指针。如果没有发现，则指向尾部零字节。

使用的示例：

 const char * utf_invalid = " xe6x97xa5xd1x88xfa " ;
const char * invalid = find_invalid(utf_invalid);
assert ((invalid - utf_invalid) == 5);

此函数通常用于在使用其他函数处理UTF-8字符串之前确保其有效。在执行任何unchecked（不做检查的）操作之前，这样做尤为重要。

std :: size_t find_invalid（const std :: string＆s）

在版本3.0和更高版本中可用。在4.0之前，它需要C ++ 11编译器；用4.0解除了要求

检测UTF-8字符串中无效的序列。

std:: size_t find_invalid ( const std::string& s);

s ：UTF-8编码字符串。返回值：UTF-8字符串中第一个无效八位位的索引。如果没有发现，则等于std::string::npos 。

使用的示例：

string utf_invalid = " xe6x97xa5xd1x88xfa " ;
auto invalid = find_invalid(utf_invalid);
assert (invalid == 5 );

此函数通常用于在使用其他函数处理UTF-8字符串之前确保其有效。在执行任何unchecked（不做检查的）操作之前，这样做尤为重要。

std :: size_t find_invalid（std :: string_view s）

可在版本3.2及以后提供。需要C ++ 17兼容的编译器。

检测UTF-8字符串中无效的序列。

std:: size_t find_invalid (std::string_view s);

s ：UTF-8编码字符串。返回值：UTF-8字符串中第一个无效八位位的索引。如果没有发现，则等于std::string_view::npos 。

使用的示例：

string_view utf_invalid = " xe6x97xa5xd1x88xfa " ;
auto invalid = find_invalid(utf_invalid);
assert (invalid == 5 );

此函数通常用于在使用其他函数处理UTF-8字符串之前确保其有效。在执行任何unchecked（不做检查的）操作之前，这样做尤为重要。

utf8 :: is_valid

bool is_valid（octet_iterator start，octet_iterator end）

可在1.0版和更高版本中提供。

检查一系列八位位置是否是有效的UTF-8字符串。

 template < typename octet_iterator> 
bool is_valid (octet_iterator start, octet_iterator end);

octet_iterator ：输入迭代器。
start ：指向UTF-8字符串开始测试有效性的迭代器。
end ：指向通过UTF-8字符串的末端的迭代器来测试有效性。
返回值：如果序列是有效的UTF-8字符串，则为true ；否则为false 。

使用的示例：

 char utf_invalid[] = " xe6x97xa5xd1x88xfa " ;
bool bvalid = is_valid(utf_invalid, utf_invalid + 6 );
assert (bvalid == false );

is_valid是find_invalid(start, end) == end;的简写形式。当您只需要确认一个字节序列是有效的UTF-8字符串，而不需要知道它在无效时具体在哪里出错，就可以使用它。

bool is_valid（const char* str）

在4.0版和更高版本中可用。

检查C风格的字符串是否包含有效的UTF-8编码文本。

 bool is_valid ( const char * str);

str ：UTF-8编码字符串。
返回值：如果字符串包含有效的UTF-8编码文本，则为true ；否则为false 。

使用的示例：

 char utf_invalid[] = " xe6x97xa5xd1x88xfa " ;
bool bvalid = is_valid(utf_invalid);
assert (bvalid == false );

您可能需要使用is_valid来确保字符串包含有效的UTF-8文本，而无需知道该字符串如果不有效，则在哪里失败。

bool is_valid（const std :: string＆s）

在版本3.0和更高版本中可用。在4.0之前，它需要C ++ 11编译器；用4.0解除了要求

检查字符串对象是否包含有效的UTF-8编码文本。

 bool is_valid ( const std::string& s);

s ：UTF-8编码字符串。
返回值：如果字符串包含有效的UTF-8编码文本，则为true ；否则为false 。

使用的示例：

 char utf_invalid[] = " xe6x97xa5xd1x88xfa " ;
bool bvalid = is_valid(utf_invalid);
assert (bvalid == false );

您可能需要使用is_valid来确保字符串包含有效的UTF-8文本，而无需知道该字符串如果不有效，则在哪里失败。

bool is_valid（std :: string_view s）

可在版本3.2及以后提供。需要C ++ 17兼容的编译器。

检查字符串对象是否包含有效的UTF-8编码文本。

 bool is_valid (std::string_view s);

s ：UTF-8编码字符串。
返回值：如果字符串包含有效的UTF-8编码文本，则为true ；否则为false 。

使用的示例：

string_view utf_invalid = " xe6x97xa5xd1x88xfa " ;
bool bvalid = is_valid(utf_invalid);
assert (bvalid == false );

您可能需要使用is_valid来确保字符串包含有效的UTF-8文本，而无需知道该字符串如果不有效，则在哪里失败。

utf8 :: replace_invalid

output_iterator replace_invalid（octet_iterator start，octet_iterator end，output_iterator out，utfchar32_t替换）

可在2.0版和更高版本中提供。

用替换标记替换字符串中的所有无效UTF-8序列。

 template < typename octet_iterator, typename output_iterator>
output_iterator replace_invalid (octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement);
template < typename octet_iterator, typename output_iterator>
output_iterator replace_invalid (octet_iterator start, octet_iterator end, output_iterator out);

octet_iterator ：输入迭代器。
output_iterator ：输出迭代器。
start ：指向UTF-8字符串开始的迭代器，以寻找无效的UTF-8序列。
end ：指向UTF-8字符串端的迭代器，以查找无效的UTF-8序列。
out ：输出迭代器到存储替换结果的范围。
replacement ：替换标记的Unicode代码点。 The version without this parameter assumes the value 0xfffd
Return value: An iterator pointing to the place after the UTF-8 string with replaced invalid sequences.

Example of use:

 char invalid_sequence[] = " a x80xe0xa0xc0xafxedxa0x80 z " ;
vector< char > replace_invalid_result;
replace_invalid (invalid_sequence, invalid_sequence + sizeof (invalid_sequence), back_inserter(replace_invalid_result), '?');
bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
assert (bvalid);
char * fixed_invalid_sequence = " a????z " ;
assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence));

replace_invalid does not perform in-place replacement of invalid sequences. Rather, it produces a copy of the original string with the invalid sequences replaced with a replacement marker. Therefore, out must not be in the [start, end] range.

std::string replace_invalid(const std::string& s, utfchar32_t replacement)

Available in version 3.0 and later. Prior to 4.0 it required a C++ 11 compiler; the requirement is lifted with 4.0

Replaces all invalid UTF-8 sequences within a string with a replacement marker.

std::string replace_invalid ( const std::string& s, utfchar32_t replacement);
std::string replace_invalid ( const std::string& s);

s : a UTF-8 encoded string.
replacement : A Unicode code point for the replacement marker. The version without this parameter assumes the value 0xfffd
Return value: A UTF-8 encoded string with replaced invalid sequences.

Example of use:

string invalid_sequence = " a x80xe0xa0xc0xafxedxa0x80 z " ;
string replace_invalid_result = replace_invalid(invalid_sequence, ' ? ' );
bvalid = is_valid(replace_invalid_result);
assert (bvalid);
const string fixed_invalid_sequence = " a????z " ;
assert (fixed_invalid_sequence == replace_invalid_result);

std::string replace_invalid(std::string_view s, char32_t replacement)

Available in version 3.2 and later. Requires a C++ 17 compliant compiler.

Replaces all invalid UTF-8 sequences within a string with a replacement marker.

std::string replace_invalid (std::string_view s, char32_t replacement);
std::string replace_invalid (std::string_view s);

Example of use:

string_view invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
// Each of the four invalid sequences is replaced by a single '?'.
string replace_invalid_result = replace_invalid(invalid_sequence, '?');
bool bvalid = is_valid(replace_invalid_result);
assert(bvalid);
const string fixed_invalid_sequence = "a????z";
// Compare explicitly: assert(a, b) applies the comma operator and tests only b.
assert(fixed_invalid_sequence == replace_invalid_result);

utf8::starts_with_bom

bool starts_with_bom (octet_iterator it, octet_iterator end)

Available in version 2.3 and later.

Checks whether an octet sequence starts with a UTF-8 byte order mark (BOM)

 template < typename octet_iterator> 
bool starts_with_bom (octet_iterator it, octet_iterator end);

octet_iterator : an input iterator.
it : beginning of the octet sequence to check
end : pass-end of the sequence to check
Return value: true if the sequence starts with a UTF-8 byte order mark; false if not.

Example of use:

 unsigned char byte_order_mark[] = { 0xef , 0xbb , 0xbf };
bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof (byte_order_mark));
assert (bbom == true );

The typical use of this function is to check the first three bytes of a file. If they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 encoded text.

bool starts_with_bom(const std::string& s)

Available in version 3.0 and later. Prior to 4.0 it required a C++ 11 compiler; the requirement is lifted with 4.0

Checks whether a string starts with a UTF-8 byte order mark (BOM)

 bool starts_with_bom ( const std::string& s);

s : a UTF-8 encoded string. Return value: true if the string starts with a UTF-8 byte order mark; false if not.

Example of use:

string byte_order_mark = { char ( 0xef ), char ( 0xbb ), char ( 0xbf )};
bool bbom = starts_with_bom(byte_order_mark);
assert (bbom == true );
string threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
bool no_bbom = starts_with_bom(threechars);
assert (no_bbom == false );

The typical use of this function is to check the first three bytes of a file. If they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 encoded text.

bool starts_with_bom(std::string_view s)

Available in version 3.2 and later. Requires a C++ 17 compliant compiler.

Checks whether a string starts with a UTF-8 byte order mark (BOM)

 bool starts_with_bom (std::string_view s);

s : a UTF-8 encoded string. Return value: true if the string starts with a UTF-8 byte order mark; false if not.

Example of use:

string byte_order_mark = { char ( 0xef ), char ( 0xbb ), char ( 0xbf )};
string_view byte_order_mark_view (byte_order_mark);
bool bbom = starts_with_bom(byte_order_mark_view);
assert (bbom);
string_view threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
bool no_bbom = starts_with_bom(threechars);
assert (!no_bbom);

The typical use of this function is to check the first three bytes of a file. If they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 encoded text.

Types From utf8 Namespace

utf8::exception

Available in version 2.3 and later.

Base class for the exceptions thrown by UTF CPP library functions.

 class exception : public std :: exception {};

Example of use:

 try {
  code_that_uses_utf_cpp_library ();
}
catch ( const utf8:: exception & utfcpp_ex) {
  cerr << utfcpp_ex. what ();
}

utf8::invalid_code_point

Available in version 1.0 and later.

Thrown by UTF8 CPP functions such as advance and next if a UTF-8 sequence represents an invalid code point.

 class invalid_code_point : public exception {
public: 
    utfchar32_t code_point () const ;
};

Member function code_point() can be used to determine the invalid code point that caused the exception to be thrown.

utf8::invalid_utf8

Available in version 1.0 and later.

Thrown by UTF8 CPP functions such as next and prior if an invalid UTF-8 sequence is detected during decoding.

 class invalid_utf8 : public exception {
public: 
    utfchar8_t utf8_octet () const ;
};

Member function utf8_octet() can be used to determine the beginning of the byte sequence that caused the exception to be thrown.

utf8::invalid_utf16

Available in version 1.0 and later.

Thrown by UTF8 CPP function utf16to8 if an invalid UTF-16 sequence is detected during decoding.

 class invalid_utf16 : public exception {
public: 
    utfchar16_t utf16_word () const ;
};

Member function utf16_word() can be used to determine the UTF-16 code unit that caused the exception to be thrown.

utf8::not_enough_room

Available in version 1.0 and later.

Thrown by UTF8 CPP functions such as next if the end of the decoded UTF-8 sequence was reached before the code point was decoded.

 class not_enough_room : public exception {};

utf8::iterator

Available in version 2.0 and later.

Adapts the underlying octet iterator to iterate over the sequence of code points, rather than raw octets.

 template < typename octet_iterator>
class iterator ;

Member functions

iterator(); the default constructor; the underlying octet_iterator is constructed with its default constructor.

explicit iterator (const octet_iterator& octet_it, const octet_iterator& range_start, const octet_iterator& range_end); a constructor that initializes the underlying octet_iterator with octet_it and sets the range in which the iterator is considered valid.

octet_iterator base () const; returns the underlying octet_iterator.

utfchar32_t operator * () const; decodes the utf-8 sequence the underlying octet_iterator is pointing to and returns the code point.

bool operator == (const iterator& rhs) const; returns true if the two underlying iterators are equal.

bool operator != (const iterator& rhs) const; returns true if the two underlying iterators are not equal.

iterator& operator ++ (); the prefix increment - moves the iterator to the next UTF-8 encoded code point.

iterator operator ++ (int); the postfix increment - moves the iterator to the next UTF-8 encoded code point and returns the current one.

iterator& operator -- (); the prefix decrement - moves the iterator to the previous UTF-8 encoded code point.

iterator operator -- (int); the postfix decrement - moves the iterator to the previous UTF-8 encoded code point and returns the current one.

Example of use:

 char * threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
utf8::iterator< char *> it (threechars, threechars, threechars + 9 );
utf8::iterator< char *> it2 = it;
assert (it2 == it);
assert (*it == 0x10346 );
assert (*(++it) == 0x65e5);
assert ((*it++) == 0x65e5);
assert (*it == 0x0448 );
assert (it != it2);
utf8::iterator< char *> endit (threechars + 9 , threechars, threechars + 9 );  
assert (++it == endit);
assert (*(--it) == 0x0448);
assert ((*it--) == 0x0448);
assert (*it == 0x65e5 );
assert (--it == utf8::iterator< char *>(threechars, threechars, threechars + 9 ));
assert (*it == 0x10346 );

The purpose of utf8::iterator adapter is to enable easy iteration as well as the use of STL algorithms with UTF-8 encoded strings. Increment and decrement operators are implemented in terms of utf8::next() and utf8::prior() functions.

Note that utf8::iterator adapter is a checked iterator. It operates on the range specified in the constructor; any attempt to go out of that range will result in an exception. Even the comparison operators require both iterator objects to be constructed against the same range - otherwise an exception is thrown. Typically, the range will be determined by the sequence container functions begin and end, i.e.:

std::string s = " example " ;
utf8::iterator i (s.begin(), s.begin(), s.end());

Functions From utf8::unchecked Namespace

utf8::unchecked::append

Available in version 1.0 and later.

Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence to a UTF-8 string.

 template < typename octet_iterator>
octet_iterator append ( utfchar32_t cp, octet_iterator result);

cp : A 32 bit integer representing a code point to append to the sequence.
result : An output iterator to the place in the sequence where to append the code point.
Return value: An iterator pointing to the place after the newly appended sequence.

Example of use:

 unsigned char u[ 5 ] = { 0 , 0 , 0 , 0 , 0 };
unsigned char * end = unchecked::append( 0x0448 , u);
assert (u[ 0 ] == 0xd1 && u[ 1 ] == 0x88 && u[ 2 ] == 0 && u[ 3 ] == 0 && u[ 4 ] == 0 );

This is a faster but less safe version of utf8::append . It does not check for validity of the supplied code point, and may produce an invalid UTF-8 sequence.

utf8::unchecked::append16

Available in version 4.0 and later.

Encodes a 32 bit code point as a UTF-16 sequence of words and appends the sequence to a UTF-16 string.

 template < typename word_iterator>
word_iterator append16 ( utfchar32_t cp, word_iterator result)

Example of use:

 unsigned short u[ 5 ] = { 0 , 0 };
utf8::unchecked::append16 ( 0x0448 , u);
assert (u[ 0 ] == 0x0448 );
assert (u[ 1 ] == 0x0000 );

This is a faster but less safe version of utf8::append16 . It does not check for validity of the supplied code point, and may produce an invalid UTF-16 sequence.

utf8::unchecked::next

Available in version 1.0 and later.

Given the iterator to the beginning of a UTF-8 sequence, it returns the code point and moves the iterator to the next position.

 template < typename octet_iterator>
utfchar32_t next (octet_iterator& it);

it : a reference to an iterator pointing to the beginning of an UTF-8 encoded code point. After the function returns, it is incremented to point to the beginning of the next code point.
Return value: the 32 bit representation of the processed UTF-8 code point.

Example of use:

 char * twochars = "\xe6\x97\xa5\xd1\x88";
char * w = twochars;
int cp = unchecked::next(w);
assert (cp == 0x65e5 );
assert (w == twochars + 3 );

This is a faster but less safe version of utf8::next . It does not check for validity of the supplied UTF-8 sequence.

utf8::unchecked::next16

Available in version 4.0 and later.

Given the iterator to the beginning of the UTF-16 sequence, it returns the code point and moves the iterator to the next position.

 template < typename word_iterator>
utfchar32_t next16 (word_iterator& it);

word_iterator : an input iterator.
it : a reference to an iterator pointing to the beginning of an UTF-16 encoded code point. After the function returns, it is incremented to point to the beginning of the next code point.

Return value: the 32 bit representation of the processed UTF-16 code point.

Example of use:

 const unsigned short u[ 3 ] = { 0x65e5 , 0xd800 , 0xdf46 };
const unsigned short * w = u;
int cp = unchecked::next16(w);
assert (cp == 0x65e5 );
assert (w == u + 1 );

This function is typically used to iterate through a UTF-16 encoded string.

This is a faster but less safe version of utf8::next16 . It does not check for validity of the supplied UTF-16 sequence.

utf8::unchecked::peek_next

Available in version 2.1 and later.

Given the iterator to the beginning of a UTF-8 sequence, it returns the code point.

 template < typename octet_iterator>
utfchar32_t peek_next (octet_iterator it);

it : an iterator pointing to the beginning of an UTF-8 encoded code point.
Return value: the 32 bit representation of the processed UTF-8 code point.

Example of use:

 char * twochars = "\xe6\x97\xa5\xd1\x88";
char * w = twochars;
int cp = unchecked::peek_next(w);
assert (cp == 0x65e5 );
assert (w == twochars);

This is a faster but less safe version of utf8::peek_next . It does not check for validity of the supplied UTF-8 sequence.

utf8::unchecked::prior

Available in version 1.02 and later.

Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it decreases the iterator until it hits the beginning of the previous UTF-8 encoded code point and returns the 32 bits representation of the code point.

 template < typename octet_iterator>
utfchar32_t prior (octet_iterator& it);

it : a reference pointing to an octet within a UTF-8 encoded string. After the function returns, it is decremented to point to the beginning of the previous code point.
Return value: the 32 bit representation of the previous code point.

Example of use:

 char * twochars = "\xe6\x97\xa5\xd1\x88";
char * w = twochars + 3 ;
int cp = unchecked::prior (w);
assert (cp == 0x65e5 );
assert (w == twochars);

This is a faster but less safe version of utf8::prior . It does not check for validity of the supplied UTF-8 sequence and offers no boundary checking.

utf8::unchecked::advance

Available in version 1.0 and later.

Advances an iterator by the specified number of code points within an UTF-8 sequence.

 template < typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n);

it : a reference to an iterator pointing to the beginning of an UTF-8 encoded code point. After the function returns, it is incremented to point to the nth following code point. n : number of code points it should be advanced. A negative value means decrement.

Example of use:

 char * twochars = "\xe6\x97\xa5\xd1\x88";
char * w = twochars;
unchecked::advance (w, 2 );
assert (w == twochars + 5 );

This is a faster but less safe version of utf8::advance . It does not check for validity of the supplied UTF-8 sequence and offers no boundary checking.

utf8::unchecked::distance

Available in version 1.0 and later.

Given the iterators to two UTF-8 encoded code points in a sequence, returns the number of code points between them.

 template < typename octet_iterator>
typename std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator first, octet_iterator last);

first : an iterator to a beginning of a UTF-8 encoded code point.
last : an iterator to a "post-end" of the last UTF-8 encoded code point in the sequence we are trying to determine the length. It can be the beginning of a new code point, or not.
Return value: the distance between the iterators, in code points.

Example of use:

 char * twochars = "\xe6\x97\xa5\xd1\x88";
size_t dist = utf8::unchecked::distance(twochars, twochars + 5 );
assert (dist == 2 );

This is a faster but less safe version of utf8::distance . It does not check for validity of the supplied UTF-8 sequence.

utf8::unchecked::utf16to8

Available in version 1.0 and later.

Converts a UTF-16 encoded string to UTF-8.

 template < typename u16bit_iterator, typename octet_iterator>
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result);

start : an iterator pointing to the beginning of the UTF-16 encoded string to convert.
end : an iterator pointing to past-the-end of the UTF-16 encoded string to convert.
result : an output iterator to the place in the UTF-8 string where to append the result of conversion.
Return value: An iterator pointing to the place after the appended UTF-8 string.

Example of use:

 unsigned short utf16string[] = { 0x41 , 0x0448 , 0x65e5 , 0xd834 , 0xdd1e };
vector< unsigned char > utf8result;
unchecked::utf16to8 (utf16string, utf16string + 5 , back_inserter(utf8result));
assert (utf8result.size() == 10);

This is a faster but less safe version of utf8::utf16to8 . It does not check for validity of the supplied UTF-16 sequence.

utf8::unchecked::utf8to16

Available in version 1.0 and later.

Converts an UTF-8 encoded string to UTF-16

 template < typename u16bit_iterator, typename octet_iterator>
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result);

start : an iterator pointing to the beginning of the UTF-8 encoded string to convert. end : an iterator pointing to past-the-end of the UTF-8 encoded string to convert.
result : an output iterator to the place in the UTF-16 string where to append the result of conversion.
Return value: An iterator pointing to the place after the appended UTF-16 string.

Example of use:

 char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
vector < unsigned short > utf16result;
unchecked::utf8to16 (utf8_with_surrogates, utf8_with_surrogates + 9 , back_inserter(utf16result));
assert (utf16result.size() == 4);
assert (utf16result[ 2 ] == 0xd834 );
assert (utf16result[ 3 ] == 0xdd1e );

This is a faster but less safe version of utf8::utf8to16 . It does not check for validity of the supplied UTF-8 sequence.

utf8::unchecked::utf32to8

Available in version 1.0 and later.

Converts a UTF-32 encoded string to UTF-8.

 template < typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result);

start : an iterator pointing to the beginning of the UTF-32 encoded string to convert.
end : an iterator pointing to past-the-end of the UTF-32 encoded string to convert.
result : an output iterator to the place in the UTF-8 string where to append the result of conversion.
Return value: An iterator pointing to the place after the appended UTF-8 string.

Example of use:

 int utf32string[] = { 0x448 , 0x65e5 , 0x10346 , 0 };
vector< unsigned char > utf8result;
unchecked::utf32to8 (utf32string, utf32string + 3 , back_inserter(utf8result));
assert (utf8result.size() == 9);

This is a faster but less safe version of utf8::utf32to8 . It does not check for validity of the supplied UTF-32 sequence.

utf8::unchecked::utf8to32

Available in version 1.0 and later.

Converts a UTF-8 encoded string to UTF-32.

 template < typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result);

start : an iterator pointing to the beginning of the UTF-8 encoded string to convert.
end : an iterator pointing to past-the-end of the UTF-8 encoded string to convert.
result : an output iterator to the place in the UTF-32 string where to append the result of conversion.
Return value: An iterator pointing to the place after the appended UTF-32 string.

Example of use:

 char * twochars = "\xe6\x97\xa5\xd1\x88";
vector< int > utf32result;
unchecked::utf8to32 (twochars, twochars + 5 , back_inserter(utf32result));
assert (utf32result.size() == 2);

This is a faster but less safe version of utf8::utf8to32 . It does not check for validity of the supplied UTF-8 sequence.

utf8::unchecked::replace_invalid

Available in version 3.1 and later.

Replaces all invalid UTF-8 sequences within a string with a replacement marker.

 template < typename octet_iterator, typename output_iterator>
output_iterator replace_invalid (octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement);
template < typename octet_iterator, typename output_iterator>
output_iterator replace_invalid (octet_iterator start, octet_iterator end, output_iterator out);

octet_iterator : an input iterator.
output_iterator : an output iterator.
start : an iterator pointing to the beginning of the UTF-8 string to look for invalid UTF-8 sequences.
end : an iterator pointing to past-the-end of the UTF-8 string to look for invalid UTF-8 sequences.
out : An output iterator to the range where the result of replacement is stored.
replacement : A Unicode code point for the replacement marker. The version without this parameter assumes the value 0xfffd
Return value: An iterator pointing to the place after the UTF-8 string with replaced invalid sequences.

Example of use:

 char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
vector< char > replace_invalid_result;
unchecked::replace_invalid (invalid_sequence, invalid_sequence + sizeof (invalid_sequence), back_inserter(replace_invalid_result), '?');
bool bvalid = utf8::is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
assert (bvalid);
char * fixed_invalid_sequence = "a????z";
assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence));

Unlike utf8::replace_invalid , this function does not verify validity of the replacement marker.

Types From utf8::unchecked Namespace

utf8::unchecked::iterator

Available in version 2.0 and later.

Adapts the underlying octet iterator to iterate over the sequence of code points, rather than raw octets.

 template < typename octet_iterator>
class iterator ;

Member functions

iterator(); the default constructor; the underlying octet_iterator is constructed with its default constructor.

explicit iterator (const octet_iterator& octet_it); a constructor that initializes the underlying octet_iterator with octet_it .