ดาวน์โหลด utfcpp - ดาวน์โหลดซอร์สโค้ด utfcpp

UTF8-CPP: UTF-8 พร้อม C ++ ในแบบพกพา

การแนะนำ

นักพัฒนา C ++ ยังคงพลาดวิธีที่ง่ายและพกพาในการจัดการสตริงที่เข้ารหัส Unicode มาตรฐาน C ++ ดั้งเดิม (เรียกว่า C ++ 98 หรือ C ++ 03) คือ Unicode Agnostic ความคืบหน้าบางอย่างเกิดขึ้นในรุ่นต่อมาของมาตรฐาน แต่ก็ยังยากที่จะทำงานกับ Unicode โดยใช้สิ่งอำนวยความสะดวกมาตรฐานเท่านั้น

ฉันมาพร้อมกับห้องสมุดทั่วไปที่เข้ากันได้กับ C ++ 98 เพื่อจัดการสตริงที่เข้ารหัส UTF-8 สำหรับทุกคนที่เคยทำงานกับอัลกอริทึม STL และตัววนซ้ำมันควรจะใช้งานง่ายและเป็นธรรมชาติ รหัสสามารถใช้ได้อย่างอิสระสำหรับวัตถุประสงค์ใด ๆ - ตรวจสอบใบอนุญาต ห้องสมุดมีการใช้งานมากมายตั้งแต่การเปิดตัวครั้งแรกในปี 2549 ทั้งในโครงการเชิงพาณิชย์และโอเพนซอร์ซและพิสูจน์แล้วว่ามีเสถียรภาพและมีประโยชน์

สารบัญ

UTF8-CPP: UTF-8 พร้อม C ++ ในแบบพกพา
- การแนะนำ
- การติดตั้ง
- ตัวอย่างการใช้งาน
  - ตัวอย่างเบื้องต้น
  - ตรวจสอบว่าไฟล์มีข้อความ UTF-8 ที่ถูกต้องหรือไม่
  - ตรวจสอบให้แน่ใจว่าสตริงมีข้อความ UTF-8 ที่ถูกต้อง
- จุดที่น่าสนใจ
  - เป้าหมายการออกแบบและการตัดสินใจ
  - ทางเลือก
- อ้างอิง
  - ฟังก์ชั่นจากเนมสเปซ utf8
    - utf8 :: append
      - octet_iterator append (utfchar32_t cp, octet_iterator result)
      - void append (utfchar32_t cp, std :: string & s);
    - utf8 :: append16
      - word_iterator append16 (utfchar32_t cp, word_iterator result)
      - void append16 (utfchar32_t cp, std :: u16string & s)
    - utf8 :: next
    - utf8 :: next16
    - utf8 :: peek_next
    - utf8 :: prior
    - utf8 :: advance
    - utf8 :: distance
    - utf8 :: utf16to8
      - octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
      - std :: string utf16to8 (const std :: u16string & s)
      - std :: string utf16to8 (std :: u16string_view s)
    - utf8 :: utf16tou8
      - std :: u8string utf16tou8 (const std :: u16string & s)
      - std :: u8string utf16tou8 (const std :: u16string_view & s)
    - utf8 :: utf8to16
      - u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
      - std :: u16string utf8to16 (const std :: string & s)
      - std :: u16string utf8to16 (std :: string_view s)
      - std :: u16string utf8to16 (std :: u8string & s)
      - std :: u16string utf8to16 (std :: u8string_view & s)
    - utf8 :: utf32to8
      - octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
      - std :: string utf32to8 (const std :: u32string & s)
      - std :: u8string utf32tou8 (const std :: u32string & s)
      - std :: u8string utf32tou8 (const std :: u32string_view & s)
      - std :: string utf32to8 (const std :: u32string & s)
      - std :: string utf32to8 (std :: u32string_view s)
    - utf8 :: utf8to32
      - u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
      - std :: u32string utf8to32 (const std :: u8string & s)
      - std :: u32string utf8to32 (const std :: u8string_view & s)
      - std :: u32string utf8to32 (const std :: string & s)
      - std :: u32string utf8to32 (std :: string_view s)
    - utf8 :: find_invalid
      - octet_iterator find_invalid (octet_iterator start, octet_iterator end)
      - const char* find_invalid (const char* str)
      - std :: size_t find_invalid (const std :: string & s)
      - std :: size_t find_invalid (std :: string_view s)
    - utf8 :: is_valid
      - bool is_valid (octet_iterator start, octet_iterator end)
      - bool is_valid (const char* str)
      - bool is_valid (const std :: string & s)
      - bool is_valid (std :: string_view s)
    - utf8 :: replace_invalid
      - output_iterator replace_invalid (octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement)
      - std :: string replace_invalid (const std :: string & s, utfchar32_t replacement)
      - std :: string replace_invalid (std :: string_view s, char32_t replacement)
    - utf8 :: starts_with_bom
      - bool starts_with_bom (octet_iterator it, octet_iterator end)
      - bool starts_with_bom (const std :: string & s)
      - bool starts_with_bom (std :: string_view s)
  - ประเภทจากเนมสเปซ utf8
    - utf8 :: exception
    - utf8 :: invalid_code_point
    - utf8 :: invalid_utf8
    - utf8 :: invalid_utf16
    - utf8 :: not_enough_room
    - utf8 :: iterator
      - ฟังก์ชั่นสมาชิก
  - ฟังก์ชั่นจากเนมสเปซ utf8 :: unchecked
    - utf8 :: unchecked :: append
    - utf8 :: unchecked :: append16
    - utf8 :: unchecked :: next
    - utf8 :: next16
    - utf8 :: unchecked :: peek_next
    - utf8 :: unchecked :: prior
    - utf8 :: unchecked :: advance
    - utf8 :: unchecked :: distance
    - utf8 :: unchecked :: utf16to8
    - utf8 :: unchecked :: utf8to16
    - utf8 :: unchecked :: utf32to8
    - utf8 :: unchecked :: utf8to32
    - utf8 :: unchecked :: replace_invalid
  - ประเภทจากเนมสเปซ utf8 :: unchecked
    - utf8 :: iterator
      - ฟังก์ชั่นสมาชิก

การติดตั้ง

นี่คือห้องสมุดส่วนหัวเท่านั้นและวิธีการที่รองรับการปรับใช้คือ:

ดาวน์โหลดรีลีสจาก https://github.com/nemtrif/utfcpp/releases ลงในไดเรกทอรีชั่วคราว
แตกไฟล์ zip ของรีลีส
คัดลอกเนื้อหาของไดเรกทอรี utfcpp/source ไปยังไดเรกทอรีที่คุณเก็บไฟล์ include ของโครงการ

ไฟล์ CMakeLists.txt เดิมถูกสร้างขึ้นเพื่อการทดสอบเท่านั้น แต่น่าเสียดายที่เมื่อเวลาผ่านไปฉันได้ยอมรับการมีส่วนร่วม (contribution) ที่เพิ่มเป้าหมายการติดตั้งเข้ามา นี่ไม่ใช่วิธีที่รองรับในการติดตั้งไลบรารี utfcpp และฉันกำลังพิจารณาที่จะลบ CMakeLists.txt ในรีลีสในอนาคต

ตัวอย่างการใช้งาน

ตัวอย่างเบื้องต้น

เพื่อแสดงให้เห็นถึงการใช้ไลบรารีให้เริ่มต้นด้วยโปรแกรมขนาดเล็ก แต่สมบูรณ์ที่เปิดไฟล์ที่มีข้อความที่เข้ารหัส UTF-8 อ่านทีละบรรทัดตรวจสอบแต่ละบรรทัดสำหรับลำดับไบต์ UTF-8 ที่ไม่ถูกต้องและแปลงเป็น UTF-16

# include < fstream >
# include < iostream >
# include < string >
# include < vector >
# include " utf8.h "
using namespace std ;
int main ( int argc, char ** argv)
{
    if (argc != 2 ) {
        cout << " n Usage: docsample filename n " ;
        return 0 ;
    }
    const char * test_file_path = argv[ 1 ];
    // Open the test file (must be UTF-8 encoded)
    ifstream fs8 (test_file_path);
    if (!fs8. is_open ()) {
        cout << " Could not open " << test_file_path << endl;
        return 0 ;
    }

    unsigned line_count = 1 ;
    string line;
    // Play with all the lines in the file
    while ( getline (fs8, line)) {
        // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function)
# if __cplusplus >= 201103L // C++ 11 or later
        auto end_it = utf8::find_invalid (line. begin (), line. end ());
# else
        string::iterator end_it = utf8::find_invalid (line. begin (), line. end ());
# endif // C++ 11
        if (end_it != line. end ()) {
            cout << " Invalid UTF-8 encoding detected at line " << line_count << " n " ;
            cout << " This part is fine: " << string (line. begin (), end_it) << " n " ;
        }
        // Get the line length (at least for the valid part)
        int length = utf8::distance (line. begin (), end_it);
        cout << " Length of line " << line_count << " is " << length <<  " n " ;

        // Convert it to utf-16
# if __cplusplus >= 201103L // C++ 11 or later
        u16string utf16line = utf8::utf8to16 (line);
# else
        vector< unsigned short > utf16line;
        utf8::utf8to16 (line. begin (), end_it, back_inserter (utf16line));
# endif // C++ 11
        // And back to utf-8;
# if __cplusplus >= 201103L // C++ 11 or later
        string utf8line = utf8::utf16to8 (utf16line);
# else
        string utf8line; 
        utf8::utf16to8 (utf16line. begin (), utf16line. end (), back_inserter (utf8line));
# endif // C++ 11
        // Confirm that the conversion went OK:
        if (utf8line != string (line. begin (), end_it))
            cout << " Error in UTF-16 conversion at line: " << line_count << " n " ;        

        line_count++;
    } 

    return 0 ;
}

ในตัวอย่างรหัสก่อนหน้าสำหรับแต่ละบรรทัดเราทำการตรวจจับลำดับ UTF-8 ที่ไม่ถูกต้องด้วย find_invalid ; จำนวนอักขระ (แม่นยำยิ่งขึ้น - จำนวนจุดรหัส Unicode รวมถึงจุดสิ้นสุดของบรรทัดและแม้กระทั่ง bom ถ้ามีหนึ่ง) ในแต่ละบรรทัดถูกกำหนดด้วยการใช้ utf8::distance ; ในที่สุดเราได้แปลงแต่ละบรรทัดเป็นการเข้ารหัส UTF-16 ด้วย utf8to16 และกลับเป็น UTF-8 ด้วย utf16to8

หมายเหตุรูปแบบการใช้งานที่แตกต่างกันสำหรับคอมไพเลอร์เก่า ตัวอย่างเช่นนี่คือวิธีที่เราแปลงสตริงที่เข้ารหัส UTF-8 เป็น UTF-16 ที่เข้ารหัสด้วยคอมไพเลอร์ก่อน-C ++ 11:

    vector< unsigned short > utf16line;
    utf8::utf8to16 (line.begin(), end_it, back_inserter(utf16line));

ด้วยคอมไพเลอร์ที่ทันสมัยกว่าการดำเนินการเดียวกันจะมีลักษณะ:

    u16string utf16line = utf8::utf8to16(line);

หากแมโคร __cplusplus ชี้ไปที่ C ++ 11 หรือใหม่กว่าห้องสมุดจะเปิดเผย API ที่คำนึงถึงสตริง Unicode มาตรฐาน C ++ และย้ายความหมาย ด้วยคอมไพเลอร์รุ่นเก่าจึงยังคงเป็นไปได้ที่จะใช้ฟังก์ชั่นเดียวกัน

ในกรณีที่คุณไม่ไว้วางใจแมโคร __cplusplus หรือไม่ต้องการรวมฟังก์ชั่นผู้ช่วย C ++ 11 แม้จะมีคอมไพเลอร์ที่ทันสมัยกำหนด macro UTF_CPP_CPLUSPLUS ก่อนรวม utf8.h และกำหนดค่าสำหรับมาตรฐาน __cplusplus คุณต้องการใช้ สิ่งนี้ยังมีประโยชน์กับคอมไพเลอร์ที่อนุรักษ์นิยมในการตั้งค่า macro __cplusplus แม้ว่าพวกเขาจะได้รับการสนับสนุนที่ดีสำหรับรุ่นมาตรฐานล่าสุด - Visual C ++ ของ Microsoft เป็นตัวอย่างหนึ่ง

ตรวจสอบว่าไฟล์มีข้อความ UTF-8 ที่ถูกต้องหรือไม่

นี่คือฟังก์ชั่นที่ตรวจสอบว่าเนื้อหาของไฟล์เป็นข้อความที่เข้ารหัส UTF-8 ที่ถูกต้องโดยไม่ต้องอ่านเนื้อหาลงในหน่วยความจำ:

 bool valid_utf8_file ( const char * file_name)
{
    ifstream ifs (file_name);
    if (!ifs)
        return false ; // even better, throw here

    istreambuf_iterator< char > it (ifs. rdbuf ());
    istreambuf_iterator< char > eos;

    return utf8::is_valid (it, eos);
}

เนื่องจากฟังก์ชั่น utf8::is_valid() ทำงานร่วมกับอินพุตตัววนซ้ำเราจึงสามารถส่งผ่าน istreambuf_iterator ไปยัง it และอ่านเนื้อหาของไฟล์โดยตรงโดยไม่ต้องโหลดไปยังหน่วยความจำก่อน

โปรดทราบว่าฟังก์ชั่นอื่น ๆ ที่ใช้อาร์กิวเมนต์ Iterator อินพุตสามารถใช้ในลักษณะเดียวกัน ตัวอย่างเช่นในการอ่านเนื้อหาของไฟล์ข้อความที่เข้ารหัส UTF-8 และแปลงข้อความเป็น UTF-16 เพียงทำบางอย่างเช่น:

    utf8::utf8to16 (it, eos, back_inserter(u16string));

ตรวจสอบให้แน่ใจว่าสตริงมีข้อความ UTF-8 ที่ถูกต้อง

หากเรามีข้อความบางอย่างที่ "อาจ" เป็นข้อความที่เข้ารหัส UTF-8 และเราต้องการแทนที่ลำดับ UTF-8 ที่ไม่ถูกต้องใด ๆ ด้วยอักขระทดแทน อาจใช้ฟังก์ชั่นต่อไปนี้:

 // Rewrites str in place with every invalid UTF-8 sequence replaced by the
// library's default replacement character.
void fix_utf8_string(std::string& str)
{
    std::string repaired;
    utf8::replace_invalid(str.begin(), str.end(), std::back_inserter(repaired));
    str = repaired;
}

ฟังก์ชั่นจะแทนที่ลำดับ UTF-8 ที่ไม่ถูกต้องใด ๆ ด้วยอักขระการเปลี่ยน Unicode มีฟังก์ชั่นโอเวอร์โหลดที่ช่วยให้ผู้โทรสามารถจัดหาอักขระทดแทนของตัวเองได้

จุดสนใจ

การออกแบบเป้าหมายและการตัดสินใจ

ห้องสมุดได้รับการออกแบบให้เป็น:

ทั่วไป: เพื่อให้ดีขึ้นหรือแย่ลงมีคลาสสตริง C ++ จำนวนมากออกไปที่นั่นและห้องสมุดควรทำงานกับพวกเขาให้ได้มากที่สุด
พกพา: ไลบรารีควรพกพาทั้งในแพลตฟอร์มและคอมไพเลอร์ที่แตกต่างกัน รหัสที่ไม่สามารถพกพาได้เพียงอย่างเดียวคือส่วนเล็ก ๆ ที่ประกาศจำนวนเต็มที่ไม่ได้ลงนามในขนาดที่แตกต่างกัน: สาม typedefs พวกเขาสามารถเปลี่ยนแปลงได้โดยผู้ใช้ห้องสมุดหากพวกเขาไม่ตรงกับแพลตฟอร์มของพวกเขา การตั้งค่าเริ่มต้นควรใช้งานได้กับ Windows (ทั้ง 32 และ 64 บิต) และอนุพันธ์ UNIX 32 บิตและ 64 บิตส่วนใหญ่ การรองรับคุณสมบัติภาษาโพสต์ C ++ 03 รวมอยู่ในคอมไพเลอร์ที่ทันสมัยในระดับ API เท่านั้นดังนั้นห้องสมุดควรทำงานได้แม้จะมีคอมไพเลอร์เก่า ๆ
น้ำหนักเบา: ทำตามแนวทาง "จ่ายเฉพาะสิ่งที่คุณใช้"
ไม่รบกวน: หลีกเลี่ยงการบังคับใช้การออกแบบเฉพาะหรือแม้กระทั่งรูปแบบการเขียนโปรแกรมกับผู้ใช้ นี่คือห้องสมุดไม่ใช่กรอบ

ทางเลือก

สำหรับทางเลือกและการเปรียบเทียบ ฉันขอแนะนำบทความต่อไปนี้: The Wonderfully Terrible World of C and C++ Encoding APIs (with Some Rust) โดย JeanHeyd Meneide ในบทความดังกล่าว ไลบรารีนี้ถูกนำไปเปรียบเทียบกับ:

simdutf
iconv
boost.text
ICU
encoding_rs
ฟังก์ชั่น Windows API สำหรับการแปลงข้อความระหว่างการเข้ารหัส
ztd.text

บทความนำเสนอมุมมองของผู้เขียนเกี่ยวกับคุณภาพของการออกแบบ API แต่ยังรวมถึงมาตรฐานความเร็ว

อ้างอิง

ฟังก์ชั่นจากเนมสเปซ UTF8

utf8 :: append

octet_iterator append (utfchar32_t cp, octet_iterator result)

มีให้ในเวอร์ชัน 1.0 และใหม่กว่า

เข้ารหัสจุดรหัส 32 บิตเป็นลำดับ UTF-8 ของ octets และผนวกลำดับกับสตริง UTF-8

 template < typename octet_iterator>
octet_iterator append ( utfchar32_t cp, octet_iterator result);

octet_iterator : ตัววนซ้ำเอาต์พุต
cp : จำนวนเต็ม 32 บิตที่แสดงถึงจุดรหัสเพื่อผนวกกับลำดับ
result : ตัววนซ้ำเอาต์พุตไปยังสถานที่ในลำดับที่จะผนวกจุดรหัส
ค่าส่งคืน: ตัววนซ้ำชี้ไปที่สถานที่หลังจากลำดับที่ต่อท้ายใหม่

ตัวอย่างการใช้งาน:

 unsigned char u[ 5 ] = { 0 , 0 , 0 , 0 , 0 };
unsigned char * end = append( 0x0448 , u);
assert (u[ 0 ] == 0xd1 && u[ 1 ] == 0x88 && u[ 2 ] == 0 && u[ 3 ] == 0 && u[ 4 ] == 0 );

โปรดทราบว่า append ไม่ได้จัดสรรหน่วยความจำใด ๆ - เป็นภาระของผู้โทรเพื่อให้แน่ใจว่ามีหน่วยความจำเพียงพอที่จัดสรรสำหรับการดำเนินการ เพื่อให้สิ่งที่น่าสนใจยิ่งขึ้นต่อ append สามารถเพิ่มที่ใดก็ได้ระหว่าง 1 ถึง 4 octets ไปยังลำดับ ในทางปฏิบัติคุณมักจะต้องการใช้ std::back_inserter เพื่อให้แน่ใจว่ามีการจัดสรรหน่วยความจำที่จำเป็น

ในกรณีของจุดรหัสที่ไม่ถูกต้องข้อยกเว้น utf8::invalid_code_point จะถูกโยนทิ้ง

void append (utfchar32_t cp, std :: string & s);

มีให้ในเวอร์ชัน 3.0 และใหม่กว่า ก่อนหน้า 4.0 ต้องใช้คอมไพเลอร์ C ++ 11; ข้อกำหนดจะถูกยกด้วย 4.0

เข้ารหัสจุดรหัส 32 บิตเป็นลำดับ UTF-8 ของ octets และผนวกลำดับกับสตริง UTF-8

 void append ( utfchar32_t cp, std::string& s);

cp : จุดรหัส (code point) ที่จะผนวกเข้ากับสตริง
s : สตริงที่เข้ารหัส UTF-8 เพื่อผนวกจุดรหัสไปที่

ตัวอย่างการใช้งาน:

std::string u;
append ( 0x0448 , u);
assert (u[ 0 ] == char ( 0xd1 ) && u[1] == char( 0x88 ) && u.length() == 2);

ในกรณีของจุดรหัสที่ไม่ถูกต้องข้อยกเว้น utf8::invalid_code_point จะถูกโยนทิ้ง

utf8 :: append16

word_iterator append16 (utfchar32_t cp, word_iterator result)

มีให้ในเวอร์ชัน 4.0 และใหม่กว่า

เข้ารหัสจุดรหัส 32 บิตเป็นลำดับ UTF-16 ของคำและผนวกลำดับเข้ากับสตริง UTF-16

 template < typename word_iterator>
word_iterator append16 ( utfchar32_t cp, word_iterator result);

word_iterator : ตัววนซ้ำเอาต์พุต
cp : จำนวนเต็ม 32 บิตที่แสดงถึงจุดรหัสเพื่อผนวกกับลำดับ
result : ตัววนซ้ำเอาต์พุตไปยังสถานที่ในลำดับที่จะผนวกจุดรหัส
ค่าส่งคืน: ตัววนซ้ำชี้ไปที่สถานที่หลังจากลำดับที่ต่อท้ายใหม่

ตัวอย่างการใช้งาน:

 unsigned short u[ 2 ] = { 0 , 0 };
unsigned short * end = append16( 0x0448 , u);
assert (u[ 0 ] == 0x0448 && u[ 1 ] == 0 );

โปรดทราบว่า append16 ไม่ได้จัดสรรหน่วยความจำใด ๆ - เป็นภาระของผู้โทรเพื่อให้แน่ใจว่ามีหน่วยความจำเพียงพอที่จัดสรรสำหรับการดำเนินการ เพื่อให้สิ่งที่น่าสนใจยิ่งขึ้น append16 สามารถเพิ่มหนึ่งหรือสองคำในลำดับ ในทางปฏิบัติคุณมักจะต้องการใช้ std::back_inserter เพื่อให้แน่ใจว่ามีการจัดสรรหน่วยความจำที่จำเป็น

ในกรณีของจุดรหัสที่ไม่ถูกต้องข้อยกเว้น utf8::invalid_code_point จะถูกโยนทิ้ง

void append16 (utfchar32_t cp, std :: u16string & s)

มีให้ในเวอร์ชัน 4.0 และใหม่กว่า ต้องใช้คอมไพเลอร์ที่สอดคล้องกับ C ++ 11

เข้ารหัสจุดรหัส 32 บิตเป็นลำดับ UTF-16 ของคำและผนวกลำดับเข้ากับสตริง UTF-16

 void append ( utfchar32_t cp, std::u16string& s);

cp : จุดรหัส (code point) ที่จะผนวกเข้ากับสตริง
s : สตริงที่เข้ารหัส UTF-16 เพื่อผนวกจุดรหัสไปที่

ตัวอย่างการใช้งาน:

std::u16string u;
append16(0x0448, u); // U+0448 is encoded as a single UTF-16 word
assert(u[0] == 0x0448 && u.length() == 1);

ในกรณีของจุดรหัสที่ไม่ถูกต้องข้อยกเว้น utf8::invalid_code_point จะถูกโยนทิ้ง

utf8 :: next

มีให้ในเวอร์ชัน 1.0 และใหม่กว่า

ด้วยตัววนซ้ำไปยังจุดเริ่มต้นของลำดับ UTF-8 มันจะส่งคืนจุดรหัสและย้ายตัววนซ้ำไปยังตำแหน่งถัดไป

 template < typename octet_iterator> 
utfchar32_t next (octet_iterator& it, octet_iterator end);

octet_iterator : อินพุตตัววนซ้ำ
it : การอ้างอิงถึงตัววนซ้ำที่ชี้ไปที่จุดเริ่มต้นของจุดรหัสที่เข้ารหัส UTF-8 หลังจากฟังก์ชั่นส่งคืนมันจะเพิ่มขึ้นเพื่อชี้ไปที่จุดเริ่มต้นของจุดรหัสถัดไป
end : สิ้นสุดลำดับ UTF-8 ที่จะประมวลผล หาก it ได้รับเท่ากันถึง end ในระหว่างการสกัดรหัสจุด utf8::not_enough_room จะถูกโยนออกไป
ค่าส่งคืน: การแสดง 32 บิตของจุดรหัส UTF-8 ที่ประมวลผล

ตัวอย่างการใช้งาน:

 const char* twochars = "\xe6\x97\xa5\xd1\x88"; // U+65E5 (3 octets) followed by U+0448 (2 octets)
const char* w = twochars;
int cp = next(w, twochars + 6);
assert(cp == 0x65e5);
assert(w == twochars + 3); // w now points at the start of the second code point

ฟังก์ชั่นนี้มักจะใช้ซ้ำผ่านสตริงที่เข้ารหัส UTF-8

ในกรณีที่มีลำดับ UTF-8 ที่ไม่ถูกต้อง utf8::invalid_utf8 จะถูกโยนทิ้ง

utf8 :: next16

มีให้ในเวอร์ชัน 4.0 และใหม่กว่า

ด้วยตัววนซ้ำไปยังจุดเริ่มต้นของลำดับ UTF-16 มันจะส่งคืนจุดรหัสและย้ายตัววนซ้ำไปยังตำแหน่งถัดไป

 template < typename word_iterator>
utfchar32_t next16 (word_iterator& it, word_iterator end);

word_iterator : ตัววนซ้ำอินพุต
it : การอ้างอิงถึงตัววนซ้ำที่ชี้ไปที่จุดเริ่มต้นของจุดรหัสที่เข้ารหัส UTF-16 หลังจากฟังก์ชั่นส่งคืนมันจะเพิ่มขึ้นเพื่อชี้ไปที่จุดเริ่มต้นของจุดรหัสถัดไป
end : สิ้นสุดลำดับ UTF-16 ที่จะประมวลผล หาก it ได้รับเท่ากันถึง end ในระหว่างการสกัดรหัสจุด utf8::not_enough_room จะถูกโยนออกไป
ค่าส่งคืน: การแสดง 32 บิตของจุดรหัส UTF-16 ที่ประมวลผล

ตัวอย่างการใช้งาน:

 const unsigned short u[ 3 ] = { 0x65e5 , 0xd800 , 0xdf46 };
const unsigned short * w = u;
int cp = next16(w, w + 3 );
assert (cp, 0x65e5 );
assert (w, u + 1 );

ฟังก์ชั่นนี้มักจะใช้ซ้ำผ่านสตริงที่เข้ารหัส UTF-16

ในกรณีที่มีลำดับ UTF-16 ที่ไม่ถูกต้อง ข้อยกเว้น utf8::invalid_utf16 จะถูกโยน (throw)

utf8 :: peek_next

มีให้ในเวอร์ชัน 2.1 และใหม่กว่า

ด้วยตัววนซ้ำไปยังจุดเริ่มต้นของลำดับ UTF-8 มันจะส่งคืนจุดรหัสสำหรับลำดับต่อไปนี้โดยไม่ต้องเปลี่ยนค่าของตัววนซ้ำ

 template < typename octet_iterator> 
utfchar32_t peek_next (octet_iterator it, octet_iterator end);

octet_iterator : อินพุตตัววนซ้ำ
it : ตัววนซ้ำชี้ไปที่จุดเริ่มต้นของจุดรหัสที่เข้ารหัส UTF-8
end : สิ้นสุดลำดับ UTF-8 ที่จะประมวลผล หาก it ได้รับเท่ากันถึง end ในระหว่างการสกัดรหัสจุด utf8::not_enough_room จะถูกโยนออกไป
ค่าส่งคืน: การแสดง 32 บิตของจุดรหัส UTF-8 ที่ประมวลผล

ตัวอย่างการใช้งาน:

 const char* twochars = "\xe6\x97\xa5\xd1\x88"; // U+65E5 followed by U+0448
const char* w = twochars;
int cp = peek_next(w, twochars + 6);
assert(cp == 0x65e5);
assert(w == twochars); // peek_next leaves the iterator untouched

ในกรณีที่มีลำดับ UTF-8 ที่ไม่ถูกต้อง utf8::invalid_utf8 จะถูกโยนทิ้ง

utf8 :: prior

มีให้ในเวอร์ชัน 1.02 และใหม่กว่า

จากการอ้างอิงถึงตัววนซ้ำที่ชี้ไปที่ออคเต็ตในลำดับ UTF-8 มันจะลดการวนซ้ำจนกว่าจะถึงจุดเริ่มต้นของจุดรหัส UTF-8 ที่เข้ารหัสก่อนหน้านี้และส่งคืนการแสดง 32 บิตของจุดรหัส

 template < typename octet_iterator> 
utfchar32_t prior (octet_iterator& it, octet_iterator start);

octet_iterator : ตัววนซ้ำแบบสองทิศทาง
it : การอ้างอิงชี้ไปที่ octet ภายในสตริงที่เข้ารหัส UTF-8 หลังจากฟังก์ชั่นส่งคืนมันจะลดลงเพื่อชี้ไปที่จุดเริ่มต้นของจุดรหัสก่อนหน้า
start : ตัววนซ้ำไปยังจุดเริ่มต้นของลำดับที่การค้นหาจุดเริ่มต้นของจุดรหัสจะดำเนินการ มันเป็นมาตรการด้านความปลอดภัยเพื่อป้องกันการผ่านจุดเริ่มต้นของสตริงในการค้นหา Octet ตะกั่ว UTF-8
ค่าส่งคืน: การแสดง 32 บิตของจุดรหัสก่อนหน้า

ตัวอย่างการใช้งาน:

 const char* twochars = "\xe6\x97\xa5\xd1\x88"; // U+65E5 followed by U+0448
const char* w = twochars + 3; // must have the same iterator type as the start argument
int cp = prior(w, twochars);
assert(cp == 0x65e5);
assert(w == twochars);

ฟังก์ชั่นนี้มีสองวัตถุประสงค์: อย่างแรกคือการวนซ้ำย้อนกลับผ่านสตริงที่เข้ารหัส UTF-8 โปรดทราบว่าโดยทั่วไปการวนซ้ำไปข้างหน้าเป็นความคิดที่ดีกว่า เนื่องจาก utf8::next เร็วกว่า วัตถุประสงค์ที่สองคือการหาจุดเริ่มต้นของลำดับ UTF-8 เมื่อเรามีตำแหน่งสุ่มภายในสตริง โปรดทราบว่าในกรณีนั้น utf8::prior อาจตรวจไม่พบลำดับ UTF-8 ที่ไม่ถูกต้องในบางสถานการณ์ ตัวอย่างเช่น หากมี trail octet ส่วนเกิน ฟังก์ชั่นจะข้ามมันไป

โดยทั่วไปแล้ว it ชี้ไปที่จุดเริ่มต้นของจุดรหัสและ start จะชี้ไปที่จุดเริ่มต้นของสตริงเพื่อให้แน่ใจว่าเราจะไม่ย้อนกลับไปไกลเกินไป it จะลดลงจนกระทั่งมันชี้ไปที่ lead utf-8 octet จากนั้นลำดับ UTF-8 เริ่มต้นด้วย octet นั้นจะถูกถอดรหัสเป็นตัวแทน 32 บิตและส่งคืน

ในกรณีที่ไปถึง start ก่อนที่จะพบ lead octet ของ UTF-8 หรือหากลำดับ UTF-8 ที่เริ่มต้นด้วย lead octet นั้นไม่ถูกต้อง ข้อยกเว้น invalid_utf8 จะถูกโยน (throw)

ในกรณี start เท่ากับ it ข้อยกเว้น not_enough_room จะถูกโยนทิ้ง

utf8 :: advance

มีให้ในเวอร์ชัน 1.0 และใหม่กว่า

ความก้าวหน้าของตัววนซ้ำตามจำนวนจุดรหัสที่ระบุภายในลำดับ UTF-8

 template < typename octet_iterator, typename distance_type> 
void advance (octet_iterator& it, distance_type n, octet_iterator end);

octet_iterator : อินพุตตัววนซ้ำ
distance_type : ประเภทอินทิกรัลแปลงสภาพเป็นประเภทความแตกต่างของ octet_iterator
it : การอ้างอิงถึงตัววนซ้ำที่ชี้ไปที่จุดเริ่มต้นของจุดรหัสที่เข้ารหัส UTF-8 หลังจากฟังก์ชั่นส่งคืน ตัววนซ้ำจะถูกเลื่อนไปชี้ที่จุดรหัสลำดับที่ n ถัดไป
n : จำนวนจุดรหัสที่ it จะถูกเลื่อนไป ค่าลบหมายถึงการเลื่อนถอยหลัง
end : ขอบเขตของลำดับ UTF-8 ที่จะประมวลผล หาก n เป็นบวกและ it ไปถึง end ในระหว่างการดึงจุดรหัส ข้อยกเว้น utf8::not_enough_room จะถูกโยน (throw) หาก n เป็นลบและ it ไปถึง end ในขณะที่ it ชี้ไปที่ trail byte ของลำดับ UTF-8 ข้อยกเว้น utf8::invalid_code_point จะถูกโยน (throw)

ตัวอย่างการใช้งาน:

 const char* twochars = "\xe6\x97\xa5\xd1\x88"; // U+65E5 followed by U+0448
const char* w = twochars; // same iterator type as the end argument
advance(w, 2, twochars + 6);
assert(w == twochars + 5);
advance(w, -2, twochars); // a negative count moves backwards; the limit is the sequence start
assert(w == twochars);

ในกรณีของจุดรหัสที่ไม่ถูกต้องข้อยกเว้น utf8::invalid_code_point จะถูกโยนทิ้ง

utf8 :: distance

มีให้ในเวอร์ชัน 1.0 และใหม่กว่า

เมื่อพิจารณาว่าตัววนซ้ำไปยังจุดรหัสที่เข้ารหัส UTF-8 สองจุดในลำดับให้ส่งคืนจำนวนจุดรหัสระหว่างพวกเขา

 template < typename octet_iterator> 
typename std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator first, octet_iterator last);

octet_iterator : อินพุตตัววนซ้ำ
first : ตัววนซ้ำไปยังจุดเริ่มต้นของจุดรหัสที่เข้ารหัส UTF-8
last : ตัววนซ้ำไปยัง "โพสต์-เอนด์" ของจุดรหัสที่เข้ารหัส UTF-8 ล่าสุดในลำดับที่เราพยายามกำหนดความยาว อาจเป็นจุดเริ่มต้นของจุดรหัสใหม่หรือไม่
ส่งคืนค่าระยะห่างระหว่างตัววนซ้ำในจุดรหัส

ตัวอย่างการใช้งาน:

 const char* twochars = "\xe6\x97\xa5\xd1\x88"; // two code points in five octets
size_t dist = utf8::distance(twochars, twochars + 5);
assert(dist == 2);

ฟังก์ชั่นนี้ใช้เพื่อหาความยาว (เป็นจำนวนจุดรหัส) ของสตริงที่เข้ารหัส UTF-8 เหตุผลที่เรียกว่า distance แทนที่จะเป็น length เป็นต้น ส่วนใหญ่เป็นเพราะนักพัฒนาคุ้นเคยว่า length เป็นฟังก์ชัน O(1) ในขณะที่การคำนวณความยาวของสตริง UTF-8 เป็นการดำเนินการเชิงเส้น จึงดูเหมาะสมกว่าที่จะตั้งชื่อตามแบบอัลกอริทึม std::distance

ในกรณีที่มีลำดับ UTF-8 ที่ไม่ถูกต้อง utf8::invalid_utf8 จะถูกโยนทิ้ง หาก last ไม่ได้ชี้ไปที่ลำดับที่ผ่านมาของลำดับ UTF-8, utf8::not_enough_room ถูกโยนออกไป

utf8 :: utf16to8

octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)

มีให้ในเวอร์ชัน 1.0 และใหม่กว่า

แปลงสตริงที่เข้ารหัส UTF-16 เป็น UTF-8

 template < typename u16bit_iterator, typename octet_iterator>
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result);

u16bit_iterator : ตัววนซ้ำอินพุต
octet_iterator : ตัววนซ้ำเอาต์พุต
start : ตัววนซ้ำชี้ไปที่จุดเริ่มต้นของสตริงที่เข้ารหัส UTF-16 เพื่อแปลง
end : ตัววนซ้ำที่ชี้ไปที่การส่งผ่านของสตริงที่เข้ารหัส UTF-16 เพื่อแปลง
result : ตัววนซ้ำเอาต์พุตไปยังสถานที่ในสตริง UTF-8 ที่จะผนวกผลลัพธ์ของการแปลง
ค่าส่งคืน: ตัววนซ้ำที่ชี้ไปที่สถานที่หลังจากสตริง UTF-8 ที่ผนวกเข้าด้วยกัน

ตัวอย่างการใช้งาน:

 unsigned short utf16string[] = { 0x41 , 0x0448 , 0x65e5 , 0xd834 , 0xdd1e };
vector< unsigned char > utf8result;
utf16to8 (utf16string, utf16string + 5 , back_inserter(utf8result));
assert (utf8result.size() == 10);

ในกรณีของลำดับ UTF-16 ที่ไม่ถูกต้องมีการโยน utf8::invalid_utf16

std :: string utf16to8 (const std :: u16string & s)

มีให้ในเวอร์ชัน 3.0 และใหม่กว่า ต้องใช้คอมไพเลอร์ที่สอดคล้องกับ C ++ 11

แปลงสตริงที่เข้ารหัส UTF-16 เป็น UTF-8

std::string utf16to8 ( const std::u16string& s);

s : สตริงที่เข้ารหัส UTF-16 ค่าส่งคืน: สตริงที่เข้ารหัส UTF-8

ตัวอย่างการใช้งาน:

    u16string utf16string = { 0x41 , 0x0448 , 0x65e5 , 0xd834 , 0xdd1e };
    string u = utf16to8(utf16string);
    assert (u.size() == 10);

ในกรณีของลำดับ UTF-16 ที่ไม่ถูกต้องมีการโยน utf8::invalid_utf16

std :: string utf16to8 (std :: u16string_view s)

มีให้ในเวอร์ชัน 3.2 และใหม่กว่า ต้องใช้คอมไพเลอร์ที่สอดคล้องกับ C ++ 17

แปลงสตริงที่เข้ารหัส UTF-16 เป็น UTF-8

std::string utf16to8 (std::u16string_view s);

s : สตริงที่เข้ารหัส UTF-16 ค่าส่งคืน: สตริงที่เข้ารหัส UTF-8

ตัวอย่างการใช้งาน:

    u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
    u16string_view utf16stringview(utf16string); // view over the variable, not the type name
    string u = utf16to8(utf16stringview);        // exercises the u16string_view overload
    assert(u.size() == 10);

ในกรณีของลำดับ UTF-16 ที่ไม่ถูกต้องมีการโยน utf8::invalid_utf16

utf8 :: utf16tou8

std :: u8string utf16tou8 (const std :: u16string & s)

มีให้ในเวอร์ชัน 4.0 และใหม่กว่า ต้องใช้คอมไพเลอร์ที่สอดคล้องกับ C ++ 20

แปลงสตริงที่เข้ารหัส UTF-16 เป็น UTF-8

std::u8string utf16tou8 ( const std::u16string& s);

s : สตริงที่เข้ารหัส UTF-16 ค่าส่งคืน: สตริงที่เข้ารหัส UTF-8

ตัวอย่างการใช้งาน:

    u16string utf16string = { 0x41 , 0x0448 , 0x65e5 , 0xd834 , 0xdd1e };
    u8string u = utf16tou8(utf16string);
    assert (u.size() == 10);

ในกรณีของลำดับ UTF-16 ที่ไม่ถูกต้องมีการโยน utf8::invalid_utf16

std :: u8string utf16tou8 (const std :: u16string_view & s)

มีให้ในเวอร์ชัน 4.0 และใหม่กว่า ต้องใช้คอมไพเลอร์ที่สอดคล้องกับ C ++ 20

แปลงสตริงที่เข้ารหัส UTF-16 เป็น UTF-8

std::u8string utf16tou8 ( const std::u16string_view& s);

s : สตริงที่เข้ารหัส UTF-16 ค่าส่งคืน: สตริงที่เข้ารหัส UTF-8

ตัวอย่างการใช้งาน:

    u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
    u16string_view utf16stringview(utf16string); // view over the variable, not the type name
    u8string u = utf16tou8(utf16stringview);     // exercises the u16string_view overload
    assert(u.size() == 10);

ในกรณีของลำดับ UTF-16 ที่ไม่ถูกต้องมีการโยน utf8::invalid_utf16

utf8 :: utf8to16

u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)

มีให้ในเวอร์ชัน 1.0 และใหม่กว่า

แปลงสตริงที่เข้ารหัส UTF-8 เป็น UTF-16

 template < typename u16bit_iterator, typename octet_iterator>
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result);

octet_iterator : อินพุตตัววนซ้ำ
u16bit_iterator : ตัววนซ้ำเอาต์พุต
start : ตัววนซ้ำชี้ไปที่จุดเริ่มต้นของสตริงที่เข้ารหัส UTF-8 เพื่อแปลง
end : ตัววนซ้ำชี้ไปที่ตำแหน่งถัดจากจุดสิ้นสุด (past-the-end) ของสตริงที่เข้ารหัส UTF-8 เพื่อแปลง
result : ตัววนซ้ำเอาต์พุตไปยังสถานที่ในสตริง UTF-16 ที่จะผนวกผลลัพธ์ของการแปลง
ค่าส่งคืน: ตัววนซ้ำที่ชี้ไปที่สถานที่หลังจากสตริง UTF-16 ที่ผนวกเข้าด้วยกัน

ตัวอย่างการใช้งาน:

 char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; // 9 octets; the last 4 need a surrogate pair
vector<unsigned short> utf16result;
utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
assert(utf16result.size() == 4);
assert(utf16result[2] == 0xd834);
assert(utf16result[3] == 0xdd1e);

ในกรณีที่มีลำดับ UTF-8 ที่ไม่ถูกต้อง utf8::invalid_utf8 จะถูกโยนทิ้ง หาก end ไม่ได้ชี้ไปที่ลำดับที่ผ่านมาของลำดับ UTF-8, utf8::not_enough_room ถูกโยนออกไป

std :: u16string utf8to16 (const std :: string & s)

มีให้ในเวอร์ชัน 3.0 และใหม่กว่า ต้องใช้คอมไพเลอร์ที่สอดคล้องกับ C ++ 11

แปลงสตริงที่เข้ารหัส UTF-8 เป็น UTF-16

std::u16string utf8to16 ( const std::string& s);

s : สตริงที่เข้ารหัส UTF-8 เพื่อแปลง
ค่าส่งคืน: สตริงที่เข้ารหัส UTF-16

ตัวอย่างการใช้งาน:

// The last four octets encode a code point that needs a UTF-16 surrogate pair.
string utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
u16string utf16result = utf8to16(utf8_with_surrogates);
assert(utf16result.length() == 4);
assert(utf16result[2] == 0xd834);
assert(utf16result[3] == 0xdd1e);

ในกรณีที่มีลำดับ UTF-8 ที่ไม่ถูกต้อง utf8::invalid_utf8 จะถูกโยนทิ้ง

std :: u16string utf8to16 (std :: string_view s)

มีให้ในเวอร์ชัน 3.2 และใหม่กว่า ต้องใช้คอมไพเลอร์ที่สอดคล้องกับ C ++ 17

แปลงสตริงที่เข้ารหัส UTF-8 เป็น UTF-16

std::u16string utf8to16 (std::string_view s);

s : สตริงที่เข้ารหัส UTF-8 เพื่อแปลง
ค่าส่งคืน: สตริงที่เข้ารหัส UTF-16

ตัวอย่างการใช้งาน:

// The last four octets encode a code point that needs a UTF-16 surrogate pair.
string_view utf8_with_surrogates = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
u16string utf16result = utf8to16(utf8_with_surrogates);
assert(utf16result.length() == 4);
assert(utf16result[2] == 0xd834);
assert(utf16result[3] == 0xdd1e);

ในกรณีที่มีลำดับ UTF-8 ที่ไม่ถูกต้อง utf8::invalid_utf8 จะถูกโยนทิ้ง

std :: u16string utf8to16 (std :: u8string & s)

มีให้ในเวอร์ชัน 4.0 และใหม่กว่า ต้องใช้คอมไพเลอร์ที่สอดคล้องกับ C ++ 20

แปลงสตริงที่เข้ารหัส UTF-8 เป็น UTF-16

std::u16string utf8to16 (std::u8string& s);

s : สตริงที่เข้ารหัส UTF-8 เพื่อแปลง
ค่าส่งคืน: สตริงที่เข้ารหัส UTF-16

ตัวอย่างการใช้งาน:

// std::u8string holds char8_t, so the literal needs the u8 prefix.
std::u8string utf8_with_surrogates = u8"\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
std::u16string utf16result = utf8to16(utf8_with_surrogates);
assert(utf16result.length() == 4);
assert(utf16result[2] == 0xd834);
assert(utf16result[3] == 0xdd1e);

ในกรณีที่มีลำดับ UTF-8 ที่ไม่ถูกต้อง utf8::invalid_utf8 จะถูกโยนทิ้ง

std :: u16string utf8to16 (std :: u8string_view & s)

มีให้ในเวอร์ชัน 4.0 และใหม่กว่า ต้องใช้คอมไพเลอร์ที่สอดคล้องกับ C ++ 20

แปลงสตริงที่เข้ารหัส UTF-8 เป็น UTF-16

std::u16string utf8to16 (std::u8string_view& s);

s : สตริงที่เข้ารหัส UTF-8 เพื่อแปลง
ค่าส่งคืน: สตริงที่เข้ารหัส UTF-16

ตัวอย่างการใช้งาน:

// std::u8string holds char8_t, so the literal needs the u8 prefix.
std::u8string utf8_with_surrogates = u8"\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
std::u8string_view utf8stringview{utf8_with_surrogates};
std::u16string utf16result = utf8to16(utf8stringview);
assert(utf16result.length() == 4);
assert(utf16result[2] == 0xd834);
assert(utf16result[3] == 0xdd1e);

ในกรณีที่มีลำดับ UTF-8 ที่ไม่ถูกต้อง utf8::invalid_utf8 จะถูกโยนทิ้ง

utf8 :: utf32to8

octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)

มีให้ในเวอร์ชัน 1.0 และใหม่กว่า

แปลงสตริงที่เข้ารหัส UTF-32 เป็น UTF-8

 template < typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result);

octet_iterator : ตัววนซ้ำเอาต์พุต
u32bit_iterator : ตัววนซ้ำอินพุต
start : ตัววนซ้ำชี้ไปที่จุดเริ่มต้นของสตริงที่เข้ารหัส UTF-32 เพื่อแปลง
end : ตัววนซ้ำชี้ไปที่การส่งผ่านของสตริงที่เข้ารหัส UTF-32 เพื่อแปลง
result : ตัววนซ้ำเอาต์พุตไปยังสถานที่ในสตริง UTF-8 ที่จะผนวกผลลัพธ์ของการแปลง
ค่าส่งคืน: ตัววนซ้ำที่ชี้ไปที่สถานที่หลังจากสตริง UTF-8 ที่ผนวกเข้าด้วยกัน

ตัวอย่างการใช้งาน:

 int utf32string[] = { 0x448 , 0x65E5 , 0x10346 , 0 };
vector< unsigned char > utf8result;
utf32to8 (utf32string, utf32string + 3 , back_inserter(utf8result));
assert (utf8result.size() == 9);

ในกรณีของสตริง UTF-32 ที่ไม่ถูกต้อง A utf8::invalid_code_point จะถูกโยนออกไป

std :: string utf32to8 (const std :: u32string & s)

มีให้ในเวอร์ชัน 3.0 และใหม่กว่า ต้องใช้คอมไพเลอร์ที่สอดคล้องกับ C ++ 11

แปลงสตริงที่เข้ารหัส UTF-32 เป็น UTF-8

std::string utf32to8 ( const std::u32string& s);

s : สตริงที่เข้ารหัส UTF-32
ค่าส่งคืน: สตริงที่เข้ารหัส UTF-8

ตัวอย่างการใช้งาน:

u32string utf32string = { 0x448 , 0x65E5 , 0x10346 };
string utf8result = utf32to8(utf32string);
assert (utf8result.size() == 9);

ในกรณีของสตริง UTF-32 ที่ไม่ถูกต้อง A utf8::invalid_code_point จะถูกโยนออกไป

std :: u8string utf32tou8 (const std :: u32string & s)

มีให้ในเวอร์ชัน 4.0 และใหม่กว่า ต้องใช้คอมไพเลอร์ที่สอดคล้องกับ C ++ 20

แปลงสตริงที่เข้ารหัส UTF-32 เป็น UTF-8

// Converts a UTF-32 encoded string to a UTF-8 encoded std::u8string.
// Named utf32tou8 because it cannot overload the std::string-returning utf32to8 on return type alone.
std::u8string utf32tou8(const std::u32string& s);

s : สตริงที่เข้ารหัส UTF-32
ค่าส่งคืน: สตริงที่เข้ารหัส UTF-8

ตัวอย่างการใช้งาน:

u32string utf32string = {0x448, 0x65E5, 0x10346};
u8string utf8result = utf32tou8(utf32string); // u8string-returning variant
assert(utf8result.size() == 9);

ในกรณีของสตริง UTF-32 ที่ไม่ถูกต้อง A utf8::invalid_code_point จะถูกโยนออกไป

std :: u8string utf32tou8 (const std :: u32string_view & s)

มีให้ในเวอร์ชัน 4.0 และใหม่กว่า ต้องใช้คอมไพเลอร์ที่สอดคล้องกับ C ++ 20

แปลงสตริงที่เข้ารหัส UTF-32 เป็น UTF-8

// Converts a UTF-32 encoded string view to a UTF-8 encoded std::u8string.
std::u8string utf32tou8(const std::u32string_view& s);

s : สตริงที่เข้ารหัส UTF-32
ค่าส่งคืน: สตริงที่เข้ารหัส UTF-8

ตัวอย่างการใช้งาน:

u32string utf32string = {0x448, 0x65E5, 0x10346};
u32string_view utf32stringview(utf32string);
u8string utf8result = utf32tou8(utf32stringview); // u8string-returning variant
assert(utf8result.size() == 9);

ในกรณีของสตริง UTF-32 ที่ไม่ถูกต้อง A utf8::invalid_code_point จะถูกโยนออกไป

std :: string utf32to8 (const std :: u32string & s)

มีให้ในเวอร์ชัน 3.0 และใหม่กว่า ต้องใช้คอมไพเลอร์ที่สอดคล้องกับ C ++ 11

แปลงสตริงที่เข้ารหัส UTF-32 เป็น UTF-8

std::string utf32to8 ( const std::u32string& s);

s : สตริงที่เข้ารหัส UTF-32
ค่าส่งคืน: สตริงที่เข้ารหัส UTF-8

ตัวอย่างการใช้งาน:

u32string utf32string = { 0x448 , 0x65E5 , 0x10346 };
string utf8result = utf32to8(utf32string);
assert (utf8result.size() == 9);

ในกรณีของสตริง UTF-32 ที่ไม่ถูกต้อง A utf8::invalid_code_point จะถูกโยนออกไป

std :: string utf32to8 (std :: u32string_view s)

มีให้ในเวอร์ชัน 3.2 และใหม่กว่า ต้องใช้คอมไพเลอร์ที่สอดคล้องกับ C ++ 17

แปลงสตริงที่เข้ารหัส UTF-32 เป็น UTF-8

std::string utf32to8 (std::u32string_view s);

s : สตริงที่เข้ารหัส UTF-32
ค่าส่งคืน: สตริงที่เข้ารหัส UTF-8

ตัวอย่างการใช้งาน:

u32string utf32string = { 0x448 , 0x65E5 , 0x10346 };
u32string_view utf32stringview (utf32string);
string utf8result = utf32to8(utf32stringview);
assert (utf8result.size() == 9);

ในกรณีของสตริง UTF-32 ที่ไม่ถูกต้อง A utf8::invalid_code_point จะถูกโยนออกไป

utf8 :: utf8to32

u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)

มีให้ในเวอร์ชัน 1.0 และใหม่กว่า

แปลงสตริงที่เข้ารหัส UTF-8 เป็น UTF-32

 template < typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result);

octet_iterator : อินพุตตัววนซ้ำ
u32bit_iterator : ตัววนซ้ำเอาต์พุต
start : ตัววนซ้ำชี้ไปที่จุดเริ่มต้นของสตริงที่เข้ารหัส UTF-8 เพื่อแปลง
end : ตัววนซ้ำชี้ไปที่การส่งผ่านของสตริงที่เข้ารหัส UTF-8 เพื่อแปลง
result : ตัววนซ้ำเอาต์พุตไปยังสถานที่ในสตริง UTF-32 ที่จะผนวกผลลัพธ์ของการแปลง
ค่าส่งคืน: ตัววนซ้ำที่ชี้ไปยังสถานที่หลังจากสตริง UTF-32 ที่ผนวกเข้าด้วยกัน

ตัวอย่างการใช้งาน:

 const char* twochars = "\xe6\x97\xa5\xd1\x88"; // two code points in five octets
vector<int> utf32result;
utf8to32(twochars, twochars + 5, back_inserter(utf32result));
assert(utf32result.size() == 2);

std :: u32string utf8to32 (const std :: u8string & s)

มีให้ในเวอร์ชัน 4.0 และใหม่กว่า ต้องใช้คอมไพเลอร์ที่สอดคล้องกับ C ++ 20

แปลงสตริงที่เข้ารหัส UTF-8 เป็น UTF-32

std::u32string utf8to32 ( const std::u8string& s);

s : สตริงที่เข้ารหัส UTF-8 ค่าส่งคืน: สตริงที่เข้ารหัส UTF-32

ตัวอย่างการใช้งาน:

 const std::u8string twochars = u8"\xe6\x97\xa5\xd1\x88"; // a u8string object, not a pointer to one
u32string utf32result = utf8to32(twochars);
assert(utf32result.size() == 2);

ในกรณีที่มีลำดับ UTF-8 ที่ไม่ถูกต้อง utf8::invalid_utf8 จะถูกโยนทิ้ง

std :: u32string utf8to32 (const std :: u8string_view & s)

มีให้ในเวอร์ชัน 4.0 และใหม่กว่า ต้องใช้คอมไพเลอร์ที่สอดคล้องกับ C ++ 20

แปลงสตริงที่เข้ารหัส UTF-8 เป็น UTF-32

std::u32string utf8to32 ( const std::u8string_view& s);

s : สตริงที่เข้ารหัส UTF-8 ค่าส่งคืน: สตริงที่เข้ารหัส UTF-32

ตัวอย่างการใช้งาน:

 const u8string twochars = u8"\xe6\x97\xa5\xd1\x88"; // a u8string object, not a pointer to one
const u8string_view stringview{twochars};
u32string utf32result = utf8to32(stringview);
assert(utf32result.size() == 2);

ในกรณีที่มีลำดับ UTF-8 ที่ไม่ถูกต้อง จะมีการโยนข้อยกเว้น utf8::invalid_utf8

std :: u32string utf8to32 (const std :: string & s)

มีให้ในเวอร์ชัน 3.0 และใหม่กว่า ต้องใช้คอมไพเลอร์ที่สอดคล้องกับ C ++ 11

แปลงสตริงที่เข้ารหัส UTF-8 เป็น UTF-32

std::u32string utf8to32 ( const std::string& s);

s : สตริงที่เข้ารหัส UTF-8 ค่าส่งคืน: สตริงที่เข้ารหัส UTF-32

ตัวอย่างการใช้งาน:

 const char * twochars = "\xe6\x97\xa5\xd1\x88";
u32string utf32result = utf8to32(twochars);
assert (utf32result.size() == 2);

ในกรณีที่มีลำดับ UTF-8 ที่ไม่ถูกต้อง จะมีการโยนข้อยกเว้น utf8::invalid_utf8

std :: u32string utf8to32 (std :: string_view s)

มีให้ในเวอร์ชัน 3.2 และใหม่กว่า ต้องใช้คอมไพเลอร์ที่สอดคล้องกับ C ++ 17

แปลงสตริงที่เข้ารหัส UTF-8 เป็น UTF-32

std::u32string utf8to32 (std::string_view s);

s : สตริงที่เข้ารหัส UTF-8 ค่าส่งคืน: สตริงที่เข้ารหัส UTF-32

ตัวอย่างการใช้งาน:

string_view twochars = "\xe6\x97\xa5\xd1\x88";
u32string utf32result = utf8to32(twochars);
assert (utf32result.size() == 2);

ในกรณีที่มีลำดับ UTF-8 ที่ไม่ถูกต้อง จะมีการโยนข้อยกเว้น utf8::invalid_utf8

utf8 :: find_invalid

octet_iterator find_invalid (octet_iterator start, octet_iterator end)

มีให้ในเวอร์ชัน 1.0 และใหม่กว่า

ตรวจพบลำดับที่ไม่ถูกต้องภายในสตริง UTF-8

 template < typename octet_iterator> 
octet_iterator find_invalid (octet_iterator start, octet_iterator end);

octet_iterator : อินพุตตัววนซ้ำ
start : ตัววนซ้ำชี้ไปที่จุดเริ่มต้นของสตริง UTF-8 เพื่อทดสอบความถูกต้อง
end : ตัววนซ้ำที่ชี้ไปยังตำแหน่งถัดจากจุดสิ้นสุด (past-the-end) ของสตริง UTF-8 ที่จะทดสอบความถูกต้อง
ค่าส่งคืน: ตัววนซ้ำที่ชี้ไปที่ octet ที่ไม่ถูกต้องตัวแรกในสตริง UTF-8 ในกรณีที่ไม่พบ ค่าที่ส่งคืนจะเท่ากับ end

ตัวอย่างการใช้งาน:

 char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
char * invalid = find_invalid(utf_invalid, utf_invalid + 6 );
assert (invalid == utf_invalid + 5 );

ฟังก์ชั่นนี้มักจะใช้เพื่อให้แน่ใจว่าสตริง UTF-8 นั้นถูกต้องก่อนที่จะประมวลผลด้วยฟังก์ชั่นอื่น ๆ โดยเฉพาะอย่างยิ่ง ควรเรียกใช้ฟังก์ชั่นนี้ก่อนที่จะดำเนินการใด ๆ แบบ unchecked (ที่ไม่มีการตรวจสอบ) กับสตริงนั้น

const char* find_invalid (const char* str)

มีให้ในเวอร์ชัน 4.0 และใหม่กว่า

ตรวจหาลำดับที่ไม่ถูกต้องภายในสตริง UTF-8 แบบ C (C-style string)

 const char * find_invalid ( const char * str);

str : สตริงที่เข้ารหัส UTF-8 ค่าส่งคืน: ตัวชี้ไปยัง octet ที่ไม่ถูกต้องตัวแรกในสตริง UTF-8 ในกรณีที่ไม่พบ ตัวชี้จะชี้ไปยังอักขระศูนย์ปิดท้ายสตริง (null terminator)

ตัวอย่างการใช้งาน:

 const char * utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
const char * invalid = find_invalid(utf_invalid);
assert ((invalid - utf_invalid) == 5);

std :: size_t find_invalid (const std :: string & s)

มีให้ในเวอร์ชัน 3.0 และใหม่กว่า ก่อนเวอร์ชัน 4.0 ต้องใช้คอมไพเลอร์ C ++ 11; ข้อกำหนดนี้ถูกยกเลิกตั้งแต่เวอร์ชัน 4.0

ตรวจพบลำดับที่ไม่ถูกต้องภายในสตริง UTF-8

std:: size_t find_invalid ( const std::string& s);

s : สตริงที่เข้ารหัส UTF-8 ค่าส่งคืน: ดัชนีของ octet ที่ไม่ถูกต้องแรกในสตริง UTF-8 ในกรณีที่ไม่พบค่าเท่ากับ std::string::npos

ตัวอย่างการใช้งาน:

string utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
auto invalid = find_invalid(utf_invalid);
assert (invalid == 5 );

std :: size_t find_invalid (std :: string_view s)

มีให้ในเวอร์ชัน 3.2 และใหม่กว่า ต้องใช้คอมไพเลอร์ที่สอดคล้องกับ C ++ 17

ตรวจพบลำดับที่ไม่ถูกต้องภายในสตริง UTF-8

std:: size_t find_invalid (std::string_view s);

s : สตริงที่เข้ารหัส UTF-8 ค่าส่งคืน: ดัชนีของ octet ที่ไม่ถูกต้องแรกในสตริง UTF-8 ในกรณีที่ไม่พบค่าเท่ากับ std::string_view::npos

ตัวอย่างการใช้งาน:

string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
auto invalid = find_invalid(utf_invalid);
assert (invalid == 5 );

utf8 :: is_valid

bool is_valid (octet_iterator start, octet_iterator end)

มีให้ในเวอร์ชัน 1.0 และใหม่กว่า

ตรวจสอบว่าลำดับของ octets เป็นสตริง UTF-8 ที่ถูกต้องหรือไม่

 template < typename octet_iterator> 
bool is_valid (octet_iterator start, octet_iterator end);

octet_iterator : อินพุตตัววนซ้ำ
start : ตัววนซ้ำชี้ไปที่จุดเริ่มต้นของสตริง UTF-8 เพื่อทดสอบความถูกต้อง
end : ตัววนซ้ำที่ชี้ไปยังตำแหน่งถัดจากจุดสิ้นสุด (past-the-end) ของสตริง UTF-8 ที่จะทดสอบความถูกต้อง
ค่าส่งคืน: true ถ้าลำดับเป็นสตริง UTF-8 ที่ถูกต้อง; false ถ้าไม่

ตัวอย่างการใช้งาน:

 char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
bool bvalid = is_valid(utf_invalid, utf_invalid + 6 );
assert (bvalid == false );

is_valid เป็นรูปแบบย่อของ find_invalid(start, end) == end; - คุณอาจต้องการใช้มันเพื่อให้แน่ใจว่าลำดับไบต์เป็นสตริง UTF-8 ที่ถูกต้อง โดยไม่จำเป็นต้องรู้ว่าผิดพลาดที่ตำแหน่งใดหากไม่ถูกต้อง

bool is_valid (const char* str)

มีให้ในเวอร์ชัน 4.0 และใหม่กว่า

ตรวจสอบว่าสตริงสไตล์ C มีข้อความที่เข้ารหัส UTF-8 ที่ถูกต้องหรือไม่

 bool is_valid ( const char * str);

str : สตริงที่เข้ารหัส UTF-8
ค่าส่งคืน: true ถ้าสตริงมีข้อความที่เข้ารหัส UTF-8 ที่ถูกต้อง; false ถ้าไม่

ตัวอย่างการใช้งาน:

 char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
bool bvalid = is_valid(utf_invalid);
assert (bvalid == false );

คุณอาจต้องการใช้ is_valid เพื่อให้แน่ใจว่าสตริงมีข้อความ UTF-8 ที่ถูกต้องโดยไม่จำเป็นต้องรู้ว่ามันล้มเหลวที่ไหนหากไม่ถูกต้อง

bool is_valid (const std::string& s)

ตรวจสอบว่าวัตถุสตริงมีข้อความที่เข้ารหัส UTF-8 ที่ถูกต้องหรือไม่

 bool is_valid ( const std::string& s);

s : สตริงที่เข้ารหัส UTF-8
ค่าส่งคืน: true ถ้าสตริงมีข้อความที่เข้ารหัส UTF-8 ที่ถูกต้อง; false ถ้าไม่

ตัวอย่างการใช้งาน:

 char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
bool bvalid = is_valid(utf_invalid);
assert (bvalid == false );

bool is_valid (std :: string_view s)

มีให้ในเวอร์ชัน 3.2 และใหม่กว่า ต้องใช้คอมไพเลอร์ที่สอดคล้องกับ C ++ 17

ตรวจสอบว่าวัตถุสตริงมีข้อความที่เข้ารหัส UTF-8 ที่ถูกต้องหรือไม่

 bool is_valid (std::string_view s);

ตัวอย่างการใช้งาน:

string_view utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa";
bool bvalid = is_valid(utf_invalid);
assert (bvalid == false );

utf8 :: replace_invalid

output_iterator replace_invalid (octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement)

มีให้ในเวอร์ชัน 2.0 และใหม่กว่า

แทนที่ลำดับ UTF-8 ที่ไม่ถูกต้องทั้งหมดภายในสตริงด้วยเครื่องหมายทดแทน

 template < typename octet_iterator, typename output_iterator>
output_iterator replace_invalid (octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement);
template < typename octet_iterator, typename output_iterator>
output_iterator replace_invalid (octet_iterator start, octet_iterator end, output_iterator out);

octet_iterator : อินพุตตัววนซ้ำ
output_iterator : ตัววนซ้ำเอาต์พุต
start : ตัววนซ้ำที่ชี้ไปที่จุดเริ่มต้นของสตริง UTF-8 เพื่อค้นหาลำดับ UTF-8 ที่ไม่ถูกต้อง
end : ตัววนซ้ำที่ชี้ไปยังตำแหน่งถัดจากจุดสิ้นสุด (past-the-end) ของสตริง UTF-8 ที่จะค้นหาลำดับ UTF-8 ที่ไม่ถูกต้อง
out : ตัววนซ้ำเอาต์พุตไปยังช่วงที่เก็บผลลัพธ์ของการแทนที่
replacement : A Unicode code point for the replacement marker. The version without this parameter assumes the value 0xfffd
Return value: An iterator pointing to the place after the UTF-8 string with replaced invalid sequences.

Example of use:

 char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
vector< char > replace_invalid_result;
replace_invalid (invalid_sequence, invalid_sequence + sizeof (invalid_sequence), back_inserter(replace_invalid_result), '?');
bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
assert (bvalid);
char * fixed_invalid_sequence = "a????z";
assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence));

replace_invalid does not perform in-place replacement of invalid sequences. Rather, it produces a copy of the original string with the invalid sequences replaced with a replacement marker. Therefore, out must not be in the [start, end] range.

std::string replace_invalid(const std::string& s, utfchar32_t replacement)

Available in version 3.0 and later. Prior to 4.0 it required a C++ 11 compiler; the requirement is lifted with 4.0

Replaces all invalid UTF-8 sequences within a string with a replacement marker.

std::string replace_invalid ( const std::string& s, utfchar32_t replacement);
std::string replace_invalid ( const std::string& s);

s : a UTF-8 encoded string.
replacement : A Unicode code point for the replacement marker. The version without this parameter assumes the value 0xfffd
Return value: A UTF-8 encoded string with replaced invalid sequences.

Example of use:

string invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
string replace_invalid_result = replace_invalid(invalid_sequence, '?');
bvalid = is_valid(replace_invalid_result);
assert (bvalid);
const string fixed_invalid_sequence = "a????z";
assert (fixed_invalid_sequence == replace_invalid_result);

std::string replace_invalid(std::string_view s, char32_t replacement)

Available in version 3.2 and later. Requires a C++ 17 compliant compiler.

Replaces all invalid UTF-8 sequences within a string with a replacement marker.

std::string replace_invalid (std::string_view s, char32_t replacement);
std::string replace_invalid (std::string_view s);

Example of use:

string_view invalid_sequence = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
string replace_invalid_result = replace_invalid(invalid_sequence, '?');
bool bvalid = is_valid(replace_invalid_result);
assert (bvalid);
const string fixed_invalid_sequence = "a????z";
assert (fixed_invalid_sequence == replace_invalid_result);

utf8::starts_with_bom

bool starts_with_bom (octet_iterator it, octet_iterator end)

Available in version 2.3 and later.

Checks whether an octet sequence starts with a UTF-8 byte order mark (BOM)

 template < typename octet_iterator> 
bool starts_with_bom (octet_iterator it, octet_iterator end);

octet_iterator : an input iterator.
it : beginning of the octet sequence to check
end : pass-end of the sequence to check
Return value: true if the sequence starts with a UTF-8 byte order mark; false if not.

Example of use:

 unsigned char byte_order_mark[] = { 0xef , 0xbb , 0xbf };
bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof (byte_order_mark));
assert (bbom == true );

The typical use of this function is to check the first three bytes of a file. If they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 encoded text.

bool starts_with_bom(const std::string& s)

Available in version 3.0 and later. Prior to 4.0 it required a C++ 11 compiler; the requirement is lifted with 4.0

Checks whether a string starts with a UTF-8 byte order mark (BOM)

 bool starts_with_bom ( const std::string& s);

s : a UTF-8 encoded string. Return value: true if the string starts with a UTF-8 byte order mark; false if not.

Example of use:

string byte_order_mark = { char ( 0xef ), char ( 0xbb ), char ( 0xbf )};
bool bbom = starts_with_bom(byte_order_mark);
assert (bbom == true );
string threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
bool no_bbom = starts_with_bom(threechars);
assert (no_bbom == false );

The typical use of this function is to check the first three bytes of a file. If they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 encoded text.

bool starts_with_bom(std::string_view s)

Available in version 3.2 and later. Requires a C++ 17 compliant compiler.

Checks whether a string starts with a UTF-8 byte order mark (BOM)

 bool starts_with_bom (std::string_view s);

s : a UTF-8 encoded string. Return value: true if the string starts with a UTF-8 byte order mark; false if not.

Example of use:

string byte_order_mark = { char ( 0xef ), char ( 0xbb ), char ( 0xbf )};
string_view byte_order_mark_view (byte_order_mark);
bool bbom = starts_with_bom(byte_order_mark_view);
assert (bbom);
string_view threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
bool no_bbom = starts_with_bom(threechars);
assert (!no_bbom);

The typical use of this function is to check the first three bytes of a file. If they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 encoded text.

Types From utf8 Namespace

utf8::exception

Available in version 2.3 and later.

Base class for the exceptions thrown by UTF CPP library functions.

 class exception : public std :: exception {};

Example of use:

 try {
  code_that_uses_utf_cpp_library ();
}
catch ( const utf8:: exception & utfcpp_ex) {
  cerr << utfcpp_ex. what ();
}

utf8::invalid_code_point

Available in version 1.0 and later.

Thrown by UTF8 CPP functions such as advance and next if a UTF-8 sequence represents an invalid code point.

 class invalid_code_point : public exception {
public: 
    utfchar32_t code_point () const ;
};

Member function code_point() can be used to determine the invalid code point that caused the exception to be thrown.

utf8::invalid_utf8

Available in version 1.0 and later.

Thrown by UTF8 CPP functions such as next and prior if an invalid UTF-8 sequence is detected during decoding.

 class invalid_utf8 : public exception {
public: 
    utfchar8_t utf8_octet () const ;
};

Member function utf8_octet() can be used to determine the beginning of the byte sequence that caused the exception to be thrown.

utf8::invalid_utf16

Available in version 1.0 and later.

Thrown by UTF8 CPP function utf16to8 if an invalid UTF-16 sequence is detected during decoding.

 class invalid_utf16 : public exception {
public: 
    utfchar16_t utf16_word () const ;
};

Member function utf16_word() can be used to determine the UTF-16 code unit that caused the exception to be thrown.

utf8::not_enough_room

Available in version 1.0 and later.

Thrown by UTF8 CPP functions such as next if the end of the decoded UTF-8 sequence was reached before the code point was decoded.

 class not_enough_room : public exception {};

utf8::iterator

Available in version 2.0 and later.

Adapts the underlying octet iterator to iterate over the sequence of code points, rather than raw octets.

 template < typename octet_iterator>
class iterator ;

Member functions

iterator(); the default constructor; the underlying octet_iterator is constructed with its default constructor.

explicit iterator (const octet_iterator& octet_it, const octet_iterator& range_start, const octet_iterator& range_end); a constructor that initializes the underlying octet_iterator with octet_it and sets the range in which the iterator is considered valid.

octet_iterator base () const; returns the underlying octet_iterator.

utfchar32_t operator * () const; decodes the utf-8 sequence the underlying octet_iterator is pointing to and returns the code point.

bool operator == (const iterator& rhs) const; returns true if the two underlying iterators are equal.

bool operator != (const iterator& rhs) const; returns true if the two underlying iterators are not equal.

iterator& operator ++ (); the prefix increment - moves the iterator to the next UTF-8 encoded code point.

iterator operator ++ (int); the postfix increment - moves the iterator to the next UTF-8 encoded code point and returns the current one.

iterator& operator -- (); the prefix decrement - moves the iterator to the previous UTF-8 encoded code point.

iterator operator -- (int); the postfix decrement - moves the iterator to the previous UTF-8 encoded code point and returns the current one.

Example of use:

 char * threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
utf8::iterator< char *> it (threechars, threechars, threechars + 9 );
utf8::iterator< char *> it2 = it;
assert (it2 == it);
assert (*it == 0x10346 );
assert (*(++it) == 0x65e5);
assert ((*it++) == 0x65e5);
assert (*it == 0x0448 );
assert (it != it2);
utf8::iterator< char *> endit (threechars + 9 , threechars, threechars + 9 );  
assert (++it == endit);
assert (*(--it) == 0x0448);
assert ((*it--) == 0x0448);
assert (*it == 0x65e5 );
assert (--it == utf8::iterator< char *>(threechars, threechars, threechars + 9 ));
assert (*it == 0x10346 );

The purpose of utf8::iterator adapter is to enable easy iteration as well as the use of STL algorithms with UTF-8 encoded strings. Increment and decrement operators are implemented in terms of utf8::next() and utf8::prior() functions.

Note that utf8::iterator adapter is a checked iterator. It operates on the range specified in the constructor; any attempt to go out of that range will result in an exception. Even the comparison operators require both iterator objects to be constructed against the same range - otherwise an exception is thrown. Typically, the range will be determined by sequence container functions begin and end, i.e.:

std::string s = " example " ;
utf8::iterator i (s.begin(), s.begin(), s.end());

Functions From utf8::unchecked Namespace

utf8::unchecked::append

Available in version 1.0 and later.

Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence to a UTF-8 string.

 template < typename octet_iterator>
octet_iterator append ( utfchar32_t cp, octet_iterator result);

cp : A 32 bit integer representing a code point to append to the sequence.
result : An output iterator to the place in the sequence where to append the code point.
Return value: An iterator pointing to the place after the newly appended sequence.

Example of use:

 unsigned char u[ 5 ] = { 0 , 0 , 0 , 0 , 0 };
unsigned char * end = unchecked::append( 0x0448 , u);
assert (u[ 0 ] == 0xd1 && u[ 1 ] == 0x88 && u[ 2 ] == 0 && u[ 3 ] == 0 && u[ 4 ] == 0 );

This is a faster but less safe version of utf8::append . It does not check for validity of the supplied code point, and may produce an invalid UTF-8 sequence.

utf8::unchecked::append16

Available in version 4.0 and later.

Encodes a 32 bit code point as a UTF-16 sequence of words and appends the sequence to a UTF-16 string.

 template < typename word_iterator>
word_iterator append16 ( utfchar32_t cp, word_iterator result);

Example of use:

 unsigned short u[ 5 ] = { 0 , 0 };
utf8::unchecked::append16 ( 0x0448 , u);
assert (u[ 0 ] == 0x0448 );
assert (u[ 1 ] == 0x0000 );

This is a faster but less safe version of utf8::append16 . It does not check for validity of the supplied code point, and may produce an invalid UTF-16 sequence.

utf8::unchecked::next

Available in version 1.0 and later.

Given the iterator to the beginning of a UTF-8 sequence, it returns the code point and moves the iterator to the next position.

 template < typename octet_iterator>
utfchar32_t next (octet_iterator& it);

it : a reference to an iterator pointing to the beginning of an UTF-8 encoded code point. After the function returns, it is incremented to point to the beginning of the next code point.
Return value: the 32 bit representation of the processed UTF-8 code point.

Example of use:

 char * twochars = "\xe6\x97\xa5\xd1\x88";
char * w = twochars;
int cp = unchecked::next(w);
assert (cp == 0x65e5 );
assert (w == twochars + 3 );

This is a faster but less safe version of utf8::next . It does not check for validity of the supplied UTF-8 sequence.

utf8::unchecked::next16

Available in version 4.0 and later.

Given the iterator to the beginning of the UTF-16 sequence, it returns the code point and moves the iterator to the next position.

 template < typename word_iterator>
utfchar32_t next16 (word_iterator& it);

word_iterator : an input iterator.
it : a reference to an iterator pointing to the beginning of an UTF-16 encoded code point. After the function returns, it is incremented to point to the beginning of the next code point.

Return value: the 32 bit representation of the processed UTF-16 code point.

Example of use:

 const unsigned short u[ 3 ] = { 0x65e5 , 0xd800 , 0xdf46 };
const unsigned short * w = u;
int cp = unchecked::next16(w);
assert (cp == 0x65e5 );
assert (w == u + 1 );

This function is typically used to iterate through a UTF-16 encoded string.

This is a faster but less safe version of utf8::next16 . It does not check for validity of the supplied UTF-16 sequence.

utf8::unchecked::peek_next

Available in version 2.1 and later.

Given the iterator to the beginning of a UTF-8 sequence, it returns the code point.

 template < typename octet_iterator>
utfchar32_t peek_next (octet_iterator it);

it : an iterator pointing to the beginning of an UTF-8 encoded code point.
Return value: the 32 bit representation of the processed UTF-8 code point.

Example of use:

 char * twochars = "\xe6\x97\xa5\xd1\x88";
char * w = twochars;
int cp = unchecked::peek_next(w);
assert (cp == 0x65e5 );
assert (w == twochars);

This is a faster but less safe version of utf8::peek_next . It does not check for validity of the supplied UTF-8 sequence.

utf8::unchecked::prior

Available in version 1.02 and later.

Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it decreases the iterator until it hits the beginning of the previous UTF-8 encoded code point and returns the 32 bits representation of the code point.

 template < typename octet_iterator>
utfchar32_t prior (octet_iterator& it);

it : a reference pointing to an octet within a UTF-8 encoded string. After the function returns, it is decremented to point to the beginning of the previous code point.
Return value: the 32 bit representation of the previous code point.

Example of use:

 char * twochars = "\xe6\x97\xa5\xd1\x88";
char * w = twochars + 3 ;
int cp = unchecked::prior (w);
assert (cp == 0x65e5 );
assert (w == twochars);

This is a faster but less safe version of utf8::prior . It does not check for validity of the supplied UTF-8 sequence and offers no boundary checking.

utf8::unchecked::advance

Available in version 1.0 and later.

Advances an iterator by the specified number of code points within an UTF-8 sequence.

 template < typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n);

it : a reference to an iterator pointing to the beginning of an UTF-8 encoded code point. After the function returns, it is incremented to point to the nth following code point. n : number of code points it should be advanced. A negative value means decrement.

Example of use:

 char * twochars = "\xe6\x97\xa5\xd1\x88";
char * w = twochars;
unchecked::advance (w, 2 );
assert (w == twochars + 5 );

This is a faster but less safe version of utf8::advance . It does not check for validity of the supplied UTF-8 sequence and offers no boundary checking.

utf8::unchecked::distance

Available in version 1.0 and later.

Given the iterators to two UTF-8 encoded code points in a sequence, returns the number of code points between them.

 template < typename octet_iterator>
typename std::iterator_traits<octet_iterator>::difference_type distance (octet_iterator first, octet_iterator last);

first : an iterator to a beginning of a UTF-8 encoded code point.
last : an iterator to a "post-end" of the last UTF-8 encoded code point in the sequence we are trying to determine the length. It can be the beginning of a new code point, or not.
Return value: the distance between the iterators, in code points.

Example of use:

 char * twochars = "\xe6\x97\xa5\xd1\x88";
size_t dist = utf8::unchecked::distance(twochars, twochars + 5 );
assert (dist == 2 );

This is a faster but less safe version of utf8::distance . It does not check for validity of the supplied UTF-8 sequence.

utf8::unchecked::utf16to8

Available in version 1.0 and later.

Converts a UTF-16 encoded string to UTF-8.

 template < typename u16bit_iterator, typename octet_iterator>
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result);

start : an iterator pointing to the beginning of the UTF-16 encoded string to convert.
end : an iterator pointing to pass-the-end of the UTF-16 encoded string to convert.
result : an output iterator to the place in the UTF-8 string where to append the result of conversion.
Return value: An iterator pointing to the place after the appended UTF-8 string.

Example of use:

 unsigned short utf16string[] = { 0x41 , 0x0448 , 0x65e5 , 0xd834 , 0xdd1e };
vector< unsigned char > utf8result;
unchecked::utf16to8 (utf16string, utf16string + 5 , back_inserter(utf8result));
assert (utf8result.size() == 10);

This is a faster but less safe version of utf8::utf16to8 . It does not check for validity of the supplied UTF-16 sequence.

utf8::unchecked::utf8to16

Available in version 1.0 and later.

Converts an UTF-8 encoded string to UTF-16

 template < typename u16bit_iterator, typename octet_iterator>
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result);

start : an iterator pointing to the beginning of the UTF-8 encoded string to convert. end : an iterator pointing to pass-the-end of the UTF-8 encoded string to convert.
result : an output iterator to the place in the UTF-16 string where to append the result of conversion.
Return value: An iterator pointing to the place after the appended UTF-16 string.

Example of use:

 char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
vector < unsigned short > utf16result;
unchecked::utf8to16 (utf8_with_surrogates, utf8_with_surrogates + 9 , back_inserter(utf16result));
assert (utf16result.size() == 4);
assert (utf16result[ 2 ] == 0xd834 );
assert (utf16result[ 3 ] == 0xdd1e );

This is a faster but less safe version of utf8::utf8to16 . It does not check for validity of the supplied UTF-8 sequence.

utf8::unchecked::utf32to8

Available in version 1.0 and later.

Converts a UTF-32 encoded string to UTF-8.

 template < typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result);

start : an iterator pointing to the beginning of the UTF-32 encoded string to convert.
end : an iterator pointing to pass-the-end of the UTF-32 encoded string to convert.
result : an output iterator to the place in the UTF-8 string where to append the result of conversion.
Return value: An iterator pointing to the place after the appended UTF-8 string.

Example of use:

 int utf32string[] = { 0x448 , 0x65e5 , 0x10346 , 0 };
vector< unsigned char > utf8result;
unchecked::utf32to8 (utf32string, utf32string + 3 , back_inserter(utf8result));
assert (utf8result.size() == 9);

This is a faster but less safe version of utf8::utf32to8 . It does not check for validity of the supplied UTF-32 sequence.

utf8::unchecked::utf8to32

Available in version 1.0 and later.

Converts a UTF-8 encoded string to UTF-32.

 template < typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result);

start : an iterator pointing to the beginning of the UTF-8 encoded string to convert.
end : an iterator pointing to pass-the-end of the UTF-8 encoded string to convert.
result : an output iterator to the place in the UTF-32 string where to append the result of conversion.
Return value: An iterator pointing to the place after the appended UTF-32 string.

Example of use:

 char * twochars = "\xe6\x97\xa5\xd1\x88";
vector< int > utf32result;
unchecked::utf8to32 (twochars, twochars + 5 , back_inserter(utf32result));
assert (utf32result.size() == 2);

This is a faster but less safe version of utf8::utf8to32 . It does not check for validity of the supplied UTF-8 sequence.

utf8::unchecked::replace_invalid

Available in version 3.1 and later.

Replaces all invalid UTF-8 sequences within a string with a replacement marker.

 template < typename octet_iterator, typename output_iterator>
output_iterator replace_invalid (octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement);
template < typename octet_iterator, typename output_iterator>
output_iterator replace_invalid (octet_iterator start, octet_iterator end, output_iterator out);

octet_iterator : an input iterator.
output_iterator : an output iterator.
start : an iterator pointing to the beginning of the UTF-8 string to look for invalid UTF-8 sequences.
end : an iterator pointing to pass-the-end of the UTF-8 string to look for invalid UTF-8 sequences.
out : An output iterator to the range where the result of replacement is stored.
replacement : A Unicode code point for the replacement marker. The version without this parameter assumes the value 0xfffd
Return value: An iterator pointing to the place after the UTF-8 string with replaced invalid sequences.

Example of use:

 char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
vector< char > replace_invalid_result;
unchecked::replace_invalid (invalid_sequence, invalid_sequence + sizeof (invalid_sequence), back_inserter(replace_invalid_result), '?');
bvalid = utf8::is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
assert (bvalid);
char * fixed_invalid_sequence = "a????z";
assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence));

Unlike utf8::replace_invalid , this function does not verify validity of the replacement marker.

Types From utf8::unchecked Namespace

utf8::unchecked::iterator

Available in version 2.0 and later.

Adapts the underlying octet iterator to iterate over the sequence of code points, rather than raw octets.

 template < typename octet_iterator>
class iterator ;

Member functions

iterator(); the default constructor; the underlying octet_iterator is constructed with its default constructor.

explicit iterator (const octet_iterator& octet_it); a constructor that initializes the underlying octet_iterator with octet_it .