update utf8 from 2.3.4 to 3.2.3

New convenience API for C++ 11 and later compilers. The library still works with C++ 98/03 compliant compilers, just without the new functions.
advance() function works in both directions.
The following deprecated functions were removed:

previous() - deprecated since version 1.02.
is_bom() - deprecated since version 2.3.
Fix of the project version number at CMakeLists.txt
Continuous Integration with Google Tests and CircleCI
A minor release that contains a fix for Issue #31: program fails to link when including utf8.h in multiple files.
This release adds one new API call: unchecked::replace_invalid().
Optional support for C++ 17 std::string_view.
The release contains the fix for the inclusion of both cpp11 and cpp17 headers on C++17 compilation. It also adds some tests for using string literals and string objects with modern compilers.
pull/5148/head
Andrea Reale 2023-06-20 11:23:28 +02:00
parent b94392d199
commit ce59d49dd9
9 changed files with 1694 additions and 1848 deletions

View File

@ -0,0 +1,23 @@
Boost Software License - Version 1.0 - August 17th, 2003
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

File diff suppressed because it is too large Load Diff

View File

@ -1,12 +0,0 @@
utf8 cpp library
Release 2.3.4
A minor bug fix release. Thanks to all who reported bugs.
Note: Version 2.3.3 contained a regression, and therefore was removed.
Changes from version 2.3.2
- Bug fix [39]: checked.h Line 273 and unchecked.h Line 182 have an extra ';'
- Bug fix [36]: replace_invalid() only works with back_inserter
Files included in the release: utf8.h, core.h, checked.h, unchecked.h, utf8cpp.html, ReleaseNotes

File diff suppressed because it is too large Load Diff

View File

@ -42,7 +42,7 @@ namespace utf8
uint32_t cp; uint32_t cp;
public: public:
invalid_code_point(uint32_t codepoint) : cp(codepoint) {} invalid_code_point(uint32_t codepoint) : cp(codepoint) {}
virtual const char* what() const NOEXCEPT OVERRIDE { return "Invalid code point"; } virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid code point"; }
uint32_t code_point() const {return cp;} uint32_t code_point() const {return cp;}
}; };
@ -50,7 +50,8 @@ namespace utf8
uint8_t u8; uint8_t u8;
public: public:
invalid_utf8 (uint8_t u) : u8(u) {} invalid_utf8 (uint8_t u) : u8(u) {}
virtual const char* what() const NOEXCEPT OVERRIDE { return "Invalid UTF-8"; } invalid_utf8 (char c) : u8(static_cast<uint8_t>(c)) {}
virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-8"; }
uint8_t utf8_octet() const {return u8;} uint8_t utf8_octet() const {return u8;}
}; };
@ -58,13 +59,13 @@ namespace utf8
uint16_t u16; uint16_t u16;
public: public:
invalid_utf16 (uint16_t u) : u16(u) {} invalid_utf16 (uint16_t u) : u16(u) {}
virtual const char* what() const NOEXCEPT OVERRIDE { return "Invalid UTF-16"; } virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-16"; }
uint16_t utf16_word() const {return u16;} uint16_t utf16_word() const {return u16;}
}; };
class not_enough_room : public exception { class not_enough_room : public exception {
public: public:
virtual const char* what() const NOEXCEPT OVERRIDE { return "Not enough space"; } virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Not enough space"; }
}; };
/// The library API - functions intended to be called by the users /// The library API - functions intended to be called by the users
@ -75,24 +76,7 @@ namespace utf8
if (!utf8::internal::is_code_point_valid(cp)) if (!utf8::internal::is_code_point_valid(cp))
throw invalid_code_point(cp); throw invalid_code_point(cp);
if (cp < 0x80) // one octet return internal::append(cp, result);
*(result++) = static_cast<uint8_t>(cp);
else if (cp < 0x800) { // two octets
*(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
}
else if (cp < 0x10000) { // three octets
*(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
}
else { // four octets
*(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
*(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
}
return result;
} }
template <typename octet_iterator, typename output_iterator> template <typename octet_iterator, typename output_iterator>
@ -148,7 +132,7 @@ namespace utf8
case internal::INVALID_LEAD : case internal::INVALID_LEAD :
case internal::INCOMPLETE_SEQUENCE : case internal::INCOMPLETE_SEQUENCE :
case internal::OVERLONG_SEQUENCE : case internal::OVERLONG_SEQUENCE :
throw invalid_utf8(*it); throw invalid_utf8(static_cast<uint8_t>(*it));
case internal::INVALID_CODE_POINT : case internal::INVALID_CODE_POINT :
throw invalid_code_point(cp); throw invalid_code_point(cp);
} }
@ -325,7 +309,9 @@ namespace utf8
} // namespace utf8 } // namespace utf8
#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later #if UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later
#include "cpp17.h"
#elif UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
#include "cpp11.h" #include "cpp11.h"
#endif // C++ 11 or later #endif // C++ 11 or later

View File

@ -39,11 +39,11 @@ DEALINGS IN THE SOFTWARE.
#endif #endif
#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later #if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
#define OVERRIDE override #define UTF_CPP_OVERRIDE override
#define NOEXCEPT noexcept #define UTF_CPP_NOEXCEPT noexcept
#else // C++ 98/03 #else // C++ 98/03
#define OVERRIDE #define UTF_CPP_OVERRIDE
#define NOEXCEPT throw() #define UTF_CPP_NOEXCEPT throw()
#endif // C++ 11 or later #endif // C++ 11 or later
@ -297,6 +297,55 @@ namespace internal
return utf8::internal::validate_next(it, end, ignored); return utf8::internal::validate_next(it, end, ignored);
} }
// Shared encoder used by both the checked and unchecked append() functions.
// Writes the UTF-8 octets for code point `cp` through `result`, converting
// every octet to `octet_type`, and returns the advanced iterator.
// `octet_type` cannot be deduced, so the overloads that know it name it
// explicitly. No validation of `cp` is performed here.
template <typename octet_iterator, typename octet_type>
octet_iterator append(uint32_t cp, octet_iterator result) {
    // One octet: the code point is stored as-is.
    if (cp < 0x80) {
        *(result++) = static_cast<octet_type>(cp);
        return result;
    }

    // Multi-octet sequence: the magnitude of the code point selects the
    // lead-octet marker and the number of continuation octets.
    uint32_t lead_marker;
    unsigned trailing;
    if (cp < 0x800) {           // two octets
        lead_marker = 0xc0;
        trailing = 1;
    }
    else if (cp < 0x10000) {    // three octets
        lead_marker = 0xe0;
        trailing = 2;
    }
    else {                      // four octets
        lead_marker = 0xf0;
        trailing = 3;
    }

    // Lead octet carries the most significant payload bits.
    *(result++) = static_cast<octet_type>((cp >> (6 * trailing)) | lead_marker);

    // Each continuation octet carries the next six bits, high to low.
    while (trailing > 0) {
        --trailing;
        *(result++) = static_cast<octet_type>(((cp >> (6 * trailing)) & 0x3f) | 0x80);
    }
    return result;
}
// One of the following overloads will be invoked from the API calls
// A simple (but dangerous) case: the caller appends byte(s) to a char array
inline char* append(uint32_t cp, char* result) {
return append<char*, char>(cp, result);
}
// Expected to be the most common case: the caller passes a back_inserter,
// e.g. append(cp, std::back_inserter(str));
// The octet type is taken from the container's value_type.
template<typename container_type>
std::back_insert_iterator<container_type> append
        (uint32_t cp, std::back_insert_iterator<container_type> result) {
    typedef std::back_insert_iterator<container_type> inserter_type;
    typedef typename container_type::value_type octet_type;
    return append<inserter_type, octet_type>(cp, result);
}
// Fallback for any other kind of output iterator not covered above.
// The octet type cannot be determined here, so uint8_t is assumed; if that
// assumption is wrong the caller may see a conversion warning.
template <typename octet_iterator>
octet_iterator append(uint32_t cp, octet_iterator result) {
    typedef uint8_t assumed_octet_type;
    return append<octet_iterator, assumed_octet_type>(cp, result);
}
} // namespace internal } // namespace internal
/// The library API - functions intended to be called by the users /// The library API - functions intended to be called by the users

View File

@ -70,7 +70,7 @@ namespace utf8
inline std::size_t find_invalid(const std::string& s) inline std::size_t find_invalid(const std::string& s)
{ {
std::string::const_iterator invalid = find_invalid(s.begin(), s.end()); std::string::const_iterator invalid = find_invalid(s.begin(), s.end());
return (invalid == s.end()) ? std::string::npos : (invalid - s.begin()); return (invalid == s.end()) ? std::string::npos : static_cast<std::size_t>(invalid - s.begin());
} }
inline bool is_valid(const std::string& s) inline bool is_valid(const std::string& s)

View File

@ -0,0 +1,103 @@
// Copyright 2018 Nemanja Trifunovic
/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
#ifndef UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9
#define UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9
#include "checked.h"
#include <string>
namespace utf8
{
    // Convenience overloads for C++17: each function below adapts a
    // std::string / std::basic_string_view argument to the iterator-based
    // function of the same name and collects the output in a string.

    /// Appends the UTF-8 encoding of code point `cp` to the end of `s`
    /// by forwarding to the iterator-based append() with a back inserter.
    inline void append(char32_t cp, std::string& s)
    {
        const uint32_t code_point = static_cast<uint32_t>(cp);
        append(code_point, std::back_inserter(s));
    }

    /// Converts the UTF-16 text in `s` to a newly built UTF-8 std::string.
    inline std::string utf16to8(std::u16string_view s)
    {
        std::string utf8_text;
        utf16to8(s.begin(), s.end(), std::back_inserter(utf8_text));
        return utf8_text;
    }

    /// Converts the UTF-8 text in `s` to a newly built std::u16string.
    inline std::u16string utf8to16(std::string_view s)
    {
        std::u16string utf16_text;
        utf8to16(s.begin(), s.end(), std::back_inserter(utf16_text));
        return utf16_text;
    }

    /// Converts the UTF-32 text in `s` to a newly built UTF-8 std::string.
    inline std::string utf32to8(std::u32string_view s)
    {
        std::string utf8_text;
        utf32to8(s.begin(), s.end(), std::back_inserter(utf8_text));
        return utf8_text;
    }

    /// Converts the UTF-8 text in `s` to a newly built std::u32string.
    inline std::u32string utf8to32(std::string_view s)
    {
        std::u32string utf32_text;
        utf8to32(s.begin(), s.end(), std::back_inserter(utf32_text));
        return utf32_text;
    }

    /// Returns the offset from the start of `s` of the position reported by
    /// the iterator-based find_invalid(), or std::string_view::npos when
    /// that call returns the end of `s`.
    inline std::size_t find_invalid(std::string_view s)
    {
        const std::string_view::const_iterator first_bad = find_invalid(s.begin(), s.end());
        if (first_bad == s.end())
            return std::string_view::npos;
        return static_cast<std::size_t>(first_bad - s.begin());
    }

    /// Returns the result of the iterator-based is_valid() over all of `s`.
    inline bool is_valid(std::string_view s)
    {
        const bool valid = is_valid(s.begin(), s.end());
        return valid;
    }

    /// Runs the iterator-based replace_invalid() over `s` with the given
    /// `replacement` code point and returns the output as a new string.
    inline std::string replace_invalid(std::string_view s, char32_t replacement)
    {
        std::string repaired;
        replace_invalid(s.begin(), s.end(), std::back_inserter(repaired), replacement);
        return repaired;
    }

    /// Same as above, but lets the iterator-based replace_invalid() choose
    /// its own replacement marker.
    inline std::string replace_invalid(std::string_view s)
    {
        std::string repaired;
        replace_invalid(s.begin(), s.end(), std::back_inserter(repaired));
        return repaired;
    }

    /// Returns the result of the iterator-based starts_with_bom() over `s`.
    inline bool starts_with_bom(std::string_view s)
    {
        const bool has_bom = starts_with_bom(s.begin(), s.end());
        return has_bom;
    }
} // namespace utf8
#endif // header guard

View File

@ -37,24 +37,7 @@ namespace utf8
template <typename octet_iterator> template <typename octet_iterator>
octet_iterator append(uint32_t cp, octet_iterator result) octet_iterator append(uint32_t cp, octet_iterator result)
{ {
if (cp < 0x80) // one octet return internal::append(cp, result);
*(result++) = static_cast<uint8_t>(cp);
else if (cp < 0x800) { // two octets
*(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
}
else if (cp < 0x10000) { // three octets
*(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
}
else { // four octets
*(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
*(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)| 0x80);
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
}
return result;
} }
template <typename octet_iterator, typename output_iterator> template <typename octet_iterator, typename output_iterator>