update utf8 from 2.3.4 to 3.2.3
New convenience API for C++ 11 and later compilers. The library still works with C++ 98/03 compliant compilers, just without the new functions. The advance() function works in both directions. The following deprecated functions were removed: previous() - deprecated since version 1.02; is_bom() - deprecated since version 2.3. Fix of the project version number in CMakeLists.txt. Continuous Integration with Google Tests and CircleCI. A minor release that contains a fix for Issue #31: program fails to link when including utf8.h in multiple files. This release adds one new API call: unchecked::replace_invalid(). Optional support for C++ 17 std::string_view. The release contains the fix for the inclusion of both cpp11 and cpp17 headers on C++17 compilation. Also some additional tests for using string literals and string objects with modern compilers. pull/5148/head
parent
b94392d199
commit
ce59d49dd9
|
@ -0,0 +1,23 @@
|
||||||
|
Boost Software License - Version 1.0 - August 17th, 2003
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person or organization
|
||||||
|
obtaining a copy of the software and accompanying documentation covered by
|
||||||
|
this license (the "Software") to use, reproduce, display, distribute,
|
||||||
|
execute, and transmit the Software, and to prepare derivative works of the
|
||||||
|
Software, and to permit third-parties to whom the Software is furnished to
|
||||||
|
do so, all subject to the following:
|
||||||
|
|
||||||
|
The copyright notices in the Software and this entire statement, including
|
||||||
|
the above license grant, this restriction and the following disclaimer,
|
||||||
|
must be included in all copies of the Software, in whole or in part, and
|
||||||
|
all derivative works of the Software, unless such copies or derivative
|
||||||
|
works are solely in the form of machine-executable object code generated by
|
||||||
|
a source language processor.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
||||||
|
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
||||||
|
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
||||||
|
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
DEALINGS IN THE SOFTWARE.
|
File diff suppressed because it is too large
Load Diff
|
@ -1,12 +0,0 @@
|
||||||
utf8 cpp library
|
|
||||||
Release 2.3.4
|
|
||||||
|
|
||||||
A minor bug fix release. Thanks to all who reported bugs.
|
|
||||||
|
|
||||||
Note: Version 2.3.3 contained a regression, and therefore was removed.
|
|
||||||
|
|
||||||
Changes from version 2.3.2
|
|
||||||
- Bug fix [39]: checked.h Line 273 and unchecked.h Line 182 have an extra ';'
|
|
||||||
- Bug fix [36]: replace_invalid() only works with back_inserter
|
|
||||||
|
|
||||||
Files included in the release: utf8.h, core.h, checked.h, unchecked.h, utf8cpp.html, ReleaseNotes
|
|
File diff suppressed because it is too large
Load Diff
|
@ -42,7 +42,7 @@ namespace utf8
|
||||||
uint32_t cp;
|
uint32_t cp;
|
||||||
public:
|
public:
|
||||||
invalid_code_point(uint32_t codepoint) : cp(codepoint) {}
|
invalid_code_point(uint32_t codepoint) : cp(codepoint) {}
|
||||||
virtual const char* what() const NOEXCEPT OVERRIDE { return "Invalid code point"; }
|
virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid code point"; }
|
||||||
uint32_t code_point() const {return cp;}
|
uint32_t code_point() const {return cp;}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -50,7 +50,8 @@ namespace utf8
|
||||||
uint8_t u8;
|
uint8_t u8;
|
||||||
public:
|
public:
|
||||||
invalid_utf8 (uint8_t u) : u8(u) {}
|
invalid_utf8 (uint8_t u) : u8(u) {}
|
||||||
virtual const char* what() const NOEXCEPT OVERRIDE { return "Invalid UTF-8"; }
|
invalid_utf8 (char c) : u8(static_cast<uint8_t>(c)) {}
|
||||||
|
virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-8"; }
|
||||||
uint8_t utf8_octet() const {return u8;}
|
uint8_t utf8_octet() const {return u8;}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -58,13 +59,13 @@ namespace utf8
|
||||||
uint16_t u16;
|
uint16_t u16;
|
||||||
public:
|
public:
|
||||||
invalid_utf16 (uint16_t u) : u16(u) {}
|
invalid_utf16 (uint16_t u) : u16(u) {}
|
||||||
virtual const char* what() const NOEXCEPT OVERRIDE { return "Invalid UTF-16"; }
|
virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-16"; }
|
||||||
uint16_t utf16_word() const {return u16;}
|
uint16_t utf16_word() const {return u16;}
|
||||||
};
|
};
|
||||||
|
|
||||||
class not_enough_room : public exception {
|
class not_enough_room : public exception {
|
||||||
public:
|
public:
|
||||||
virtual const char* what() const NOEXCEPT OVERRIDE { return "Not enough space"; }
|
virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Not enough space"; }
|
||||||
};
|
};
|
||||||
|
|
||||||
/// The library API - functions intended to be called by the users
|
/// The library API - functions intended to be called by the users
|
||||||
|
@ -75,24 +76,7 @@ namespace utf8
|
||||||
if (!utf8::internal::is_code_point_valid(cp))
|
if (!utf8::internal::is_code_point_valid(cp))
|
||||||
throw invalid_code_point(cp);
|
throw invalid_code_point(cp);
|
||||||
|
|
||||||
if (cp < 0x80) // one octet
|
return internal::append(cp, result);
|
||||||
*(result++) = static_cast<uint8_t>(cp);
|
|
||||||
else if (cp < 0x800) { // two octets
|
|
||||||
*(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
|
|
||||||
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
|
||||||
}
|
|
||||||
else if (cp < 0x10000) { // three octets
|
|
||||||
*(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
|
|
||||||
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
|
|
||||||
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
|
||||||
}
|
|
||||||
else { // four octets
|
|
||||||
*(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
|
|
||||||
*(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
|
|
||||||
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
|
|
||||||
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename octet_iterator, typename output_iterator>
|
template <typename octet_iterator, typename output_iterator>
|
||||||
|
@ -148,7 +132,7 @@ namespace utf8
|
||||||
case internal::INVALID_LEAD :
|
case internal::INVALID_LEAD :
|
||||||
case internal::INCOMPLETE_SEQUENCE :
|
case internal::INCOMPLETE_SEQUENCE :
|
||||||
case internal::OVERLONG_SEQUENCE :
|
case internal::OVERLONG_SEQUENCE :
|
||||||
throw invalid_utf8(*it);
|
throw invalid_utf8(static_cast<uint8_t>(*it));
|
||||||
case internal::INVALID_CODE_POINT :
|
case internal::INVALID_CODE_POINT :
|
||||||
throw invalid_code_point(cp);
|
throw invalid_code_point(cp);
|
||||||
}
|
}
|
||||||
|
@ -325,7 +309,9 @@ namespace utf8
|
||||||
|
|
||||||
} // namespace utf8
|
} // namespace utf8
|
||||||
|
|
||||||
#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
|
#if UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later
|
||||||
|
#include "cpp17.h"
|
||||||
|
#elif UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
|
||||||
#include "cpp11.h"
|
#include "cpp11.h"
|
||||||
#endif // C++ 11 or later
|
#endif // C++ 11 or later
|
||||||
|
|
||||||
|
|
|
@ -39,11 +39,11 @@ DEALINGS IN THE SOFTWARE.
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
|
#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later
|
||||||
#define OVERRIDE override
|
#define UTF_CPP_OVERRIDE override
|
||||||
#define NOEXCEPT noexcept
|
#define UTF_CPP_NOEXCEPT noexcept
|
||||||
#else // C++ 98/03
|
#else // C++ 98/03
|
||||||
#define OVERRIDE
|
#define UTF_CPP_OVERRIDE
|
||||||
#define NOEXCEPT throw()
|
#define UTF_CPP_NOEXCEPT throw()
|
||||||
#endif // C++ 11 or later
|
#endif // C++ 11 or later
|
||||||
|
|
||||||
|
|
||||||
|
@ -297,6 +297,55 @@ namespace internal
|
||||||
return utf8::internal::validate_next(it, end, ignored);
|
return utf8::internal::validate_next(it, end, ignored);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Internal implementation of both checked and unchecked append() function
|
||||||
|
// This function will be invoked by the overloads below, as they will know
|
||||||
|
// the octet_type.
|
||||||
|
template <typename octet_iterator, typename octet_type>
|
||||||
|
octet_iterator append(uint32_t cp, octet_iterator result) {
|
||||||
|
if (cp < 0x80) // one octet
|
||||||
|
*(result++) = static_cast<octet_type>(cp);
|
||||||
|
else if (cp < 0x800) { // two octets
|
||||||
|
*(result++) = static_cast<octet_type>((cp >> 6) | 0xc0);
|
||||||
|
*(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80);
|
||||||
|
}
|
||||||
|
else if (cp < 0x10000) { // three octets
|
||||||
|
*(result++) = static_cast<octet_type>((cp >> 12) | 0xe0);
|
||||||
|
*(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80);
|
||||||
|
*(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80);
|
||||||
|
}
|
||||||
|
else { // four octets
|
||||||
|
*(result++) = static_cast<octet_type>((cp >> 18) | 0xf0);
|
||||||
|
*(result++) = static_cast<octet_type>(((cp >> 12) & 0x3f)| 0x80);
|
||||||
|
*(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80);
|
||||||
|
*(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// One of the following overloads will be invoked from the API calls
|
||||||
|
|
||||||
|
// A simple (but dangerous) case: the caller appends byte(s) to a char array
|
||||||
|
inline char* append(uint32_t cp, char* result) {
|
||||||
|
return append<char*, char>(cp, result);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Hopefully, most common case: the caller uses back_inserter
|
||||||
|
// i.e. append(cp, std::back_inserter(str));
|
||||||
|
template<typename container_type>
|
||||||
|
std::back_insert_iterator<container_type> append
|
||||||
|
(uint32_t cp, std::back_insert_iterator<container_type> result) {
|
||||||
|
return append<std::back_insert_iterator<container_type>,
|
||||||
|
typename container_type::value_type>(cp, result);
|
||||||
|
}
|
||||||
|
|
||||||
|
// The caller uses some other kind of output iterator - not covered above
|
||||||
|
// Note that in this case we are not able to determine octet_type
|
||||||
|
// so we assume it's uint8_t; that can cause a conversion warning if we are wrong.
|
||||||
|
template <typename octet_iterator>
|
||||||
|
octet_iterator append(uint32_t cp, octet_iterator result) {
|
||||||
|
return append<octet_iterator, uint8_t>(cp, result);
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace internal
|
} // namespace internal
|
||||||
|
|
||||||
/// The library API - functions intended to be called by the users
|
/// The library API - functions intended to be called by the users
|
||||||
|
|
|
@ -70,7 +70,7 @@ namespace utf8
|
||||||
inline std::size_t find_invalid(const std::string& s)
|
inline std::size_t find_invalid(const std::string& s)
|
||||||
{
|
{
|
||||||
std::string::const_iterator invalid = find_invalid(s.begin(), s.end());
|
std::string::const_iterator invalid = find_invalid(s.begin(), s.end());
|
||||||
return (invalid == s.end()) ? std::string::npos : (invalid - s.begin());
|
return (invalid == s.end()) ? std::string::npos : static_cast<std::size_t>(invalid - s.begin());
|
||||||
}
|
}
|
||||||
|
|
||||||
inline bool is_valid(const std::string& s)
|
inline bool is_valid(const std::string& s)
|
||||||
|
|
|
@ -0,0 +1,103 @@
|
||||||
|
// Copyright 2018 Nemanja Trifunovic
|
||||||
|
|
||||||
|
/*
|
||||||
|
Permission is hereby granted, free of charge, to any person or organization
|
||||||
|
obtaining a copy of the software and accompanying documentation covered by
|
||||||
|
this license (the "Software") to use, reproduce, display, distribute,
|
||||||
|
execute, and transmit the Software, and to prepare derivative works of the
|
||||||
|
Software, and to permit third-parties to whom the Software is furnished to
|
||||||
|
do so, all subject to the following:
|
||||||
|
|
||||||
|
The copyright notices in the Software and this entire statement, including
|
||||||
|
the above license grant, this restriction and the following disclaimer,
|
||||||
|
must be included in all copies of the Software, in whole or in part, and
|
||||||
|
all derivative works of the Software, unless such copies or derivative
|
||||||
|
works are solely in the form of machine-executable object code generated by
|
||||||
|
a source language processor.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
||||||
|
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
||||||
|
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
||||||
|
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||||
|
DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9
|
||||||
|
#define UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9
|
||||||
|
|
||||||
|
#include "checked.h"
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
namespace utf8
|
||||||
|
{
|
||||||
|
|
||||||
|
inline void append(char32_t cp, std::string& s)
|
||||||
|
{
|
||||||
|
append(uint32_t(cp), std::back_inserter(s));
|
||||||
|
}
|
||||||
|
|
||||||
|
inline std::string utf16to8(std::u16string_view s)
|
||||||
|
{
|
||||||
|
std::string result;
|
||||||
|
utf16to8(s.begin(), s.end(), std::back_inserter(result));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline std::u16string utf8to16(std::string_view s)
|
||||||
|
{
|
||||||
|
std::u16string result;
|
||||||
|
utf8to16(s.begin(), s.end(), std::back_inserter(result));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline std::string utf32to8(std::u32string_view s)
|
||||||
|
{
|
||||||
|
std::string result;
|
||||||
|
utf32to8(s.begin(), s.end(), std::back_inserter(result));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline std::u32string utf8to32(std::string_view s)
|
||||||
|
{
|
||||||
|
std::u32string result;
|
||||||
|
utf8to32(s.begin(), s.end(), std::back_inserter(result));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline std::size_t find_invalid(std::string_view s)
|
||||||
|
{
|
||||||
|
std::string_view::const_iterator invalid = find_invalid(s.begin(), s.end());
|
||||||
|
return (invalid == s.end()) ? std::string_view::npos : static_cast<std::size_t>(invalid - s.begin());
|
||||||
|
}
|
||||||
|
|
||||||
|
inline bool is_valid(std::string_view s)
|
||||||
|
{
|
||||||
|
return is_valid(s.begin(), s.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
inline std::string replace_invalid(std::string_view s, char32_t replacement)
|
||||||
|
{
|
||||||
|
std::string result;
|
||||||
|
replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline std::string replace_invalid(std::string_view s)
|
||||||
|
{
|
||||||
|
std::string result;
|
||||||
|
replace_invalid(s.begin(), s.end(), std::back_inserter(result));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline bool starts_with_bom(std::string_view s)
|
||||||
|
{
|
||||||
|
return starts_with_bom(s.begin(), s.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace utf8
|
||||||
|
|
||||||
|
#endif // header guard
|
||||||
|
|
|
@ -37,24 +37,7 @@ namespace utf8
|
||||||
template <typename octet_iterator>
|
template <typename octet_iterator>
|
||||||
octet_iterator append(uint32_t cp, octet_iterator result)
|
octet_iterator append(uint32_t cp, octet_iterator result)
|
||||||
{
|
{
|
||||||
if (cp < 0x80) // one octet
|
return internal::append(cp, result);
|
||||||
*(result++) = static_cast<uint8_t>(cp);
|
|
||||||
else if (cp < 0x800) { // two octets
|
|
||||||
*(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
|
|
||||||
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
|
||||||
}
|
|
||||||
else if (cp < 0x10000) { // three octets
|
|
||||||
*(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
|
|
||||||
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
|
|
||||||
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
|
||||||
}
|
|
||||||
else { // four octets
|
|
||||||
*(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
|
|
||||||
*(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)| 0x80);
|
|
||||||
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
|
|
||||||
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename octet_iterator, typename output_iterator>
|
template <typename octet_iterator, typename output_iterator>
|
||||||
|
|
Loading…
Reference in New Issue