Merge pull request #1310 from assimp/new_cpputf8_issue_1309

New cpputf8 issue 1309
pull/1311/head
Kim Kulling 2017-06-17 21:17:05 +02:00 committed by GitHub
commit c62429998e
19 changed files with 2980 additions and 835 deletions

View File

@ -303,24 +303,13 @@ void BaseImporter::GetExtensionList(std::set<std::string>& extensions)
return false;
}
#include "../contrib/ConvertUTF/ConvertUTF.h"
// ------------------------------------------------------------------------------------------------
void ReportResult(ConversionResult res)
{
if(res == sourceExhausted) {
DefaultLogger::get()->error("Source ends with incomplete character sequence, transformation to UTF-8 fails");
}
else if(res == sourceIllegal) {
DefaultLogger::get()->error("Source contains illegal character sequence, transformation to UTF-8 fails");
}
}
#include "../contrib/utf8cpp/source/utf8.h"
// ------------------------------------------------------------------------------------------------
// Convert to UTF8 data
void BaseImporter::ConvertToUTF8(std::vector<char>& data)
{
ConversionResult result;
//ConversionResult result;
if(data.size() < 8) {
throw DeadlyImportError("File is too small");
}
@ -333,7 +322,8 @@ void BaseImporter::ConvertToUTF8(std::vector<char>& data)
data.resize(data.size()-3);
return;
}
// UTF 32 BE with BOM
if(*((uint32_t*)&data.front()) == 0xFFFE0000) {
@ -348,20 +338,10 @@ void BaseImporter::ConvertToUTF8(std::vector<char>& data)
DefaultLogger::get()->debug("Found UTF-32 BOM ...");
const uint32_t* sstart = (uint32_t*)&data.front()+1, *send = (uint32_t*)&data.back()+1;
char* dstart,*dend;
std::vector<char> output;
do {
output.resize(output.size()?output.size()*3/2:data.size()/2);
dstart = &output.front(),dend = &output.back()+1;
result = ConvertUTF32toUTF8((const UTF32**)&sstart,(const UTF32*)send,(UTF8**)&dstart,(UTF8*)dend,lenientConversion);
} while(result == targetExhausted);
ReportResult(result);
// copy to output buffer.
const size_t outlen = (size_t)(dstart-&output.front());
data.assign(output.begin(),output.begin()+outlen);
int *ptr = (int*)&data[ 0 ];
int *end = ptr + ( data.size() / sizeof(int) ) +1;
utf8::utf32to8( ptr, end, back_inserter(output));
return;
}
@ -379,20 +359,11 @@ void BaseImporter::ConvertToUTF8(std::vector<char>& data)
DefaultLogger::get()->debug("Found UTF-16 BOM ...");
const uint16_t* sstart = (uint16_t*)&data.front()+1, *send = (uint16_t*)(&data.back()+1);
char* dstart,*dend;
std::vector<char> output;
do {
output.resize(output.size()?output.size()*3/2:data.size()*3/4);
dstart = &output.front(),dend = &output.back()+1;
std::vector<unsigned char> output;
int16_t *ptr = (int16_t*) &data[ 0 ];
int16_t *end = ptr + (data.size() / sizeof(int)) + 1;
result = ConvertUTF16toUTF8((const UTF16**)&sstart,(const UTF16*)send,(UTF8**)&dstart,(UTF8*)dend,lenientConversion);
} while(result == targetExhausted);
ReportResult(result);
// copy to output buffer.
const size_t outlen = (size_t)(dstart-&output.front());
data.assign(output.begin(),output.begin()+outlen);
utf8::utf16to8(data.begin(), data.end(), back_inserter(output));
return;
}
}

View File

@ -61,7 +61,6 @@ class BaseProcess;
class SharedPostProcessInfo;
class IOStream;
// utility to do char4 to uint32 in a portable manner
#define AI_MAKE_MAGIC(string) ((uint32_t)((string[0] << 24) + \
(string[1] << 16) + (string[2] << 8) + string[3]))
@ -194,14 +193,11 @@ public:
const Importer* pImp
);
// -------------------------------------------------------------------
/** Called by #Importer::GetImporterInfo to get a description of
* some loader features. Importers must provide this information. */
virtual const aiImporterDesc* GetInfo() const = 0;
// -------------------------------------------------------------------
/** Called by #Importer::GetExtensionList for each loaded importer.
* Take the extension list contained in the structure returned by
@ -317,7 +313,7 @@ public: // static utilities
* @param Size of one token, in bytes. Maximally 16 bytes.
* @return true if one of the given tokens was found
*
* @note For convinence, the check is also performed for the
* @note For convenience, the check is also performed for the
* byte-swapped variant of all tokens (big endian). Only for
* tokens of size 2,4.
*/

View File

@ -695,11 +695,6 @@ SET( Extra_SRCS
)
SOURCE_GROUP( Extra FILES ${Extra_SRCS})
SET( ConvertUTF_SRCS
../contrib/ConvertUTF/ConvertUTF.h
../contrib/ConvertUTF/ConvertUTF.c
)
SOURCE_GROUP( ConvertUTF FILES ${ConvertUTF_SRCS})
SET( Clipper_SRCS
../contrib/clipper/clipper.hpp
@ -835,7 +830,6 @@ SET( assimp_src
# Third-party libraries
${IrrXML_SRCS}
${ConvertUTF_SRCS}
${unzip_compile_SRCS}
${Poly2Tri_SRCS}
${Clipper_SRCS}

View File

@ -1,3 +1,43 @@
/*
Open Asset Import Library (assimp)
----------------------------------------------------------------------
Copyright (c) 2006-2017, assimp team
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the
following conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* Neither the name of the assimp team, nor the names of its
contributors may be used to endorse or promote products
derived from this software without specific prior
written permission of the assimp team.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
----------------------------------------------------------------------
*/
#pragma once
#ifndef MMD_CPP14_H

View File

@ -1,3 +1,43 @@
/*
Open Asset Import Library (assimp)
----------------------------------------------------------------------
Copyright (c) 2006-2017, assimp team
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the
following conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* Neither the name of the assimp team, nor the names of its
contributors may be used to endorse or promote products
derived from this software without specific prior
written permission of the assimp team.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
----------------------------------------------------------------------
*/
#pragma once
#include <vector>

View File

@ -1,7 +1,47 @@
/*
Open Asset Import Library (assimp)
----------------------------------------------------------------------
Copyright (c) 2006-2017, assimp team
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the
following conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* Neither the name of the assimp team, nor the names of its
contributors may be used to endorse or promote products
derived from this software without specific prior
written permission of the assimp team.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
----------------------------------------------------------------------
*/
#include <utility>
#include "MMDPmxParser.h"
#include "../contrib/utf8cpp/source/utf8.h"
#include "Exceptional.h"
#include "../contrib/ConvertUTF/ConvertUTF.h"
namespace pmx
{
@ -60,15 +100,7 @@ namespace pmx
const unsigned int targetSize = size * 3; // enough to encode
char* targetStart = new char[targetSize]();
const char* targetReserved = targetStart;
ConversionFlags flags = ConversionFlags::lenientConversion;
ConversionResult conversionResult;
if( ( conversionResult = ConvertUTF16toUTF8(
(const UTF16**)&sourceStart, (const UTF16*)(sourceStart + size),
(UTF8**)&targetStart, (UTF8*)(targetStart + targetSize),
flags) ) != ConversionResult::conversionOK) {
throw DeadlyImportError( "Convert " + std::string(sourceStart) + " to UTF8 is not valid." );
}
utf8::utf16to8( sourceStart, sourceStart + size, targetStart );
result.assign(targetReserved, targetStart - targetReserved);
delete[] targetReserved;

View File

@ -1,3 +1,43 @@
/*
Open Asset Import Library (assimp)
----------------------------------------------------------------------
Copyright (c) 2006-2017, assimp team
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the
following conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* Neither the name of the assimp team, nor the names of its
contributors may be used to endorse or promote products
derived from this software without specific prior
written permission of the assimp team.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
----------------------------------------------------------------------
*/
#pragma once
#include <vector>

View File

@ -1,3 +1,43 @@
/*
Open Asset Import Library (assimp)
----------------------------------------------------------------------
Copyright (c) 2006-2017, assimp team
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the
following conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* Neither the name of the assimp team, nor the names of its
contributors may be used to endorse or promote products
derived from this software without specific prior
written permission of the assimp team.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
----------------------------------------------------------------------
*/
#pragma once
#include <vector>

View File

@ -58,7 +58,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "ByteSwapper.h"
#include "StreamReader.h"
#include "TinyFormatter.h"
#include "../contrib/ConvertUTF/ConvertUTF.h"
//#include "../contrib/ConvertUTF/ConvertUTF.h"
#include "../contrib/utf8cpp/source/utf8.h"
#include <assimp/IOSystem.hpp>
#include <assimp/DefaultLogger.hpp>
#include <assimp/scene.h>
@ -177,21 +178,33 @@ static void UnknownChunk(StreamReaderLE* stream, const SIBChunk& chunk)
// Reads a UTF-16LE string and returns it at UTF-8.
static aiString ReadString(StreamReaderLE* stream, uint32_t numWChars)
{
if ( 0 == numWChars ) {
static const aiString empty;
return empty;
}
// Allocate buffers (max expansion is 1 byte -> 4 bytes for UTF-8)
UTF16* temp = new UTF16[numWChars];
UTF8* str = new UTF8[numWChars * 4 + 1];
//UTF16* temp = new UTF16[numWChars];
std::vector<unsigned char> str;
str.reserve(numWChars * 4 + 1);
//unsigned char* str = new unsigned char[numWChars * 4 + 1];
uint16_t *temp = new uint16_t[numWChars];
for (uint32_t n=0;n<numWChars;n++)
temp[n] = stream->GetU2();
// Convert it and NUL-terminate.
const UTF16 *start = temp, *end = temp + numWChars;
UTF8 *dest = str, *limit = str + numWChars*4;
ConvertUTF16toUTF8(&start, end, &dest, limit, lenientConversion);
*dest = '\0';
//const UTF16 *start = temp, *end = temp + numWChars;
const uint16_t *start = temp, *end = temp + numWChars;
utf8::utf16to8(start, end, back_inserter(str));
//UTF8 *dest = str, *limit = str + numWChars*4;
//ConvertUTF16toUTF8(&start, end, &dest, limit, lenientConversion);
//*dest = '\0';
str[str.size()-1] = '\0';
// Return the final string.
aiString result = aiString((const char *)str);
delete[] str;
aiString result = aiString((const char *)&str[0]);
//delete[] str;
delete[] temp;
return result;
}

View File

@ -40,18 +40,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/** @file STEPFileEncoding.cpp
* @brief STEP character handling, string unescaping
* @brief STEP character handling, string un-escaping
*/
#include "STEPFileEncoding.h"
#include "fast_atof.h"
#include <contrib/utf8cpp/source/utf8.h>
#include "../contrib/ConvertUTF/ConvertUTF.h"
#include <memory>
using namespace Assimp;
// roman1 to utf16 table
static const UTF16 mac_codetable[] = {
static const uint16_t mac_codetable[] = {
// 0x20 unassig./nonprint. slots
0x0020 ,
0x0021 ,
@ -309,14 +309,12 @@ bool STEP::StringToUTF8(std::string& s)
ai_assert(sizeof(mac_codetable) / sizeof(mac_codetable[0]) == 0x100-0x20);
const UTF32 unival = mac_codetable[macval - 0x20], *univalp = &unival;
const uint32_t unival = mac_codetable[macval - 0x20], *univalp = &unival;
UTF8 temp[5], *tempp = temp;
ai_assert(sizeof(UTF8) == 1);
unsigned char temp[5], *tempp = temp;
ai_assert(sizeof( unsigned char ) == 1);
if(ConvertUTF32toUTF8(&univalp, univalp+1, &tempp, tempp+sizeof(temp), lenientConversion) != conversionOK) {
return false;
}
utf8::utf32to8( univalp, univalp + 1, tempp );
const size_t outcount = static_cast<size_t>(tempp-temp);
@ -355,28 +353,26 @@ bool STEP::StringToUTF8(std::string& s)
}
const size_t count = (j-basei)/4;
std::unique_ptr<UTF16[]> src(new UTF16[count]);
std::unique_ptr<uint16_t[]> src(new uint16_t[count]);
const char* cur = s.c_str() + basei;
for (size_t k = 0; k < count; ++k, cur += 4) {
src[k] = (static_cast<UTF16>(HexOctetToDecimal(cur)) << 8u) |
static_cast<UTF16>(HexOctetToDecimal(cur+2));
src[k] = (static_cast<uint16_t>(HexOctetToDecimal(cur)) << 8u) |
static_cast<uint16_t>(HexOctetToDecimal(cur+2));
}
const size_t dcount = count * 3; // this is enough to hold all possible outputs
std::unique_ptr<UTF8[]> dest(new UTF8[dcount]);
std::unique_ptr<unsigned char[]> dest(new unsigned char[dcount]);
const UTF16* srct = src.get();
UTF8* destt = dest.get();
if(ConvertUTF16toUTF8(&srct, srct+count, &destt, destt+dcount, lenientConversion) != conversionOK) {
return false;
}
const uint16_t* srct = src.get();
unsigned char* destt = dest.get();
utf8::utf16to8( srct, srct + count, destt );
const size_t outcount = static_cast<size_t>(destt-dest.get());
s.erase(i,(j+4-i));
ai_assert(sizeof(UTF8) == 1);
ai_assert(sizeof(unsigned char) == 1);
s.insert(i, reinterpret_cast<char*>(dest.get()), outcount);
i += outcount;
@ -388,37 +384,34 @@ bool STEP::StringToUTF8(std::string& s)
}
const size_t count = (j-basei)/8;
std::unique_ptr<UTF32[]> src(new UTF32[count]);
std::unique_ptr<uint32_t[]> src(new uint32_t[count]);
const char* cur = s.c_str() + basei;
for (size_t k = 0; k < count; ++k, cur += 8) {
src[k] = (static_cast<UTF32>(HexOctetToDecimal(cur )) << 24u) |
(static_cast<UTF32>(HexOctetToDecimal(cur+2)) << 16u) |
(static_cast<UTF32>(HexOctetToDecimal(cur+4)) << 8u) |
(static_cast<UTF32>(HexOctetToDecimal(cur+6)));
src[k] = (static_cast<uint32_t>(HexOctetToDecimal(cur )) << 24u) |
(static_cast<uint32_t>(HexOctetToDecimal(cur+2)) << 16u) |
(static_cast<uint32_t>(HexOctetToDecimal(cur+4)) << 8u) |
(static_cast<uint32_t>(HexOctetToDecimal(cur+6)));
}
const size_t dcount = count * 5; // this is enough to hold all possible outputs
std::unique_ptr<UTF8[]> dest(new UTF8[dcount]);
std::unique_ptr<unsigned char[]> dest(new unsigned char[dcount]);
const UTF32* srct = src.get();
UTF8* destt = dest.get();
if(ConvertUTF32toUTF8(&srct, srct+count, &destt, destt+dcount, lenientConversion) != conversionOK) {
return false;
}
const uint32_t* srct = src.get();
unsigned char* destt = dest.get();
utf8::utf32to8( srct, srct + count, destt );
const size_t outcount = static_cast<size_t>(destt-dest.get());
s.erase(i,(j+4-i));
ai_assert(sizeof(UTF8) == 1);
ai_assert(sizeof(unsigned char) == 1);
s.insert(i, reinterpret_cast<char*>(dest.get()), outcount);
i += outcount;
continue;
}
}
break;
// TODO: other encoding patterns?

View File

@ -1,539 +0,0 @@
/*
* Copyright 2001-2004 Unicode, Inc.
*
* Disclaimer
*
* This source code is provided as is by Unicode, Inc. No claims are
* made as to fitness for any particular purpose. No warranties of any
* kind are expressed or implied. The recipient agrees to determine
* applicability of information provided. If this file has been
* purchased on magnetic or optical media from Unicode, Inc., the
* sole remedy for any claim will be exchange of defective media
* within 90 days of receipt.
*
* Limitations on Rights to Redistribute This Code
*
* Unicode, Inc. hereby grants the right to freely use the information
* supplied in this file in the creation of products supporting the
* Unicode Standard, and to make copies of this file in any form
* for internal or external distribution as long as this notice
* remains attached.
*/
/* ---------------------------------------------------------------------
Conversions between UTF32, UTF-16, and UTF-8. Source code file.
Author: Mark E. Davis, 1994.
Rev History: Rick McGowan, fixes & updates May 2001.
Sept 2001: fixed const & error conditions per
mods suggested by S. Parent & A. Lillich.
June 2002: Tim Dodd added detection and handling of incomplete
source sequences, enhanced error detection, added casts
to eliminate compiler warnings.
July 2003: slight mods to back out aggressive FFFE detection.
Jan 2004: updated switches in from-UTF8 conversions.
Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
See the header file "ConvertUTF.h" for complete documentation.
------------------------------------------------------------------------ */
#include "ConvertUTF.h"
#ifdef CVTUTF_DEBUG
#include <stdio.h>
#endif
static const int halfShift = 10; /* used for shifting by 10 bits */
static const UTF32 halfBase = 0x0010000UL;
static const UTF32 halfMask = 0x3FFUL;
#define UNI_SUR_HIGH_START (UTF32)0xD800
#define UNI_SUR_HIGH_END (UTF32)0xDBFF
#define UNI_SUR_LOW_START (UTF32)0xDC00
#define UNI_SUR_LOW_END (UTF32)0xDFFF
#define false 0
#define true 1
/* --------------------------------------------------------------------- */
ConversionResult ConvertUTF32toUTF16 (
const UTF32** sourceStart, const UTF32* sourceEnd,
UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
ConversionResult result = conversionOK;
const UTF32* source = *sourceStart;
UTF16* target = *targetStart;
while (source < sourceEnd) {
UTF32 ch;
if (target >= targetEnd) {
result = targetExhausted; break;
}
ch = *source++;
if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
/* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
if (flags == strictConversion) {
--source; /* return to the illegal value itself */
result = sourceIllegal;
break;
} else {
*target++ = UNI_REPLACEMENT_CHAR;
}
} else {
*target++ = (UTF16)ch; /* normal case */
}
} else if (ch > UNI_MAX_LEGAL_UTF32) {
if (flags == strictConversion) {
result = sourceIllegal;
} else {
*target++ = UNI_REPLACEMENT_CHAR;
}
} else {
/* target is a character in range 0xFFFF - 0x10FFFF. */
if (target + 1 >= targetEnd) {
--source; /* Back up source pointer! */
result = targetExhausted; break;
}
ch -= halfBase;
*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
}
}
*sourceStart = source;
*targetStart = target;
return result;
}
/* --------------------------------------------------------------------- */
ConversionResult ConvertUTF16toUTF32 (
const UTF16** sourceStart, const UTF16* sourceEnd,
UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
ConversionResult result = conversionOK;
const UTF16* source = *sourceStart;
UTF32* target = *targetStart;
UTF32 ch, ch2;
while (source < sourceEnd) {
const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
ch = *source++;
/* If we have a surrogate pair, convert to UTF32 first. */
if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
/* If the 16 bits following the high surrogate are in the source buffer... */
if (source < sourceEnd) {
ch2 = *source;
/* If it's a low surrogate, convert to UTF32. */
if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
+ (ch2 - UNI_SUR_LOW_START) + halfBase;
++source;
} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
--source; /* return to the illegal value itself */
result = sourceIllegal;
break;
}
} else { /* We don't have the 16 bits following the high surrogate. */
--source; /* return to the high surrogate */
result = sourceExhausted;
break;
}
} else if (flags == strictConversion) {
/* UTF-16 surrogate values are illegal in UTF-32 */
if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
--source; /* return to the illegal value itself */
result = sourceIllegal;
break;
}
}
if (target >= targetEnd) {
source = oldSource; /* Back up source pointer! */
result = targetExhausted; break;
}
*target++ = ch;
}
*sourceStart = source;
*targetStart = target;
#ifdef CVTUTF_DEBUG
if (result == sourceIllegal) {
fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
fflush(stderr);
}
#endif
return result;
}
/* --------------------------------------------------------------------- */
/*
* Index into the table below with the first byte of a UTF-8 sequence to
* get the number of trailing bytes that are supposed to follow it.
* Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
* left as-is for anyone who may want to do such conversion, which was
* allowed in earlier algorithms.
*/
static const char trailingBytesForUTF8[256] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
/*
* Magic values subtracted from a buffer value during UTF8 conversion.
* This table contains as many values as there might be trailing bytes
* in a UTF-8 sequence.
*/
static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
0x03C82080UL, 0xFA082080UL, 0x82082080UL };
/*
* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
* into the first byte, depending on how many bytes follow. There are
* as many entries in this table as there are UTF-8 sequence types.
* (I.e., one byte sequence, two byte... etc.). Remember that sequencs
* for *legal* UTF-8 will be 4 or fewer bytes total.
*/
static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
/* --------------------------------------------------------------------- */
/* The interface converts a whole buffer to avoid function-call overhead.
* Constants have been gathered. Loops & conditionals have been removed as
* much as possible for efficiency, in favor of drop-through switches.
* (See "Note A" at the bottom of the file for equivalent code.)
* If your compiler supports it, the "isLegalUTF8" call can be turned
* into an inline function.
*/
/* --------------------------------------------------------------------- */
ConversionResult ConvertUTF16toUTF8 (
const UTF16** sourceStart, const UTF16* sourceEnd,
UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
ConversionResult result = conversionOK;
const UTF16* source = *sourceStart;
UTF8* target = *targetStart;
while (source < sourceEnd) {
UTF32 ch;
unsigned short bytesToWrite = 0;
const UTF32 byteMask = 0xBF;
const UTF32 byteMark = 0x80;
const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
ch = *source++;
/* If we have a surrogate pair, convert to UTF32 first. */
if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
/* If the 16 bits following the high surrogate are in the source buffer... */
if (source < sourceEnd) {
UTF32 ch2 = *source;
/* If it's a low surrogate, convert to UTF32. */
if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
+ (ch2 - UNI_SUR_LOW_START) + halfBase;
++source;
} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
--source; /* return to the illegal value itself */
result = sourceIllegal;
break;
}
} else { /* We don't have the 16 bits following the high surrogate. */
--source; /* return to the high surrogate */
result = sourceExhausted;
break;
}
} else if (flags == strictConversion) {
/* UTF-16 surrogate values are illegal in UTF-32 */
if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
--source; /* return to the illegal value itself */
result = sourceIllegal;
break;
}
}
/* Figure out how many bytes the result will require */
if (ch < (UTF32)0x80) { bytesToWrite = 1;
} else if (ch < (UTF32)0x800) { bytesToWrite = 2;
} else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
} else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
} else { bytesToWrite = 3;
ch = UNI_REPLACEMENT_CHAR;
}
target += bytesToWrite;
if (target > targetEnd) {
source = oldSource; /* Back up source pointer! */
target -= bytesToWrite; result = targetExhausted; break;
}
switch (bytesToWrite) { /* note: everything falls through. */
case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
}
target += bytesToWrite;
}
*sourceStart = source;
*targetStart = target;
return result;
}
/* --------------------------------------------------------------------- */
/*
* Utility routine to tell whether a sequence of bytes is legal UTF-8.
* This must be called with the length pre-determined by the first byte.
* If not calling this from ConvertUTF8to*, then the length can be set by:
* length = trailingBytesForUTF8[*source]+1;
* and the sequence is illegal right away if there aren't that many bytes
* available.
* If presented with a length > 4, this returns false. The Unicode
* definition of UTF-8 goes up to 4-byte sequences.
*/
static Boolean isLegalUTF8(const UTF8 *source, int length) {
UTF8 a;
const UTF8 *srcptr = source+length;
switch (length) {
default: return false;
/* Everything else falls through when "true"... */
case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
case 2: if ((a = (*--srcptr)) > 0xBF) return false;
switch (*source) {
/* no fall-through in this inner switch */
case 0xE0: if (a < 0xA0) return false; break;
case 0xED: if (a > 0x9F) return false; break;
case 0xF0: if (a < 0x90) return false; break;
case 0xF4: if (a > 0x8F) return false; break;
default: if (a < 0x80) return false;
}
case 1: if (*source >= 0x80 && *source < 0xC2) return false;
}
if (*source > 0xF4) return false;
return true;
}
/* --------------------------------------------------------------------- */
/*
* Exported function to return whether a UTF-8 sequence is legal or not.
* This is not used here; it's just exported.
*/
Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
int length = trailingBytesForUTF8[*source]+1;
if (source+length > sourceEnd) {
return false;
}
return isLegalUTF8(source, length);
}
/* --------------------------------------------------------------------- */
ConversionResult ConvertUTF8toUTF16 (
const UTF8** sourceStart, const UTF8* sourceEnd,
UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
ConversionResult result = conversionOK;
const UTF8* source = *sourceStart;
UTF16* target = *targetStart;
while (source < sourceEnd) {
UTF32 ch = 0;
unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
if (source + extraBytesToRead >= sourceEnd) {
result = sourceExhausted; break;
}
/* Do this check whether lenient or strict */
if (! isLegalUTF8(source, extraBytesToRead+1)) {
result = sourceIllegal;
break;
}
/*
* The cases all fall through. See "Note A" below.
*/
switch (extraBytesToRead) {
case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
case 3: ch += *source++; ch <<= 6;
case 2: ch += *source++; ch <<= 6;
case 1: ch += *source++; ch <<= 6;
case 0: ch += *source++;
}
ch -= offsetsFromUTF8[extraBytesToRead];
if (target >= targetEnd) {
source -= (extraBytesToRead+1); /* Back up source pointer! */
result = targetExhausted; break;
}
if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
/* UTF-16 surrogate values are illegal in UTF-32 */
if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
if (flags == strictConversion) {
source -= (extraBytesToRead+1); /* return to the illegal value itself */
result = sourceIllegal;
break;
} else {
*target++ = UNI_REPLACEMENT_CHAR;
}
} else {
*target++ = (UTF16)ch; /* normal case */
}
} else if (ch > UNI_MAX_UTF16) {
if (flags == strictConversion) {
result = sourceIllegal;
source -= (extraBytesToRead+1); /* return to the start */
break; /* Bail out; shouldn't continue */
} else {
*target++ = UNI_REPLACEMENT_CHAR;
}
} else {
/* target is a character in range 0xFFFF - 0x10FFFF. */
if (target + 1 >= targetEnd) {
source -= (extraBytesToRead+1); /* Back up source pointer! */
result = targetExhausted; break;
}
ch -= halfBase;
*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
}
}
*sourceStart = source;
*targetStart = target;
return result;
}
/* --------------------------------------------------------------------- */
ConversionResult ConvertUTF32toUTF8 (
const UTF32** sourceStart, const UTF32* sourceEnd,
UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
ConversionResult result = conversionOK;
const UTF32* source = *sourceStart;
UTF8* target = *targetStart;
while (source < sourceEnd) {
UTF32 ch;
unsigned short bytesToWrite = 0;
const UTF32 byteMask = 0xBF;
const UTF32 byteMark = 0x80;
ch = *source++;
if (flags == strictConversion ) {
/* UTF-16 surrogate values are illegal in UTF-32 */
if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
--source; /* return to the illegal value itself */
result = sourceIllegal;
break;
}
}
/*
* Figure out how many bytes the result will require. Turn any
* illegally large UTF32 things (> Plane 17) into replacement chars.
*/
if (ch < (UTF32)0x80) { bytesToWrite = 1;
} else if (ch < (UTF32)0x800) { bytesToWrite = 2;
} else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
} else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
} else { bytesToWrite = 3;
ch = UNI_REPLACEMENT_CHAR;
result = sourceIllegal;
}
target += bytesToWrite;
if (target > targetEnd) {
--source; /* Back up source pointer! */
target -= bytesToWrite; result = targetExhausted; break;
}
switch (bytesToWrite) { /* note: everything falls through. */
case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
}
target += bytesToWrite;
}
*sourceStart = source;
*targetStart = target;
return result;
}
/* --------------------------------------------------------------------- */
ConversionResult ConvertUTF8toUTF32 (
const UTF8** sourceStart, const UTF8* sourceEnd,
UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
ConversionResult result = conversionOK;
const UTF8* source = *sourceStart;
UTF32* target = *targetStart;
while (source < sourceEnd) {
UTF32 ch = 0;
unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
if (source + extraBytesToRead >= sourceEnd) {
result = sourceExhausted; break;
}
/* Do this check whether lenient or strict */
if (! isLegalUTF8(source, extraBytesToRead+1)) {
result = sourceIllegal;
break;
}
/*
* The cases all fall through. See "Note A" below.
*/
switch (extraBytesToRead) {
case 5: ch += *source++; ch <<= 6;
case 4: ch += *source++; ch <<= 6;
case 3: ch += *source++; ch <<= 6;
case 2: ch += *source++; ch <<= 6;
case 1: ch += *source++; ch <<= 6;
case 0: ch += *source++;
}
ch -= offsetsFromUTF8[extraBytesToRead];
if (target >= targetEnd) {
source -= (extraBytesToRead+1); /* Back up the source pointer! */
result = targetExhausted; break;
}
if (ch <= UNI_MAX_LEGAL_UTF32) {
/*
* UTF-16 surrogate values are illegal in UTF-32, and anything
* over Plane 17 (> 0x10FFFF) is illegal.
*/
if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
if (flags == strictConversion) {
source -= (extraBytesToRead+1); /* return to the illegal value itself */
result = sourceIllegal;
break;
} else {
*target++ = UNI_REPLACEMENT_CHAR;
}
} else {
*target++ = ch;
}
} else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
result = sourceIllegal;
*target++ = UNI_REPLACEMENT_CHAR;
}
}
*sourceStart = source;
*targetStart = target;
return result;
}
/* ---------------------------------------------------------------------
Note A.
The fall-through switches in UTF-8 reading code save a
temp variable, some decrements & conditionals. The switches
are equivalent to the following loop:
{
int tmpBytesToRead = extraBytesToRead+1;
do {
ch += *source++;
--tmpBytesToRead;
if (tmpBytesToRead) ch <<= 6;
} while (tmpBytesToRead > 0);
}
In UTF-8 writing code, the switches on "bytesToWrite" are
similarly unrolled loops.
--------------------------------------------------------------------- */

View File

@ -1,151 +0,0 @@
/*
* Copyright 2001-2004 Unicode, Inc.
*
* Disclaimer
*
* This source code is provided as is by Unicode, Inc. No claims are
* made as to fitness for any particular purpose. No warranties of any
* kind are expressed or implied. The recipient agrees to determine
* applicability of information provided. If this file has been
* purchased on magnetic or optical media from Unicode, Inc., the
* sole remedy for any claim will be exchange of defective media
* within 90 days of receipt.
*
* Limitations on Rights to Redistribute This Code
*
* Unicode, Inc. hereby grants the right to freely use the information
* supplied in this file in the creation of products supporting the
* Unicode Standard, and to make copies of this file in any form
* for internal or external distribution as long as this notice
* remains attached.
*/
#ifndef CONVERTUTF_H
#define CONVERTUTF_H
/* ---------------------------------------------------------------------
Conversions between UTF32, UTF-16, and UTF-8. Header file.
Several funtions are included here, forming a complete set of
conversions between the three formats. UTF-7 is not included
here, but is handled in a separate source file.
Each of these routines takes pointers to input buffers and output
buffers. The input buffers are const.
Each routine converts the text between *sourceStart and sourceEnd,
putting the result into the buffer between *targetStart and
targetEnd. Note: the end pointers are *after* the last item: e.g.
*(sourceEnd - 1) is the last item.
The return result indicates whether the conversion was successful,
and if not, whether the problem was in the source or target buffers.
(Only the first encountered problem is indicated.)
After the conversion, *sourceStart and *targetStart are both
updated to point to the end of last text successfully converted in
the respective buffers.
Input parameters:
sourceStart - pointer to a pointer to the source buffer.
The contents of this are modified on return so that
it points at the next thing to be converted.
targetStart - similarly, pointer to pointer to the target buffer.
sourceEnd, targetEnd - respectively pointers to the ends of the
two buffers, for overflow checking only.
These conversion functions take a ConversionFlags argument. When this
flag is set to strict, both irregular sequences and isolated surrogates
will cause an error. When the flag is set to lenient, both irregular
sequences and isolated surrogates are converted.
Whether the flag is strict or lenient, all illegal sequences will cause
an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
must check for illegal sequences.
When the flag is set to lenient, characters over 0x10FFFF are converted
to the replacement character; otherwise (when the flag is set to strict)
they constitute an error.
Output parameters:
The value "sourceIllegal" is returned from some routines if the input
sequence is malformed. When "sourceIllegal" is returned, the source
value will point to the illegal value that caused the problem. E.g.,
in UTF-8 when a sequence is malformed, it points to the start of the
malformed sequence.
Author: Mark E. Davis, 1994.
Rev History: Rick McGowan, fixes & updates May 2001.
Fixes & updates, Sept 2001.
------------------------------------------------------------------------ */
/* ---------------------------------------------------------------------
The following 4 definitions are compiler-specific.
The C standard does not guarantee that wchar_t has at least
16 bits, so wchar_t is no less portable than unsigned short!
All should be unsigned values to avoid sign extension during
bit mask & shift operations.
------------------------------------------------------------------------ */
typedef unsigned long UTF32; /* at least 32 bits */
typedef unsigned short UTF16; /* at least 16 bits */
typedef unsigned char UTF8; /* typically 8 bits */
typedef unsigned char Boolean; /* 0 or 1 */
/* Some fundamental constants */
#define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
#define UNI_MAX_BMP (UTF32)0x0000FFFF
#define UNI_MAX_UTF16 (UTF32)0x0010FFFF
#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF
typedef enum {
conversionOK, /* conversion successful */
sourceExhausted, /* partial character in source, but hit end */
targetExhausted, /* insuff. room in target for conversion */
sourceIllegal /* source sequence is illegal/malformed */
} ConversionResult;
typedef enum {
strictConversion = 0,
lenientConversion
} ConversionFlags;
/* This is for C++ and does no harm in C */
#ifdef __cplusplus
extern "C" {
#endif
ConversionResult ConvertUTF8toUTF16 (
const UTF8** sourceStart, const UTF8* sourceEnd,
UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
ConversionResult ConvertUTF16toUTF8 (
const UTF16** sourceStart, const UTF16* sourceEnd,
UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
ConversionResult ConvertUTF8toUTF32 (
const UTF8** sourceStart, const UTF8* sourceEnd,
UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
ConversionResult ConvertUTF32toUTF8 (
const UTF32** sourceStart, const UTF32* sourceEnd,
UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
ConversionResult ConvertUTF16toUTF32 (
const UTF16** sourceStart, const UTF16* sourceEnd,
UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
ConversionResult ConvertUTF32toUTF16 (
const UTF32** sourceStart, const UTF32* sourceEnd,
UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
#ifdef __cplusplus
}
#endif
/* --------------------------------------------------------------------- */
#endif // CONVERTUTF_H

View File

@ -1,43 +0,0 @@
The accompanying C source code file "ConvertUTF.c" and the associated header
file "ConvertUTF.h" provide for conversion between various transformation
formats of Unicode characters. The following conversions are supported:
UTF-32 to UTF-16
UTF-32 to UTF-8
UTF-16 to UTF-32
UTF-16 to UTF-8
UTF-8 to UTF-16
UTF-8 to UTF-32
In addition, there is a test harness which runs various tests.
The files "CVTUTF7.C" and "CVTUTF7.H" are for archival and historical purposes
only. They have not been updated to Unicode 3.0 or later and should be
considered obsolescent. "CVTUTF7.C" contains two functions that can convert
between UCS2 (i.e., the BMP characters only) and UTF-7. Surrogates are
not supported, the code has not been tested, and should be considered
unsuitable for general purpose use.
Please submit any bug reports about these programs here:
http://www.unicode.org/unicode/reporting.html
Version 1.0: initial version.
Version 1.1: corrected some minor problems; added stricter checks.
Version 1.2: corrected switch statements associated with "extraBytesToRead"
in 4 & 5 byte cases, in functions for conversion from UTF8.
Note: formally, the 4 & 5 byte cases are illegal in the latest
UTF8, but the table and this code has always catered for those,
cases since at one time they were legal.
Version 1.3: Updated UTF-8 legality check;
updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions
Updated UTF-8 legality tests in harness.c
Last update: October 19, 2004

View File

@ -0,0 +1,12 @@
utf8 cpp library
Release 2.3.4
A minor bug fix release. Thanks to all who reported bugs.
Note: Version 2.3.3 contained a regression, and therefore was removed.
Changes from version 2.3.2
- Bug fix [39]: checked.h Line 273 and unchecked.h Line 182 have an extra ';'
- Bug fix [36]: replace_invalid() only works with back_inserter
Files included in the release: utf8.h, core.h, checked.h, unchecked.h, utf8cpp.html, ReleaseNotes

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,34 @@
// Copyright 2006 Nemanja Trifunovic
/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
#include "utf8/checked.h"
#include "utf8/unchecked.h"
#endif // header guard

View File

@ -0,0 +1,327 @@
// Copyright 2006 Nemanja Trifunovic
/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#include "core.h"
#include <stdexcept>
namespace utf8
{
// Base for the exceptions that may be thrown from the library
class exception : public ::std::exception {
};
// Exceptions that may be thrown from the library functions.
class invalid_code_point : public exception {
uint32_t cp;
public:
invalid_code_point(uint32_t cp) : cp(cp) {}
virtual const char* what() const throw() { return "Invalid code point"; }
uint32_t code_point() const {return cp;}
};
class invalid_utf8 : public exception {
uint8_t u8;
public:
invalid_utf8 (uint8_t u) : u8(u) {}
virtual const char* what() const throw() { return "Invalid UTF-8"; }
uint8_t utf8_octet() const {return u8;}
};
class invalid_utf16 : public exception {
uint16_t u16;
public:
invalid_utf16 (uint16_t u) : u16(u) {}
virtual const char* what() const throw() { return "Invalid UTF-16"; }
uint16_t utf16_word() const {return u16;}
};
class not_enough_room : public exception {
public:
virtual const char* what() const throw() { return "Not enough space"; }
};
/// The library API - functions intended to be called by the users
template <typename octet_iterator>
octet_iterator append(uint32_t cp, octet_iterator result)
{
if (!utf8::internal::is_code_point_valid(cp))
throw invalid_code_point(cp);
if (cp < 0x80) // one octet
*(result++) = static_cast<uint8_t>(cp);
else if (cp < 0x800) { // two octets
*(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
}
else if (cp < 0x10000) { // three octets
*(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
}
else { // four octets
*(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
*(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
}
return result;
}
template <typename octet_iterator, typename output_iterator>
output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
{
while (start != end) {
octet_iterator sequence_start = start;
internal::utf_error err_code = utf8::internal::validate_next(start, end);
switch (err_code) {
case internal::UTF8_OK :
for (octet_iterator it = sequence_start; it != start; ++it)
*out++ = *it;
break;
case internal::NOT_ENOUGH_ROOM:
throw not_enough_room();
case internal::INVALID_LEAD:
out = utf8::append (replacement, out);
++start;
break;
case internal::INCOMPLETE_SEQUENCE:
case internal::OVERLONG_SEQUENCE:
case internal::INVALID_CODE_POINT:
out = utf8::append (replacement, out);
++start;
// just one replacement mark for the sequence
while (start != end && utf8::internal::is_trail(*start))
++start;
break;
}
}
return out;
}
template <typename octet_iterator, typename output_iterator>
inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
{
static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd);
return utf8::replace_invalid(start, end, out, replacement_marker);
}
template <typename octet_iterator>
uint32_t next(octet_iterator& it, octet_iterator end)
{
uint32_t cp = 0;
internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
switch (err_code) {
case internal::UTF8_OK :
break;
case internal::NOT_ENOUGH_ROOM :
throw not_enough_room();
case internal::INVALID_LEAD :
case internal::INCOMPLETE_SEQUENCE :
case internal::OVERLONG_SEQUENCE :
throw invalid_utf8(*it);
case internal::INVALID_CODE_POINT :
throw invalid_code_point(cp);
}
return cp;
}
template <typename octet_iterator>
uint32_t peek_next(octet_iterator it, octet_iterator end)
{
return utf8::next(it, end);
}
template <typename octet_iterator>
uint32_t prior(octet_iterator& it, octet_iterator start)
{
// can't do much if it == start
if (it == start)
throw not_enough_room();
octet_iterator end = it;
// Go back until we hit either a lead octet or start
while (utf8::internal::is_trail(*(--it)))
if (it == start)
throw invalid_utf8(*it); // error - no lead byte in the sequence
return utf8::peek_next(it, end);
}
/// Deprecated in versions that include "prior"
template <typename octet_iterator>
uint32_t previous(octet_iterator& it, octet_iterator pass_start)
{
octet_iterator end = it;
while (utf8::internal::is_trail(*(--it)))
if (it == pass_start)
throw invalid_utf8(*it); // error - no lead byte in the sequence
octet_iterator temp = it;
return utf8::next(temp, end);
}
template <typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n, octet_iterator end)
{
for (distance_type i = 0; i < n; ++i)
utf8::next(it, end);
}
template <typename octet_iterator>
typename std::iterator_traits<octet_iterator>::difference_type
distance (octet_iterator first, octet_iterator last)
{
typename std::iterator_traits<octet_iterator>::difference_type dist;
for (dist = 0; first < last; ++dist)
utf8::next(first, last);
return dist;
}
template <typename u16bit_iterator, typename octet_iterator>
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
{
while (start != end) {
uint32_t cp = utf8::internal::mask16(*start++);
// Take care of surrogate pairs first
if (utf8::internal::is_lead_surrogate(cp)) {
if (start != end) {
uint32_t trail_surrogate = utf8::internal::mask16(*start++);
if (utf8::internal::is_trail_surrogate(trail_surrogate))
cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
else
throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
}
else
throw invalid_utf16(static_cast<uint16_t>(cp));
}
// Lone trail surrogate
else if (utf8::internal::is_trail_surrogate(cp))
throw invalid_utf16(static_cast<uint16_t>(cp));
result = utf8::append(cp, result);
}
return result;
}
template <typename u16bit_iterator, typename octet_iterator>
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
{
while (start != end) {
uint32_t cp = utf8::next(start, end);
if (cp > 0xffff) { //make a surrogate pair
*result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
*result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
}
else
*result++ = static_cast<uint16_t>(cp);
}
return result;
}
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
{
while (start != end)
result = utf8::append(*(start++), result);
return result;
}
template <typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
{
while (start != end)
(*result++) = utf8::next(start, end);
return result;
}
// The iterator class
template <typename octet_iterator>
class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
octet_iterator it;
octet_iterator range_start;
octet_iterator range_end;
public:
iterator () {}
explicit iterator (const octet_iterator& octet_it,
const octet_iterator& range_start,
const octet_iterator& range_end) :
it(octet_it), range_start(range_start), range_end(range_end)
{
if (it < range_start || it > range_end)
throw std::out_of_range("Invalid utf-8 iterator position");
}
// the default "big three" are OK
octet_iterator base () const { return it; }
uint32_t operator * () const
{
octet_iterator temp = it;
return utf8::next(temp, range_end);
}
bool operator == (const iterator& rhs) const
{
if (range_start != rhs.range_start || range_end != rhs.range_end)
throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
return (it == rhs.it);
}
bool operator != (const iterator& rhs) const
{
return !(operator == (rhs));
}
iterator& operator ++ ()
{
utf8::next(it, range_end);
return *this;
}
iterator operator ++ (int)
{
iterator temp = *this;
utf8::next(it, range_end);
return temp;
}
iterator& operator -- ()
{
utf8::prior(it, range_start);
return *this;
}
iterator operator -- (int)
{
iterator temp = *this;
utf8::prior(it, range_start);
return temp;
}
}; // class iterator
} // namespace utf8
#endif //header guard

View File

@ -0,0 +1,329 @@
// Copyright 2006 Nemanja Trifunovic
/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#include <iterator>
namespace utf8
{
// The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
// You may need to change them to match your system.
// These typedefs have the same names as ones from cstdint, or boost/cstdint
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
// Helper code - not intended to be directly called by the library users. May be changed at any time
namespace internal
{
// Unicode constants
// Leading (high) surrogates: 0xd800 - 0xdbff
// Trailing (low) surrogates: 0xdc00 - 0xdfff
const uint16_t LEAD_SURROGATE_MIN = 0xd800u;
const uint16_t LEAD_SURROGATE_MAX = 0xdbffu;
const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10);
const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;
// Maximum valid value for a Unicode code point
const uint32_t CODE_POINT_MAX = 0x0010ffffu;
template<typename octet_type>
inline uint8_t mask8(octet_type oc)
{
return static_cast<uint8_t>(0xff & oc);
}
template<typename u16_type>
inline uint16_t mask16(u16_type oc)
{
return static_cast<uint16_t>(0xffff & oc);
}
template<typename octet_type>
inline bool is_trail(octet_type oc)
{
return ((utf8::internal::mask8(oc) >> 6) == 0x2);
}
template <typename u16>
inline bool is_lead_surrogate(u16 cp)
{
return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
}
template <typename u16>
inline bool is_trail_surrogate(u16 cp)
{
return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
}
template <typename u16>
inline bool is_surrogate(u16 cp)
{
return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
}
template <typename u32>
inline bool is_code_point_valid(u32 cp)
{
return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
}
template <typename octet_iterator>
inline typename std::iterator_traits<octet_iterator>::difference_type
sequence_length(octet_iterator lead_it)
{
uint8_t lead = utf8::internal::mask8(*lead_it);
if (lead < 0x80)
return 1;
else if ((lead >> 5) == 0x6)
return 2;
else if ((lead >> 4) == 0xe)
return 3;
else if ((lead >> 3) == 0x1e)
return 4;
else
return 0;
}
template <typename octet_difference_type>
inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
{
if (cp < 0x80) {
if (length != 1)
return true;
}
else if (cp < 0x800) {
if (length != 2)
return true;
}
else if (cp < 0x10000) {
if (length != 3)
return true;
}
return false;
}
enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
/// Helper for get_sequence_x
template <typename octet_iterator>
utf_error increase_safely(octet_iterator& it, octet_iterator end)
{
if (++it == end)
return NOT_ENOUGH_ROOM;
if (!utf8::internal::is_trail(*it))
return INCOMPLETE_SEQUENCE;
return UTF8_OK;
}
#define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}
/// get_sequence_x functions decode utf-8 sequences of the length x
template <typename octet_iterator>
utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
{
if (it == end)
return NOT_ENOUGH_ROOM;
code_point = utf8::internal::mask8(*it);
return UTF8_OK;
}
template <typename octet_iterator>
utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
{
if (it == end)
return NOT_ENOUGH_ROOM;
code_point = utf8::internal::mask8(*it);
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
return UTF8_OK;
}
template <typename octet_iterator>
utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
{
if (it == end)
return NOT_ENOUGH_ROOM;
code_point = utf8::internal::mask8(*it);
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
code_point += (*it) & 0x3f;
return UTF8_OK;
}
template <typename octet_iterator>
utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
{
if (it == end)
return NOT_ENOUGH_ROOM;
code_point = utf8::internal::mask8(*it);
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;
UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
code_point += (*it) & 0x3f;
return UTF8_OK;
}
#undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
template <typename octet_iterator>
utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
{
// Save the original value of it so we can go back in case of failure
// Of course, it does not make much sense with i.e. stream iterators
octet_iterator original_it = it;
uint32_t cp = 0;
// Determine the sequence length based on the lead octet
typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
const octet_difference_type length = utf8::internal::sequence_length(it);
// Get trail octets and calculate the code point
utf_error err = UTF8_OK;
switch (length) {
case 0:
return INVALID_LEAD;
case 1:
err = utf8::internal::get_sequence_1(it, end, cp);
break;
case 2:
err = utf8::internal::get_sequence_2(it, end, cp);
break;
case 3:
err = utf8::internal::get_sequence_3(it, end, cp);
break;
case 4:
err = utf8::internal::get_sequence_4(it, end, cp);
break;
}
if (err == UTF8_OK) {
// Decoding succeeded. Now, security checks...
if (utf8::internal::is_code_point_valid(cp)) {
if (!utf8::internal::is_overlong_sequence(cp, length)){
// Passed! Return here.
code_point = cp;
++it;
return UTF8_OK;
}
else
err = OVERLONG_SEQUENCE;
}
else
err = INVALID_CODE_POINT;
}
// Failure branch - restore the original value of the iterator
it = original_it;
return err;
}
template <typename octet_iterator>
inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
uint32_t ignored;
return utf8::internal::validate_next(it, end, ignored);
}
} // namespace internal
/// The library API - functions intended to be called by the users
// Byte order mark
const uint8_t bom[] = {0xef, 0xbb, 0xbf};
template <typename octet_iterator>
octet_iterator find_invalid(octet_iterator start, octet_iterator end)
{
octet_iterator result = start;
while (result != end) {
utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end);
if (err_code != internal::UTF8_OK)
return result;
}
return result;
}
template <typename octet_iterator>
inline bool is_valid(octet_iterator start, octet_iterator end)
{
return (utf8::find_invalid(start, end) == end);
}
template <typename octet_iterator>
inline bool starts_with_bom (octet_iterator it, octet_iterator end)
{
return (
((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
((it != end) && (utf8::internal::mask8(*it)) == bom[2])
);
}
//Deprecated in release 2.3
template <typename octet_iterator>
inline bool is_bom (octet_iterator it)
{
return (
(utf8::internal::mask8(*it++)) == bom[0] &&
(utf8::internal::mask8(*it++)) == bom[1] &&
(utf8::internal::mask8(*it)) == bom[2]
);
}
} // namespace utf8
#endif // header guard

View File

@ -0,0 +1,228 @@
// Copyright 2006 Nemanja Trifunovic
/*
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
#include "core.h"
namespace utf8
{
namespace unchecked
{
template <typename octet_iterator>
octet_iterator append(uint32_t cp, octet_iterator result)
{
if (cp < 0x80) // one octet
*(result++) = static_cast<uint8_t>(cp);
else if (cp < 0x800) { // two octets
*(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
}
else if (cp < 0x10000) { // three octets
*(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
}
else { // four octets
*(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
*(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)| 0x80);
*(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
*(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
}
return result;
}
template <typename octet_iterator>
uint32_t next(octet_iterator& it)
{
uint32_t cp = utf8::internal::mask8(*it);
typename std::iterator_traits<octet_iterator>::difference_type length = utf8::internal::sequence_length(it);
switch (length) {
case 1:
break;
case 2:
it++;
cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
break;
case 3:
++it;
cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
++it;
cp += (*it) & 0x3f;
break;
case 4:
++it;
cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
++it;
cp += (utf8::internal::mask8(*it) << 6) & 0xfff;
++it;
cp += (*it) & 0x3f;
break;
}
++it;
return cp;
}
template <typename octet_iterator>
uint32_t peek_next(octet_iterator it)
{
return utf8::unchecked::next(it);
}
template <typename octet_iterator>
uint32_t prior(octet_iterator& it)
{
while (utf8::internal::is_trail(*(--it))) ;
octet_iterator temp = it;
return utf8::unchecked::next(temp);
}
// Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous)
template <typename octet_iterator>
inline uint32_t previous(octet_iterator& it)
{
return utf8::unchecked::prior(it);
}
template <typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n)
{
for (distance_type i = 0; i < n; ++i)
utf8::unchecked::next(it);
}
template <typename octet_iterator>
typename std::iterator_traits<octet_iterator>::difference_type
distance (octet_iterator first, octet_iterator last)
{
typename std::iterator_traits<octet_iterator>::difference_type dist;
for (dist = 0; first < last; ++dist)
utf8::unchecked::next(first);
return dist;
}
template <typename u16bit_iterator, typename octet_iterator>
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
{
while (start != end) {
uint32_t cp = utf8::internal::mask16(*start++);
// Take care of surrogate pairs first
if (utf8::internal::is_lead_surrogate(cp)) {
uint32_t trail_surrogate = utf8::internal::mask16(*start++);
cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
}
result = utf8::unchecked::append(cp, result);
}
return result;
}
template <typename u16bit_iterator, typename octet_iterator>
u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
{
while (start < end) {
uint32_t cp = utf8::unchecked::next(start);
if (cp > 0xffff) { //make a surrogate pair
*result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
*result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
}
else
*result++ = static_cast<uint16_t>(cp);
}
return result;
}
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
{
while (start != end)
result = utf8::unchecked::append(*(start++), result);
return result;
}
template <typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
{
while (start < end)
(*result++) = utf8::unchecked::next(start);
return result;
}
// The iterator class
template <typename octet_iterator>
class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
octet_iterator it;
public:
iterator () {}
explicit iterator (const octet_iterator& octet_it): it(octet_it) {}
// the default "big three" are OK
octet_iterator base () const { return it; }
uint32_t operator * () const
{
octet_iterator temp = it;
return utf8::unchecked::next(temp);
}
bool operator == (const iterator& rhs) const
{
return (it == rhs.it);
}
bool operator != (const iterator& rhs) const
{
return !(operator == (rhs));
}
iterator& operator ++ ()
{
::std::advance(it, utf8::internal::sequence_length(it));
return *this;
}
iterator operator ++ (int)
{
iterator temp = *this;
::std::advance(it, utf8::internal::sequence_length(it));
return temp;
}
iterator& operator -- ()
{
utf8::unchecked::prior(it);
return *this;
}
iterator operator -- (int)
{
iterator temp = *this;
utf8::unchecked::prior(it);
return temp;
}
}; // class iterator
} // namespace utf8::unchecked
} // namespace utf8
#endif // header guard