Implementing UTF8 to ISO-8859-1 conversion to improve chances of finding files in zip archives

pull/261/head
Léo Terziman 2013-11-26 16:34:16 +01:00
parent 0bf4aea9d5
commit e2676ec176
2 changed files with 46 additions and 0 deletions

View File

@ -379,6 +379,43 @@ void BaseImporter::ConvertToUTF8(std::vector<char>& data)
}
}
// ------------------------------------------------------------------------------------------------
// Convert UTF8 data to ISO-8859-1
//
// The string is rewritten in place: the write cursor never overtakes the read
// cursor, so no temporary buffer is needed, and the string is shrunk to the
// converted length at the end.
//
// Only code points U+0000..U+00FF exist in ISO-8859-1. In UTF8 these are either
// a single ASCII byte, or a two-byte sequence whose lead byte is 0xC2
// (U+0080..U+00BF) or 0xC3 (U+00C0..U+00FF) followed by one continuation byte
// (10xxxxxx). Anything else is reported through the logger and the offending
// byte pair is copied through unchanged.
void BaseImporter::ConvertUTF8toISO8859_1(std::string& data)
{
    // Use the string's own size type: 'unsigned int' would truncate on 64-bit targets.
    const std::string::size_type size = data.size();
    std::string::size_type i = 0; // read cursor
    std::string::size_type j = 0; // write cursor, always <= i

    while (i < size) {
        const unsigned char lead = static_cast<unsigned char>(data[i]);

        if (lead < 0x80) {
            // Plain ASCII maps to itself.
            data[j] = data[i];
        } else if (i + 1 < size) {
            const unsigned char next = static_cast<unsigned char>(data[i + 1]);
            // A valid trailing byte has the bit pattern 10xxxxxx.
            const bool isContinuation = (next & 0xC0) == 0x80;

            if (isContinuation && lead == 0xC2) {
                // U+0080..U+00BF: the continuation byte already is the Latin-1 value.
                data[j] = static_cast<char>(next);
                ++i;
            } else if (isContinuation && lead == 0xC3) {
                // U+00C0..U+00FF: continuation byte 0x80..0xBF shifted up by 0x40.
                data[j] = static_cast<char>(next + 0x40);
                ++i;
            } else {
                // Outside Latin-1, or malformed. Cast to an integer type so that
                // std::hex applies; streaming a char would emit the raw byte.
                std::stringstream stream;
                stream << "UTF8 code " << std::hex
                       << static_cast<unsigned int>(lead) << ' '
                       << static_cast<unsigned int>(next)
                       << " can not be converted into ISO-8859-1.";
                DefaultLogger::get()->error(stream.str());
                // Keep both bytes as they are.
                data[j++] = data[i++];
                data[j] = data[i];
            }
        } else {
            // A non-ASCII byte at the very end of the string has no partner byte.
            DefaultLogger::get()->error("UTF8 code but only one character remaining");
            data[j] = data[i];
        }
        ++i;
        ++j;
    }
    data.resize(j);
}
// ------------------------------------------------------------------------------------------------
void BaseImporter::TextFileToBuffer(IOStream* stream,
std::vector<char>& data)

View File

@ -331,6 +331,15 @@ public: // static utilities
static void ConvertToUTF8(
std::vector<char>& data);
// -------------------------------------------------------------------
/** A utility for all text file loaders. It converts a string from our
* UTF8 character set back to ISO-8859-1, in place. Sequences that cannot
* be converted are reported through the logger, but otherwise ignored
* (the affected bytes are left unchanged).
*
* @param data Buffer to be converted from UTF8 to ISO-8859-1. The buffer
* is resized as appropriate. */
static void ConvertUTF8toISO8859_1(
std::string& data);
// -------------------------------------------------------------------
/** Utility for text file loaders which copies the contents of the
* file into a memory buffer and converts it to our UTF8