- STEP: support UTF16, UTF32 and Mac/Roman escape sequences.

2013-01-24 19:51:46 +01:00 · 2013-01-24 19:51:46 +01:00 · 03c01685d3
parent da88ab4408
commit 03c01685d3
4 changed files with 505 additions and 28 deletions
--- a/code/CMakeLists.txt
+++ b/code/CMakeLists.txt
@ -384,6 +384,8 @@ SET(IFC_SRCS
 	STEPFile.h
 	STEPFileReader.h
 	STEPFileReader.cpp
+	STEPFileEncoding.cpp
+	STEPFileEncoding.h
 )
 SOURCE_GROUP( IFC FILES ${IFC_SRCS})

--- a/code/STEPFileEncoding.cpp
+++ b/code/STEPFileEncoding.cpp
@ -0,0 +1,433 @@
+/*
+Open Asset Import Library (assimp)
+----------------------------------------------------------------------
+
+Copyright (c) 2006-2012, assimp team
+All rights reserved.
+
+Redistribution and use of this software in source and binary forms, 
+with or without modification, are permitted provided that the 
+following conditions are met:
+
+* Redistributions of source code must retain the above
+  copyright notice, this list of conditions and the
+  following disclaimer.
+
+* Redistributions in binary form must reproduce the above
+  copyright notice, this list of conditions and the
+  following disclaimer in the documentation and/or other
+  materials provided with the distribution.
+
+* Neither the name of the assimp team, nor the names of its
+  contributors may be used to endorse or promote products
+  derived from this software without specific prior
+  written permission of the assimp team.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+----------------------------------------------------------------------
+*/
+
+/** @file  STEPFileEncoding.cpp
+ *  @brief STEP character handling, string unescaping
+ */
+#include "AssimpPCH.h"
+#include "STEPFileEncoding.h"
+#include "fast_atof.h"
+
+#include "../contrib/ConvertUTF/ConvertUTF.h"
+
+using namespace Assimp;
+
+// roman1 to utf16 table
+static const UTF16 mac_codetable[] = {
+	// 0x20 unassig./nonprint. slots
+	 0x0020	,
+	 0x0021	,
+	 0x0022	,
+	 0x0023	,
+	 0x0024	,
+	 0x0025	,
+	 0x0026	,
+	 0x0027	,
+	 0x0028	,
+	 0x0029	,
+	 0x002A	,
+	 0x002B	,
+	 0x002C	,
+	 0x002D	,
+	 0x002E	,
+	 0x002F	,
+	 0x0030	,
+	 0x0031	,
+	 0x0032	,
+	 0x0033	,
+	 0x0034	,
+	 0x0035	,
+	 0x0036	,
+	 0x0037	,
+	 0x0038	,
+	 0x0039	,
+	 0x003A	,
+	 0x003B	,
+	 0x003C	,
+	 0x003D	,
+	 0x003E	,
+	 0x003F	,
+	 0x0040	,
+	 0x0041	,
+	 0x0042	,
+	 0x0043	,
+	 0x0044	,
+	 0x0045	,
+	 0x0046	,
+	 0x0047	,
+	 0x0048	,
+	 0x0049	,
+	 0x004A	,
+	 0x004B	,
+	 0x004C	,
+	 0x004D	,
+	 0x004E	,
+	 0x004F	,
+	 0x0050	,
+	 0x0051	,
+	 0x0052	,
+	 0x0053	,
+	 0x0054	,
+	 0x0055	,
+	 0x0056	,
+	 0x0057	,
+	 0x0058	,
+	 0x0059	,
+	 0x005A	,
+	 0x005B	,
+	 0x005C	,
+	 0x005D	,
+	 0x005E	,
+	 0x005F	,
+	 0x0060	,
+	 0x0061	,
+	 0x0062	,
+	 0x0063	,
+	 0x0064	,
+	 0x0065	,
+	 0x0066	,
+	 0x0067	,
+	 0x0068	,
+	 0x0069	,
+	 0x006A	,
+	 0x006B	,
+	 0x006C	,
+	 0x006D	,
+	 0x006E	,
+	 0x006F	,
+	 0x0070	,
+	 0x0071	,
+	 0x0072	,
+	 0x0073	,
+	 0x0074	,
+	 0x0075	,
+	 0x0076	,
+	 0x0077	,
+	 0x0078	,
+	 0x0079	,
+	 0x007A	,
+	 0x007B	,
+	 0x007C	,
+	 0x007D	,
+	 0x007E	,
+	 0x0000 , // unassig.
+	 0x00C4	,
+	 0x00C5	,
+	 0x00C7	,
+	 0x00C9	,
+	 0x00D1	,
+	 0x00D6	,
+	 0x00DC	,
+	 0x00E1	,
+	 0x00E0	,
+	 0x00E2	,
+	 0x00E4	,
+	 0x00E3	,
+	 0x00E5	,
+	 0x00E7	,
+	 0x00E9	,
+	 0x00E8	,
+	 0x00EA	,
+	 0x00EB	,
+	 0x00ED	,
+	 0x00EC	,
+	 0x00EE	,
+	 0x00EF	,
+	 0x00F1	,
+	 0x00F3	,
+	 0x00F2	,
+	 0x00F4	,
+	 0x00F6	,
+	 0x00F5	,
+	 0x00FA	,
+	 0x00F9	,
+	 0x00FB	,
+	 0x00FC	,
+	 0x2020	,
+	 0x00B0	,
+	 0x00A2	,
+	 0x00A3	,
+	 0x00A7	,
+	 0x2022	,
+	 0x00B6	,
+	 0x00DF	,
+	 0x00AE	,
+	 0x00A9	,
+	 0x2122	,
+	 0x00B4	,
+	 0x00A8	,
+	 0x2260	,
+	 0x00C6	,
+	 0x00D8	,
+	 0x221E	,
+	 0x00B1	,
+	 0x2264	,
+	 0x2265	,
+	 0x00A5	,
+	 0x00B5	,
+	 0x2202	,
+	 0x2211	,
+	 0x220F	,
+	 0x03C0	,
+	 0x222B	,
+	 0x00AA	,
+	 0x00BA	,
+	 0x03A9	,
+	 0x00E6	,
+	 0x00F8	,
+	 0x00BF	,
+	 0x00A1	,
+	 0x00AC	,
+	 0x221A	,
+	 0x0192	,
+	 0x2248	,
+	 0x2206	,
+	 0x00AB	,
+	 0x00BB	,
+	 0x2026	,
+	 0x00A0	,
+	 0x00C0	,
+	 0x00C3	,
+	 0x00D5	,
+	 0x0152	,
+	 0x0153	,
+	 0x2013	,
+	 0x2014	,
+	 0x201C	,
+	 0x201D	,
+	 0x2018	,
+	 0x2019	,
+	 0x00F7	,
+	 0x25CA	,
+	 0x00FF	,
+	 0x0178	,
+	 0x2044	,
+	 0x20AC	,
+	 0x2039	,
+	 0x203A	,
+	 0xFB01	,
+	 0xFB02	,
+	 0x2021	,
+	 0x00B7	,
+	 0x201A	,
+	 0x201E	,
+	 0x2030	,
+	 0x00C2	,
+	 0x00CA	,
+	 0x00C1	,
+	 0x00CB	,
+	 0x00C8	,
+	 0x00CD	,
+	 0x00CE	,
+	 0x00CF	,
+	 0x00CC	,
+	 0x00D3	,
+	 0x00D4	,
+	 0xF8FF	,
+	 0x00D2	,
+	 0x00DA	,
+	 0x00DB	,
+	 0x00D9	,
+	 0x0131	,
+	 0x02C6	,
+	 0x02DC	,
+	 0x00AF	,
+	 0x02D8	,
+	 0x02D9	,
+	 0x02DA	,
+	 0x00B8	,
+	 0x02DD	,
+	 0x02DB	,
+	 0x02C7
+};
+
+// ------------------------------------------------------------------------------------------------
+bool STEP::StringToUTF8(std::string& s) 
+{
+	// very basic handling for escaped string sequences
+	// http://doc.spatial.com/index.php?title=InterOp:Connect/STEP&redirect=no
+	
+	for (size_t i = 0; i < s.size(); ) {
+		if (s[i] == '\\') {
+			// \S\X - cp1252 (X is the character remapped to [0,127])
+			if (i+3 < s.size() && s[i+1] == 'S' && s[i+2] == '\\') {
+				// http://stackoverflow.com/questions/5586214/how-to-convert-char-from-iso-8859-1-to-utf-8-in-c-multiplatformly
+				ai_assert((uint8_t)s[i+3] < 0x80);
+				const uint8_t ch = s[i+3] + 0x80;
+
+				s[i] = 0xc0 | (ch & 0xc0) >> 6;
+				s[i+1] =  0x80 | (ch & 0x3f);
+
+				s.erase(i + 2,2);
+				++i;
+			}
+			// \X\xx - mac/roman (xx is a hex sequence)
+			else if (i+4 < s.size() && s[i+1] == 'X' && s[i+2] == '\\') {
+				
+				const uint8_t macval = HexOctetToDecimal(s.c_str() + i + 3);
+				if(macval < 0x20) {
+					return false;
+				}
+
+				ai_assert(sizeof(mac_codetable) / sizeof(mac_codetable[0]) == 0x100-0x20);
+
+				const UTF32 unival = mac_codetable[macval - 0x20], *univalp = &unival;
+
+				UTF8 temp[5], *tempp = temp;
+				ai_assert(sizeof(UTF8) == 1);
+
+				if(ConvertUTF32toUTF8(&univalp, univalp+1, &tempp, tempp+sizeof(temp), lenientConversion) != conversionOK) {
+					return false;
+				}
+
+				const size_t outcount = static_cast<size_t>(tempp-temp);
+
+				s.erase(i,5);
+				s.insert(i, reinterpret_cast<char*>(temp), outcount);
+				i += outcount;
+			}
+			// \Xn\ .. \X0\ - various unicode encodings (n=2: utf16; n=4: utf32)
+			else if (i+3 < s.size() && s[i+1] == 'X' && s[i+2] >= '0' && s[i+2] <= '9') {
+				switch(s[i+2]) {
+					// utf16
+				case '2':
+					// utf32
+				case '4':						
+					if (s[i+3] == '\\') {
+						const size_t basei = i+4;
+						// scan for \X0\ 
+						size_t j = basei, jend = s.size()-4;
+						for (; j < jend; ++j) {
+							if (s[j] == '\\' && s[j] == 'X' && s[j] == '0' && s[j] == '\\') {
+								break;
+							}
+						}
+						if (j == jend) {						
+							return false;
+						}
+
+						if (j == basei) {
+							s.erase(i,8);
+							continue;
+						}
+						
+						if (s[i+2] == '2') {
+							if (((j - basei) % 4) != 0) {
+								return false;
+							}
+							
+							const size_t count = (j-basei)/4;
+							boost::scoped_array<UTF16> src = boost::scoped_array<UTF16>(new UTF16[count]);
+
+							const char* cur = s.c_str() + basei;
+							for (size_t k = 0; k < count; ++k, cur += 4) {
+								src[k] = (static_cast<UTF16>(HexOctetToDecimal(cur)) << 8u)  |
+									 static_cast<UTF16>(HexOctetToDecimal(cur+2));
+							}
+
+							const size_t dcount = count * 3; // this is enough to hold all possible outputs
+							boost::scoped_array<UTF8> dest = boost::scoped_array<UTF8>(new UTF8[dcount]);
+
+							const UTF16* srct = src.get();
+							UTF8* destt = dest.get();
+							if(ConvertUTF16toUTF8(&srct, srct+count, &destt, destt+dcount, lenientConversion) != conversionOK) {
+								return false;
+							}
+
+							const size_t outcount = static_cast<size_t>(destt-dest.get());
+
+							s.erase(i,(j+4-i));
+
+							ai_assert(sizeof(UTF8) == 1);
+							s.insert(i, reinterpret_cast<char*>(dest.get()), outcount);
+
+							i += outcount;
+							continue;
+						}
+						else if (s[i+2] == '4') {
+							if (((j - basei) % 8) != 0) {
+								return false;
+							}
+
+							const size_t count = (j-basei)/8;
+							boost::scoped_array<UTF32> src = boost::scoped_array<UTF32>(new UTF32[count]);
+
+							const char* cur = s.c_str() + basei;
+							for (size_t k = 0; k < count; ++k, cur += 8) {
+								src[k] = (static_cast<UTF32>(HexOctetToDecimal(cur  )) << 24u) |
+									     (static_cast<UTF32>(HexOctetToDecimal(cur+2)) << 16u) |
+										 (static_cast<UTF32>(HexOctetToDecimal(cur+4)) << 8u)  |
+									     (static_cast<UTF32>(HexOctetToDecimal(cur+6)));
+							}
+
+							const size_t dcount = count * 5; // this is enough to hold all possible outputs
+							boost::scoped_array<UTF8> dest = boost::scoped_array<UTF8>(new UTF8[dcount]);
+
+							const UTF32* srct = src.get();
+							UTF8* destt = dest.get();
+							if(ConvertUTF32toUTF8(&srct, srct+count, &destt, destt+dcount, lenientConversion) != conversionOK) {
+								return false;
+							}
+
+							const size_t outcount = static_cast<size_t>(destt-dest.get());
+
+							s.erase(i,(j+4-i));
+
+							ai_assert(sizeof(UTF8) == 1);
+							s.insert(i, reinterpret_cast<char*>(dest.get()), outcount);
+
+							i += outcount;
+							continue;
+						}
+					}				
+
+					break;
+					
+					// TODO: other encoding patterns?
+
+				default:
+					return false;
+				}
+			}
+		}
+		++i;
+	}
+	return true;
+}
--- a/code/STEPFileEncoding.h
+++ b/code/STEPFileEncoding.h
@ -0,0 +1,63 @@
+/*
+Open Asset Import Library (assimp)
+----------------------------------------------------------------------
+
+Copyright (c) 2006-2012, assimp team
+All rights reserved.
+
+Redistribution and use of this software in source and binary forms, 
+with or without modification, are permitted provided that the 
+following conditions are met:
+
+* Redistributions of source code must retain the above
+  copyright notice, this list of conditions and the
+  following disclaimer.
+
+* Redistributions in binary form must reproduce the above
+  copyright notice, this list of conditions and the
+  following disclaimer in the documentation and/or other
+  materials provided with the distribution.
+
+* Neither the name of the assimp team, nor the names of its
+  contributors may be used to endorse or promote products
+  derived from this software without specific prior
+  written permission of the assimp team.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+----------------------------------------------------------------------
+*/
+
+#ifndef INCLUDED_AI_STEPFILEENCODING_H
+#define INCLUDED_AI_STEPFILEENCODING_H
+
+#include <string>
+
+namespace Assimp {
+namespace STEP {
+
+
+	// --------------------------------------------------------------------------
+	// Convert an ASCII STEP identifier with possibly escaped character
+	// sequences using foreign encodings to plain UTF8.
+	// 
+	// Return false if an error occurs, s may or may not be modified in
+	// this case and could still contain escape sequences (even partly
+	// escaped ones).
+	bool StringToUTF8(std::string& s);
+	
+
+} // ! STEP
+} // ! Assimp
+
+#endif
--- a/code/STEPFileReader.cpp
+++ b/code/STEPFileReader.cpp
@ -44,9 +44,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #include "AssimpPCH.h"
 #include "STEPFileReader.h"
+#include "STEPFileEncoding.h"
 #include "TinyFormatter.h"
 #include "fast_atof.h"

+
 using namespace Assimp;
 namespace EXPRESS = STEP::EXPRESS;

@ -331,7 +333,6 @@ void STEP::ReadFile(DB& db,const EXPRESS::ConversionSchema& scheme,
 	}
 }

-
 // ------------------------------------------------------------------------------------------------
 boost::shared_ptr<const EXPRESS::DataType> EXPRESS::DataType::Parse(const char*& inout,uint64_t line, const EXPRESS::ConversionSchema* schema /*= NULL*/)
 {
@ -419,34 +420,12 @@ boost::shared_ptr<const EXPRESS::DataType> EXPRESS::DataType::Parse(const char*&

 		inout = cur + 1;

-		// very basic handling for escaped string sequences
-		// http://doc.spatial.com/index.php?title=InterOp:Connect/STEP&redirect=no
-		// UTF16: \X2\ ... \X0\
-		// UTF32: \X4\ ... \X0\
-		// Mac: \X8\xx (xx is a hex sequence)
-		// cp1252: \S\X (X is the character remapped to [0,127])
-		// ? more of them ?
-
-		// Note: assimp is supposed to output UTF8 strings
-
+		// assimp is supposed to output UTF8 strings, so we have to deal
+		// with foreign encodings.
 		std::string stemp = std::string(start, static_cast<size_t>(cur - start));
-		for (size_t i = 0; i < stemp.size(); ++i) {
-			if (stemp[i] == '\\') {
-				if (i+3 < stemp.size() && stemp[i+1] == 'S' && stemp[i+2] == '\\') {
-					// http://stackoverflow.com/questions/5586214/how-to-convert-char-from-iso-8859-1-to-utf-8-in-c-multiplatformly
-					ai_assert((uint8_t)stemp[i+3] < 0x80);
-					const uint8_t ch = stemp[i+3] + 0x80;
-
-					stemp[i] = 0xc0 | (ch & 0xc0) >> 6;
-					stemp[i+1] =  0x80 | (ch & 0x3f);
-
-					stemp.erase(i + 2,2);
-					++i;
-				}
-				else if (i+2 < stemp.size() && stemp[i+1] == 'X' && IsNumeric(stemp[i+2])) {
-					// TODO: warn
-				}
-			}
+		if(!StringToUTF8(stemp)) {
+			// TODO: route this to a correct logger with line numbers etc., better error messages
+			DefaultLogger::get()->error("an error occurred reading escape sequences in ASCII text");
 		}

 		return boost::make_shared<EXPRESS::STRING>(stemp);