JoinIdenticalVertices: Performance optimizations by Krishty („Fuck the System”). Yields a 9x speedup in first benchmarks with meshes > 2k triangles.

git-svn-id: https://assimp.svn.sourceforge.net/svnroot/assimp/trunk@780 67173fc5-114c-0410-ac8e-9d2fd5bffc1f
2010-07-11 23:07:11 +00:00 · 2010-07-11 23:07:11 +00:00 · a9fd02c14e
parent 9e8a9586b3
commit a9fd02c14e
4 changed files with 160 additions and 12 deletions
--- a/code/JoinVerticesProcess.cpp
+++ b/code/JoinVerticesProcess.cpp
@ -127,10 +127,12 @@ int JoinVerticesProcess::ProcessMesh( aiMesh* pMesh, unsigned int meshIndex)
 	std::vector<Vertex> uniqueVertices;
 	uniqueVertices.reserve( pMesh->mNumVertices);

-	// For each vertex the index of the vertex it was replaced by. 
+	// For each vertex the index of the vertex it was replaced by.
+	// Since the maximal number of vertices is 2^31-1, the most significant bit can be used to mark
+	//	whether the vertex was replaced by an existing unique vertex (bit set) or a new unique
+	//	vertex was created for it (bit clear). This saves an additional std::vector<bool> and
+	//	greatly enhances branching performance.
 	std::vector<unsigned int> replaceIndex( pMesh->mNumVertices, 0xffffffff);
-	// for each vertex whether it was replaced by an existing unique vertex (true) or a new vertex was created for it (false)
-	std::vector<bool> isVertexUnique( pMesh->mNumVertices, false);

 	// A little helper to find locally close vertices faster.
 	// Try to reuse the lookup table from the last step.
@ -180,7 +182,7 @@ int JoinVerticesProcess::ProcessMesh( aiMesh* pMesh, unsigned int meshIndex)
 		Vertex v(pMesh,a);

 		// collect all vertices that are close enough to the given position
-		vertexFinder->FindPositions( v.position, posEpsilonSqr, verticesFound);
+		vertexFinder->FindIdenticalPositions( v.position, verticesFound);
 		unsigned int matchIndex = 0xffffffff;

 		// check all unique vertices close to the position if this vertex is already present among them
@ -188,9 +190,8 @@ int JoinVerticesProcess::ProcessMesh( aiMesh* pMesh, unsigned int meshIndex)

 			const unsigned int vidx = verticesFound[b];
 			const unsigned int uidx = replaceIndex[ vidx];
-			if( uidx == 0xffffffff || !isVertexUnique[ vidx]) {
+			if( uidx & 0x80000000)
 				continue;
-			}

 			const Vertex& uv = uniqueVertices[ uidx];
 			// Position mismatch is impossible - the vertex finder already discarded all non-matching positions
@ -239,15 +240,13 @@ int JoinVerticesProcess::ProcessMesh( aiMesh* pMesh, unsigned int meshIndex)
 		if( matchIndex != 0xffffffff)
 		{
 			// store where to found the matching unique vertex
-			replaceIndex[a] = matchIndex;
-			isVertexUnique[a] = false;
+			replaceIndex[a] = matchIndex | 0x80000000;
 		}
 		else
 		{
 			// no unique vertex matches it upto now -> so add it
 			replaceIndex[a] = (unsigned int)uniqueVertices.size();
 			uniqueVertices.push_back( v);
-			isVertexUnique[a] = true;
 		}
 	}

@ -331,7 +330,7 @@ int JoinVerticesProcess::ProcessMesh( aiMesh* pMesh, unsigned int meshIndex)
 	{
 		aiFace& face = pMesh->mFaces[a];
 		for( unsigned int b = 0; b < face.mNumIndices; b++)	{
-			face.mIndices[b] = replaceIndex[face.mIndices[b]];
+			face.mIndices[b] = replaceIndex[face.mIndices[b]] & ~0x80000000;
 		}
 	}

@ -346,7 +345,7 @@ int JoinVerticesProcess::ProcessMesh( aiMesh* pMesh, unsigned int meshIndex)
 		{
 			const aiVertexWeight& ow = bone->mWeights[b];
 			// if the vertex is a unique one, translate it
-			if( isVertexUnique[ow.mVertexId])
+			if( !(replaceIndex[ow.mVertexId] & 0x80000000))
 			{
 				aiVertexWeight nw;
 				nw.mVertexId = replaceIndex[ow.mVertexId];
--- a/code/SpatialSort.cpp
+++ b/code/SpatialSort.cpp
@ -46,6 +46,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 using namespace Assimp;

+// CHAR_BIT (normally provided by <climits>) seems to be defined under MSVC, but not under GCC. Pray that the correct value is 8.
+#ifndef CHAR_BIT
+#	define CHAR_BIT 8
+#endif
+
 // ------------------------------------------------------------------------------------------------
 // Constructs a spatially sorted representation from the given position array.
 SpatialSort::SpatialSort( const aiVector3D* pPositions, unsigned int pNumPositions, 
@ -168,6 +173,140 @@ void SpatialSort::FindPositions( const aiVector3D& pPosition,
 	// that's it
 }

+namespace {
+
+	// Binary, signed-integer representation of a single-precision floating-point value.
+	// IEEE 754 says: "If two floating-point numbers in the same format are ordered then they are
+	//	ordered the same way when their bits are reinterpreted as sign-magnitude integers."
+	// This allows us to convert all floating-point numbers to signed integers of arbitrary size
+	//	and then use them to work with ULPs (Units in the Last Place, for high-precision
+	//	computations) or to compare them (integer comparisons are faster than floating-point
+	//	comparisons on many platforms).
+	typedef signed int BinFloat;
+
+	// --------------------------------------------------------------------------------------------
+	// Reinterprets the bits of pValue as a BinFloat; patterns with the sign bit set are remapped below.
+	BinFloat ToBinary( const float & pValue) {
+
+		// If this assertion fails, signed int is not big enough to store a float on your platform.
+		//	Please correct the declaration of BinFloat a few lines above - but do it in a portable,
+		//	#ifdef'd manner!
+		BOOST_STATIC_ASSERT( sizeof(BinFloat) >= sizeof(float));
+
+		#if defined( _MSC_VER)
+			// If this assertion fails, Visual C++ has finally moved to ILP64. This means that this
+			//	code has just become legacy code! Find out the current value of _MSC_VER and modify
+			//	the #if above so it evaluates false on the current and all upcoming VC versions (or
+			//	on the current platform, if LP64 or LLP64 are still used on other platforms).
+			BOOST_STATIC_ASSERT( sizeof(BinFloat) == sizeof(float));
+
+			// Read the float's storage directly as a BinFloat. This works best on Visual C++, but other compilers have their problems with it.
+			const BinFloat binValue = reinterpret_cast<BinFloat const &>(pValue);
+		#else
+			// On many compilers, reinterpreting a float address as an integer causes aliasing
+			// problems. Going through a union is an ugly but more or less safe way of doing it.
+			union {
+				float		asFloat;
+				BinFloat	asBin;
+			} conversion;
+			conversion.asBin	= 0; // zero empty space in case sizeof(BinFloat) > sizeof(float)
+			conversion.asFloat	= pValue;
+			const BinFloat binValue = conversion.asBin;
+		#endif
+
+		// Floating-point numbers are of sign-magnitude format, so find out what signed number
+		//	representation we must convert negative values to. Patterns without the sign bit are returned as-is.
+		// See http://en.wikipedia.org/wiki/Signed_number_representations.
+
+		// Two's complement? Subtract the pattern from the sign-bit-only value, leaving the negated magnitude.
+		if( (-42 == (~42 + 1)) && (binValue & 0x80000000))
+			return BinFloat(1 << (CHAR_BIT * sizeof(BinFloat) - 1)) - binValue;
+		// One's complement? (Only taken when the compiler represents -42 as ~42.)
+		else if( (-42 == ~42) && (binValue & 0x80000000))
+			return BinFloat(-0) - binValue;
+		// Sign-magnitude? The float pattern could be used unchanged; this branch returns the same value as the fallback.
+		else if( (-42 == (42 | (-0))) && (binValue & 0x80000000)) // -0 = 1000... binary -- NOTE(review): the literal -0 presumably evaluates to plain 0, so this test may never be true; confirm
+			return binValue;
+		else
+			return binValue;
+	}
+
+} // namespace
+
+// ------------------------------------------------------------------------------------------------
+// Fills poResults with the indices of all positions identical to pPosition. In contrast to
+// FindPositions(), no epsilon is used but a (very low) tolerance based on four floating-point units (ULPs).
+void SpatialSort::FindIdenticalPositions( const aiVector3D& pPosition, 
+	std::vector<unsigned int>& poResults) const
+{
+	// Epsilons have a huge disadvantage: they are of constant precision, while floating-point
+	//	values are of log2 precision. If you apply e=0.01 to 100, the epsilon is rather small, but
+	//	if you apply it to 0.001, it is enormous.
+
+	// The best way to overcome this is the unit in the last place (ULP). A precision of 2 ULPs
+	//	tells us that a float is at most 2 representable steps away from the "real" value. ULPs are of
+	//	logarithmic precision - around 1, they are 1÷(2^24) to 1÷(2^23), and around 10000, they are about 0.001.
+
+	// For standard C math, we can assume a precision of 0.5 ULPs according to IEEE 754. The
+	//	incoming vertex positions might have already been transformed, probably using rather
+	//	inaccurate SSE instructions, so we assume a tolerance of 4 ULPs to safely identify
+	//	identical vertex positions.
+	static const int toleranceInULPs = 4;
+	// An interesting point is that the inaccuracy grows linearly with the number of operations:
+	//	multiplying two numbers, each inaccurate to four ULPs, results in an inaccuracy of four ULPs
+	//	plus 0.5 ULPs for the multiplication.
+	// To compute the distance to the plane, a dot product is needed - that is a multiplication and
+	//	an addition on each number.
+	static const int distanceToleranceInULPs = toleranceInULPs + 1;
+	// The squared distance between two 3D vectors is computed the same way, but with an additional
+	//	subtraction.
+	static const int distance3DToleranceInULPs = distanceToleranceInULPs + 1;
+
+	// Convert the plane distance to its signed integer representation so the ULPs tolerance can be
+	//	applied. The upper bound is derived from the lower one because, for some reason, VC won't optimize two calls of the bit pattern conversion.
+	const BinFloat minDistBinary = ToBinary( pPosition * mPlaneNormal) - distanceToleranceInULPs;
+	const BinFloat maxDistBinary = minDistBinary + 2 * distanceToleranceInULPs;
+
+	// Clear the array in this strange fashion because a simple clear() would also deallocate
+    // the array, which we want to avoid.
+	poResults.erase( poResults.begin(), poResults.end());
+
+	// Do a binary search for the minimal distance to start the iteration there.
+	unsigned int index = (unsigned int)mPositions.size() / 2;
+	unsigned int binaryStepSize = (unsigned int)mPositions.size() / 4;
+	while( binaryStepSize > 1)
+	{
+		// Ugly, but conditional jumps are faster with integers than with floats
+		if( minDistBinary > ToBinary(mPositions[index].mDistance))
+			index += binaryStepSize;
+		else
+			index -= binaryStepSize;
+
+		binaryStepSize /= 2;
+	}
+
+	// Depending on the direction of the last step we need to single-step a bit back or forth
+	// to find the actual beginning element of the range.
+	while( index > 0 && minDistBinary < ToBinary(mPositions[index].mDistance) )
+		index--;
+	while( index < (mPositions.size() - 1) && minDistBinary > ToBinary(mPositions[index].mDistance))
+		index++;
+
+	// Now start iterating from there until the first position lies outside of the distance range.
+	// Add all positions inside the distance range within the tolerance to the result array.
+	std::vector<Entry>::const_iterator it = mPositions.begin() + index;
+	while( ToBinary(it->mDistance) < maxDistBinary) // NOTE(review): 'it' is dereferenced before the end check below; presumably mPositions is never empty here - confirm
+	{
+		if( distance3DToleranceInULPs >= ToBinary((it->mPosition - pPosition).SquareLength())) // NOTE(review): compares the raw bits of the squared distance with the ULP tolerance; presumably accepts only (near-)zero distances - confirm intent
+			poResults.push_back(it->mIndex);
+		++it;
+		if( it == mPositions.end())
+			break;
+	}
+
+	// that's it
+}
+
 // ------------------------------------------------------------------------------------------------
 unsigned int SpatialSort::GenerateMappingTable(std::vector<unsigned int>& fill,float pRadius) const
 {
--- a/code/SpatialSort.h
+++ b/code/SpatialSort.h
@ -120,6 +120,15 @@ public:
 	void FindPositions( const aiVector3D& pPosition, float pRadius, 
 		std::vector<unsigned int>& poResults) const;

+	// ------------------------------------------------------------------------------------
+	/** Fills an array with indices of all positions identical to the given position. In
+	 *  contrast to FindPositions(), no epsilon is used but a (very low) tolerance of
+	 *  four floating-point units (ULPs).
+	 * @param pPosition The position to look for vertices.
+	 * @param poResults The container to store the indices of the found positions.
+	 *   Will be emptied by the call so it may contain anything.*/
+	void FindIdenticalPositions( const aiVector3D& pPosition,
+		std::vector<unsigned int>& poResults) const;

 	// ------------------------------------------------------------------------------------
 	/** Compute a table that maps each vertex ID referring to a spatially close
--- a/workspaces/vc9/assimp.vcproj
+++ b/workspaces/vc9/assimp.vcproj
@ -361,7 +361,7 @@
 			Name="release-dll|x64"
 			ConfigurationType="2"
 			InheritedPropertySheets=".\shared\DllShared.vsprops"
-			WholeProgramOptimization="0"
+			WholeProgramOptimization="1"
 			>
 			<Tool
 				Name="VCPreBuildEventTool"
@ -381,6 +381,7 @@
 			/>
 			<Tool
 				Name="VCCLCompilerTool"
+				Optimization="3"
 				InlineFunctionExpansion="2"
 				EnableIntrinsicFunctions="true"
 				FavorSizeOrSpeed="1"