diff --git a/MAKE.bat b/MAKE.bat
index 3fddd3b..017d218 100644
--- a/MAKE.bat
+++ b/MAKE.bat
@@ -31,6 +31,7 @@ if "%1"=="help" (
     echo %0 [test]                  ; check untracked allocators in V4K
     echo %0 [todo]                  ; check for @fixme and @todo
     echo %0 [v4web]                 ; sync v4 website
+    echo %0 [swap]                  ; toggle #line directives on/off
     echo %0 [split^|join]            ; engine/v4k* ^>split^> engine/split/* or engine/split/* ^>join^> engine/v4k*
     echo %0 [lua]                   ; execute lua script with v4k
     echo %0 [amalgamation]          ; combine engine/v4k* into a single-header file
@@ -243,6 +244,13 @@ if "%1"=="join" (
     call tools\join
     exit /b
 )
+if "%1"=="swap" (
+    echo Swapping #line on v4k.h
+    call tools\linswap engine\v4k.h
+    echo Swapping #line on v4k.c
+    call tools\linswap engine\v4k.c
+    exit /b
+)
 
 rem fuse binaries and zipfiles
 if "%1"=="fuse" (
diff --git a/bind/v4k.lua b/bind/v4k.lua
index 1a3973a..f16a31b 100644
--- a/bind/v4k.lua
+++ b/bind/v4k.lua
@@ -1520,17 +1520,23 @@ ffi.cdef([[
 //lcpp INF [0000] vec2: macro name but used as C declaration in:vec2 fire;
 //lcpp INF [0000] vec4: macro name but used as C declaration in:vec4 pos;
 //lcpp INF [0000] vec2: macro name but used as C declaration in:vec2 sca;
-//lcpp INF [0000] vec4: macro name but used as C declaration in:void (*drawrect)(void* userdata, const char *skin, vec4 rect);
-//lcpp INF [0000] vec2: macro name but used as C declaration in:void (*getskinsize)(void* userdata, const char *skin, vec2 *size);
-//lcpp INF [0000] vec4: macro name but used as C declaration in:void (*getscissorrect)(void* userdata, const char *skin, vec4 rect, vec4 *dims);
-//lcpp INF [0000] vec4: macro name but used as C declaration in:void (*getscissorrect)(void* userdata, const char *skin, vec4 rect, vec4 *dims);
-//lcpp INF [0000] vec4: macro name but used as C declaration in:bool (*ismouseinrect)(void* userdata, const char *skin, vec4 rect);
-//lcpp INF [0000] vec2: macro name but used as C declaration in:API vec2        gui_getskinsize(const char *skin);
-//lcpp INF [0000] vec2: macro name but used as C declaration in:STATIC vec2        gui_getskinsize(const char *skin);
-//lcpp INF [0000] vec2: macro name but used as C declaration in: vec2        gui_getskinsize(const char *skin);
-//lcpp INF [0000] vec4: macro name but used as C declaration in:API bool        gui_ismouseinrect(const char *skin, vec4 rect);
-//lcpp INF [0000] vec4: macro name but used as C declaration in:STATIC bool        gui_ismouseinrect(const char *skin, vec4 rect);
-//lcpp INF [0000] vec4: macro name but used as C declaration in: bool        gui_ismouseinrect(const char *skin, vec4 rect);
+//lcpp INF [0000] vec4: macro name but used as C declaration in:void (*drawrect)(void* userdata, const char *skin, const char *fallback, vec4 rect);
+//lcpp INF [0000] vec2: macro name but used as C declaration in:void (*getskinsize)(void* userdata, const char *skin, const char *fallback, vec2 *size);
+//lcpp INF [0000] vec4: macro name but used as C declaration in:void (*getscissorrect)(void* userdata, const char *skin, const char *fallback, vec4 rect, vec4 *dims);
+//lcpp INF [0000] vec4: macro name but used as C declaration in:void (*getscissorrect)(void* userdata, const char *skin, const char *fallback, vec4 rect, vec4 *dims);
+//lcpp INF [0000] vec4: macro name but used as C declaration in:bool (*ismouseinrect)(void* userdata, const char *skin, const char *fallback, vec4 rect);
+//lcpp INF [0000] vec2: macro name but used as C declaration in:API vec2        gui_getskinsize(const char *skin, const char *fallback);
+//lcpp INF [0000] vec2: macro name but used as C declaration in:STATIC vec2        gui_getskinsize(const char *skin, const char *fallback);
+//lcpp INF [0000] vec2: macro name but used as C declaration in: vec2        gui_getskinsize(const char *skin, const char *fallback);
+//lcpp INF [0000] vec4: macro name but used as C declaration in:API bool        gui_ismouseinrect(const char *skin, const char *fallback, vec4 rect);
+//lcpp INF [0000] vec4: macro name but used as C declaration in:STATIC bool        gui_ismouseinrect(const char *skin, const char *fallback, vec4 rect);
+//lcpp INF [0000] vec4: macro name but used as C declaration in: bool        gui_ismouseinrect(const char *skin, const char *fallback, vec4 rect);
+//lcpp INF [0000] vec4: macro name but used as C declaration in:API vec4        gui_getscissorrect(const char *skin, const char *fallback, vec4 rect);
+//lcpp INF [0000] vec4: macro name but used as C declaration in:API vec4        gui_getscissorrect(const char *skin, const char *fallback, vec4 rect);
+//lcpp INF [0000] vec4: macro name but used as C declaration in:STATIC vec4        gui_getscissorrect(const char *skin, const char *fallback, vec4 rect);
+//lcpp INF [0000] vec4: macro name but used as C declaration in:STATIC vec4        gui_getscissorrect(const char *skin, const char *fallback, vec4 rect);
+//lcpp INF [0000] vec4: macro name but used as C declaration in: vec4        gui_getscissorrect(const char *skin, const char *fallback, vec4 rect);
+//lcpp INF [0000] vec4: macro name but used as C declaration in: vec4        gui_getscissorrect(const char *skin, const char *fallback, vec4 rect);
 //lcpp INF [0000] vec4: macro name but used as C declaration in:API void        gui_panel_id(int id, vec4 rect, const char *skin);
 //lcpp INF [0000] vec4: macro name but used as C declaration in:STATIC void        gui_panel_id(int id, vec4 rect, const char *skin);
 //lcpp INF [0000] vec4: macro name but used as C declaration in: void        gui_panel_id(int id, vec4 rect, const char *skin);
@@ -2991,6 +2997,9 @@ float speed;
 anim_t* anims;
 } anims_t;
  anims_t animations(const char *pathfile, int flags);
+typedef struct lightmap_t {
+struct lm_context *ctx;
+} lightmap_t;
 typedef struct skybox_t {
 handle program;
 mesh_t geometry;
@@ -3315,7 +3324,7 @@ unsigned play;
 bool paused;
 struct atlas_t *a;
 } sprite_t;
-enum { OBJTYPE_sprite_t = 10 };       typedef struct { unsigned static_assert_on_L__LINE__ : !!(10 <= 255); } static_assert_on_Lconcat(_L,3938)___COUNTER__;       typedef struct { unsigned static_assert_on_L__LINE__ : !!(sizeof(sprite_t)); } static_assert_on_Lconcat(_L,3938)___COUNTER__;;
+enum { OBJTYPE_sprite_t = 10 };       typedef struct { unsigned static_assert_on_L__LINE__ : !!(10 <= 255); } static_assert_on_Lconcat(_L,3945)___COUNTER__;       typedef struct { unsigned static_assert_on_L__LINE__ : !!(sizeof(sprite_t)); } static_assert_on_Lconcat(_L,3945)___COUNTER__;;
  void     sprite_ctor(sprite_t *s);
  void     sprite_dtor(sprite_t *s);
  void     sprite_tick(sprite_t *s);
@@ -3325,17 +3334,18 @@ enum { OBJTYPE_sprite_t = 10 };       typedef struct { unsigned static_assert_on
  void     sprite_del(sprite_t *s);
  void     sprite_setanim(sprite_t *s, unsigned name);
 typedef struct guiskin_t {
-void (*drawrect)(void* userdata, const char *skin, vec4 rect);
-void (*getskinsize)(void* userdata, const char *skin, vec2 *size);
-void (*getscissorrect)(void* userdata, const char *skin, vec4 rect, vec4 *dims);
-bool (*ismouseinrect)(void* userdata, const char *skin, vec4 rect);
+void (*drawrect)(void* userdata, const char *skin, const char *fallback, vec4 rect);
+void (*getskinsize)(void* userdata, const char *skin, const char *fallback, vec2 *size);
+void (*getscissorrect)(void* userdata, const char *skin, const char *fallback, vec4 rect, vec4 *dims);
+bool (*ismouseinrect)(void* userdata, const char *skin, const char *fallback, vec4 rect);
 void (*free)(void* userdata);
 void *userdata;
 } guiskin_t;
  void    gui_pushskin(guiskin_t skin);
  void*       gui_userdata();
- vec2        gui_getskinsize(const char *skin);
- bool        gui_ismouseinrect(const char *skin, vec4 rect);
+ vec2        gui_getskinsize(const char *skin, const char *fallback);
+ bool        gui_ismouseinrect(const char *skin, const char *fallback, vec4 rect);
+ vec4        gui_getscissorrect(const char *skin, const char *fallback, vec4 rect);
  void        gui_panel_id(int id, vec4 rect, const char *skin);
  void            gui_rect_id(int id, vec4 rect, const char *skin);
  void            gui_label_id(int id, const char *text, vec4 rect);
diff --git a/demos/3rd_lightmapper.h b/demos/3rd_lightmapper.h
new file mode 100644
index 0000000..671db74
--- /dev/null
+++ b/demos/3rd_lightmapper.h
@@ -0,0 +1,1761 @@
+/***********************************************************
+* A single header file OpenGL lightmapping library         *
+* https://github.com/ands/lightmapper                      *
+* no warranty implied | use at your own risk               *
+* author: Andreas Mantler (ands) | last change: 10.05.2018 *
+*                                                          *
+* License:                                                 *
+* This software is in the public domain.                   *
+* Where that dedication is not recognized,                 *
+* you are granted a perpetual, irrevocable license to copy *
+* and modify this file however you want.                   *
+***********************************************************/
+
+#ifndef LIGHTMAPPER_H
+#define LIGHTMAPPER_H
+
+#ifdef __cplusplus
+#define LM_DEFAULT_VALUE(value) = value
+#else
+#define LM_DEFAULT_VALUE(value)
+#endif
+
+#ifndef LM_CALLOC
+#define LM_CALLOC(count, size) calloc(count, size)
+#endif
+
+#ifndef LM_FREE
+#define LM_FREE(ptr) free(ptr)
+#endif
+
+typedef int lm_bool;
+#define LM_FALSE 0
+#define LM_TRUE  1
+
+typedef int lm_type;
+#define LM_NONE           0
+#define LM_UNSIGNED_BYTE  GL_UNSIGNED_BYTE
+#define LM_UNSIGNED_SHORT GL_UNSIGNED_SHORT
+#define LM_UNSIGNED_INT   GL_UNSIGNED_INT
+#define LM_FLOAT          GL_FLOAT
+
+typedef struct lm_context lm_context;
+
+// creates a lightmapper instance. it can be used to render multiple lightmaps.
+lm_context *lmCreate(
+	int hemisphereSize,                                                                                // hemisphereSize: resolution of the hemisphere renderings. must be a power of two! typical: 64.
+	float zNear, float zFar,                                                                           // zNear/zFar: hemisphere min/max draw distances.
+	float clearR, float clearG, float clearB,                                                          // clear color / background color / sky color.
+	int interpolationPasses, float interpolationThreshold,                                             // passes: hierarchical selective interpolation passes (0-8; initial step size = 2^passes).
+	                                                                                                   // threshold: error value below which lightmap pixels are interpolated instead of rendered.
+	                                                                                                   // use output image from LM_DEBUG_INTERPOLATION to determine a good value.
+	                                                                                                   // values around and below 0.01 are probably ok.
+	                                                                                                   // the lower the value, the more hemispheres are rendered -> slower, but possibly better quality.
+	float cameraToSurfaceDistanceModifier LM_DEFAULT_VALUE(0.0f));                                     // modifier for the height of the rendered hemispheres above the surface
+	                                                                                                   // -1.0f => stick to surface, 0.0f => minimum height for interpolated surface normals,
+	                                                                                                   // > 0.0f => improves gradients on surfaces with interpolated normals due to the flat surface horizon,
+	                                                                                                   // but may introduce other artifacts.
+
+// optional: set material characteristics by specifying cos(theta)-dependent weights for incoming light.
+typedef float (*lm_weight_func)(float cos_theta, void *userdata);
+void lmSetHemisphereWeights(lm_context *ctx, lm_weight_func f, void *userdata);                        // precalculates weights for incoming light depending on its angle. (default: all weights are 1.0f)
+
+// specify an output lightmap image buffer with w * h * c * sizeof(float) bytes of memory.
+void lmSetTargetLightmap(lm_context *ctx, float *outLightmap, int w, int h, int c);                    // output HDR lightmap (linear 32bit float channels; c: 1->Greyscale, 2->Greyscale+Alpha, 3->RGB, 4->RGBA).
+
+// set the geometry to map to the currently set target lightmap (set the target lightmap before calling this!).
+void lmSetGeometry(lm_context *ctx,
+	const float *transformationMatrix,                                                                 // 4x4 object-to-world transform for the geometry or NULL (no transformation).
+	lm_type positionsType, const void *positionsXYZ, int positionsStride,                              // triangle mesh in object space.
+	lm_type normalsType, const void *normalsXYZ, int normalsStride,                                    // optional normals for the mesh in object space (Use LM_NONE type in case you only need flat surfaces).
+	lm_type lightmapCoordsType, const void *lightmapCoordsUV, int lightmapCoordsStride,                // lightmap atlas texture coordinates for the mesh [0..1]x[0..1] (integer types are normalized to 0..1 range).
+	int count, lm_type indicesType LM_DEFAULT_VALUE(LM_NONE), const void *indices LM_DEFAULT_VALUE(0));// if mesh indices are used, count = number of indices else count = number of vertices.
+
+
+// as long as lmBegin returns true, the scene has to be rendered with the
+// returned camera and view parameters to the currently bound framebuffer.
+// if lmBegin returns true, it must be followed by lmEnd after rendering!
+lm_bool lmBegin(lm_context *ctx,
+	int* outViewport4,                                                                                 // output of the current viewport: { x, y, w, h }. use these to call glViewport()!
+	float* outView4x4,                                                                                 // output of the current camera view matrix.
+	float* outProjection4x4);                                                                          // output of the current camera projection matrix.
+
+float lmProgress(lm_context *ctx);                                                                     // should only be called between lmBegin/lmEnd!
+                                                                                                       // provides the light mapping progress as a value increasing from 0.0 to 1.0.
+
+void lmEnd(lm_context *ctx);
+
+// destroys the lightmapper instance. should be called to free resources.
+void lmDestroy(lm_context *ctx);
+
+
+// image based post processing (c is the number of color channels in the image, m a channel mask for the operation)
+#define LM_ALL_CHANNELS 0x0f
+float lmImageMin(const float *image, int w, int h, int c, int m LM_DEFAULT_VALUE(LM_ALL_CHANNELS));                    // find the minimum value (across the specified channels)
+float lmImageMax(const float *image, int w, int h, int c, int m LM_DEFAULT_VALUE(LM_ALL_CHANNELS));                    // find the maximum value (across the specified channels)
+void lmImageAdd(float *image, int w, int h, int c, float value, int m LM_DEFAULT_VALUE(LM_ALL_CHANNELS));              // in-place add to the specified channels
+void lmImageScale(float *image, int w, int h, int c, float factor, int m LM_DEFAULT_VALUE(LM_ALL_CHANNELS));           // in-place scaling of the specified channels
+void lmImagePower(float *image, int w, int h, int c, float exponent, int m LM_DEFAULT_VALUE(LM_ALL_CHANNELS));         // in-place powf(v, exponent) of the specified channels (for gamma)
+void lmImageDilate(const float *image, float *outImage, int w, int h, int c);                                          // widen the populated non-zero areas by 1 pixel.
+void lmImageSmooth(const float *image, float *outImage, int w, int h, int c);                                          // simple box filter on only the non-zero values.
+void lmImageDownsample(const float *image, float *outImage, int w, int h, int c);                                      // downsamples [0..w]x[0..h] to [0..w/2]x[0..h/2] by avereging only the non-zero values
+void lmImageFtoUB(const float *image, unsigned char *outImage, int w, int h, int c, float max LM_DEFAULT_VALUE(0.0f)); // casts a floating point image to an 8bit/channel image
+
+// TGA file output helpers
+lm_bool lmImageSaveTGAub(const char *filename, const unsigned char *image, int w, int h, int c);
+lm_bool lmImageSaveTGAf(const char *filename, const float *image, int w, int h, int c, float max LM_DEFAULT_VALUE(0.0f));
+
+#endif
+////////////////////// END OF HEADER //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+#ifdef LIGHTMAPPER_IMPLEMENTATION
+#undef LIGHTMAPPER_IMPLEMENTATION
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <float.h>
+#include <assert.h>
+#include <limits.h>
+
+#define LM_SWAP(type, a, b) { type tmp = (a); (a) = (b); (b) = tmp; }
+
+#if defined(_MSC_VER) && !defined(__cplusplus)
+#define inline __inline
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER <= 1700)
+static inline lm_bool lm_finite(float a) { return _finite(a); }
+#else
+static inline lm_bool lm_finite(float a) { return isfinite(a); }
+#endif
+
+static inline int      lm_mini      (int     a, int     b) { return a < b ? a : b; }
+static inline int      lm_maxi      (int     a, int     b) { return a > b ? a : b; }
+static inline int      lm_absi      (int     a           ) { return a < 0 ? -a : a; }
+static inline float    lm_minf      (float   a, float   b) { return a < b ? a : b; }
+static inline float    lm_maxf      (float   a, float   b) { return a > b ? a : b; }
+static inline float    lm_absf      (float   a           ) { return a < 0.0f ? -a : a; }
+static inline float    lm_pmodf     (float   a, float   b) { return (a < 0.0f ? 1.0f : 0.0f) + (float)fmod(a, b); } // positive mod
+
+typedef struct lm_ivec2 { int x, y; } lm_ivec2;
+static inline lm_ivec2 lm_i2        (int     x, int     y) { lm_ivec2 v = { x, y }; return v; }
+
+typedef struct lm_vec2 { float x, y; } lm_vec2;
+static inline lm_vec2  lm_v2i       (int     x, int     y) { lm_vec2 v = { (float)x, (float)y }; return v; }
+static inline lm_vec2  lm_v2        (float   x, float   y) { lm_vec2 v = { x, y }; return v; }
+static inline lm_vec2  lm_negate2   (lm_vec2 a           ) { return lm_v2(-a.x, -a.y); }
+static inline lm_vec2  lm_add2      (lm_vec2 a, lm_vec2 b) { return lm_v2(a.x + b.x, a.y + b.y); }
+static inline lm_vec2  lm_sub2      (lm_vec2 a, lm_vec2 b) { return lm_v2(a.x - b.x, a.y - b.y); }
+static inline lm_vec2  lm_mul2      (lm_vec2 a, lm_vec2 b) { return lm_v2(a.x * b.x, a.y * b.y); }
+static inline lm_vec2  lm_scale2    (lm_vec2 a, float   b) { return lm_v2(a.x * b, a.y * b); }
+static inline lm_vec2  lm_div2      (lm_vec2 a, float   b) { return lm_scale2(a, 1.0f / b); }
+static inline lm_vec2  lm_pmod2     (lm_vec2 a, float   b) { return lm_v2(lm_pmodf(a.x, b), lm_pmodf(a.y, b)); }
+static inline lm_vec2  lm_min2      (lm_vec2 a, lm_vec2 b) { return lm_v2(lm_minf(a.x, b.x), lm_minf(a.y, b.y)); }
+static inline lm_vec2  lm_max2      (lm_vec2 a, lm_vec2 b) { return lm_v2(lm_maxf(a.x, b.x), lm_maxf(a.y, b.y)); }
+static inline lm_vec2  lm_abs2      (lm_vec2 a           ) { return lm_v2(lm_absf(a.x), lm_absf(a.y)); }
+static inline lm_vec2  lm_floor2    (lm_vec2 a           ) { return lm_v2(floorf(a.x), floorf(a.y)); }
+static inline lm_vec2  lm_ceil2     (lm_vec2 a           ) { return lm_v2(ceilf (a.x), ceilf (a.y)); }
+static inline float    lm_dot2      (lm_vec2 a, lm_vec2 b) { return a.x * b.x + a.y * b.y; }
+static inline float    lm_cross2    (lm_vec2 a, lm_vec2 b) { return a.x * b.y - a.y * b.x; } // pseudo cross product
+static inline float    lm_length2sq (lm_vec2 a           ) { return a.x * a.x + a.y * a.y; }
+static inline float    lm_length2   (lm_vec2 a           ) { return sqrtf(lm_length2sq(a)); }
+static inline lm_vec2  lm_normalize2(lm_vec2 a           ) { return lm_div2(a, lm_length2(a)); }
+static inline lm_bool  lm_finite2   (lm_vec2 a           ) { return lm_finite(a.x) && lm_finite(a.y); }
+
+typedef struct lm_vec3 { float x, y, z; } lm_vec3;
+static inline lm_vec3  lm_v3        (float   x, float   y, float   z) { lm_vec3 v = { x, y, z }; return v; }
+static inline lm_vec3  lm_negate3   (lm_vec3 a           ) { return lm_v3(-a.x, -a.y, -a.z); }
+static inline lm_vec3  lm_add3      (lm_vec3 a, lm_vec3 b) { return lm_v3(a.x + b.x, a.y + b.y, a.z + b.z); }
+static inline lm_vec3  lm_sub3      (lm_vec3 a, lm_vec3 b) { return lm_v3(a.x - b.x, a.y - b.y, a.z - b.z); }
+static inline lm_vec3  lm_mul3      (lm_vec3 a, lm_vec3 b) { return lm_v3(a.x * b.x, a.y * b.y, a.z * b.z); }
+static inline lm_vec3  lm_scale3    (lm_vec3 a, float   b) { return lm_v3(a.x * b, a.y * b, a.z * b); }
+static inline lm_vec3  lm_div3      (lm_vec3 a, float   b) { return lm_scale3(a, 1.0f / b); }
+static inline lm_vec3  lm_pmod3     (lm_vec3 a, float   b) { return lm_v3(lm_pmodf(a.x, b), lm_pmodf(a.y, b), lm_pmodf(a.z, b)); }
+static inline lm_vec3  lm_min3      (lm_vec3 a, lm_vec3 b) { return lm_v3(lm_minf(a.x, b.x), lm_minf(a.y, b.y), lm_minf(a.z, b.z)); }
+static inline lm_vec3  lm_max3      (lm_vec3 a, lm_vec3 b) { return lm_v3(lm_maxf(a.x, b.x), lm_maxf(a.y, b.y), lm_maxf(a.z, b.z)); }
+static inline lm_vec3  lm_abs3      (lm_vec3 a           ) { return lm_v3(lm_absf(a.x), lm_absf(a.y), lm_absf(a.z)); }
+static inline lm_vec3  lm_floor3    (lm_vec3 a           ) { return lm_v3(floorf(a.x), floorf(a.y), floorf(a.z)); }
+static inline lm_vec3  lm_ceil3     (lm_vec3 a           ) { return lm_v3(ceilf (a.x), ceilf (a.y), ceilf (a.z)); }
+static inline float    lm_dot3      (lm_vec3 a, lm_vec3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; }
+static inline lm_vec3  lm_cross3    (lm_vec3 a, lm_vec3 b) { return lm_v3(a.y * b.z - b.y * a.z, a.z * b.x - b.z * a.x, a.x * b.y - b.x * a.y); }
+static inline float    lm_length3sq (lm_vec3 a           ) { return a.x * a.x + a.y * a.y + a.z * a.z; }
+static inline float    lm_length3   (lm_vec3 a           ) { return sqrtf(lm_length3sq(a)); }
+static inline lm_vec3  lm_normalize3(lm_vec3 a           ) { return lm_div3(a, lm_length3(a)); }
+static inline lm_bool  lm_finite3   (lm_vec3 a           ) { return lm_finite(a.x) && lm_finite(a.y) && lm_finite(a.z); }
+
+static lm_vec2 lm_toBarycentric(lm_vec2 p1, lm_vec2 p2, lm_vec2 p3, lm_vec2 p)
+{
+	// http://www.blackpawn.com/texts/pointinpoly/
+	// Compute vectors
+	lm_vec2 v0 = lm_sub2(p3, p1);
+	lm_vec2 v1 = lm_sub2(p2, p1);
+	lm_vec2 v2 = lm_sub2(p, p1);
+	// Compute dot products
+	float dot00 = lm_dot2(v0, v0);
+	float dot01 = lm_dot2(v0, v1);
+	float dot02 = lm_dot2(v0, v2);
+	float dot11 = lm_dot2(v1, v1);
+	float dot12 = lm_dot2(v1, v2);
+	// Compute barycentric coordinates
+	float invDenom = 1.0f / (dot00 * dot11 - dot01 * dot01);
+	float u = (dot11 * dot02 - dot01 * dot12) * invDenom;
+	float v = (dot00 * dot12 - dot01 * dot02) * invDenom;
+	return lm_v2(u, v);
+}
+
+static inline int lm_leftOf(lm_vec2 a, lm_vec2 b, lm_vec2 c)
+{
+	float x = lm_cross2(lm_sub2(b, a), lm_sub2(c, b));
+	return x < 0 ? -1 : x > 0;
+}
+
+static lm_bool lm_lineIntersection(lm_vec2 x0, lm_vec2 x1, lm_vec2 y0, lm_vec2 y1, lm_vec2* res)
+{
+	lm_vec2 dx = lm_sub2(x1, x0);
+	lm_vec2 dy = lm_sub2(y1, y0);
+	lm_vec2 d = lm_sub2(x0, y0);
+	float dyx = lm_cross2(dy, dx);
+	if (dyx == 0.0f)
+		return LM_FALSE;
+	dyx = lm_cross2(d, dx) / dyx;
+	if (dyx <= 0 || dyx >= 1)
+		return LM_FALSE;
+	res->x = y0.x + dyx * dy.x;
+	res->y = y0.y + dyx * dy.y;
+	return LM_TRUE;
+}
+
+// this modifies the poly array! the poly array must be big enough to hold the result!
+// res must be big enough to hold the result!
+static int lm_convexClip(lm_vec2 *poly, int nPoly, const lm_vec2 *clip, int nClip, lm_vec2 *res)
+{
+	int nRes = nPoly;
+	int dir = lm_leftOf(clip[0], clip[1], clip[2]);
+	for (int i = 0, j = nClip - 1; i < nClip && nRes; j = i++)
+	{
+		if (i != 0)
+			for (nPoly = 0; nPoly < nRes; nPoly++)
+				poly[nPoly] = res[nPoly];
+		nRes = 0;
+		lm_vec2 v0 = poly[nPoly - 1];
+		int side0 = lm_leftOf(clip[j], clip[i], v0);
+		if (side0 != -dir)
+			res[nRes++] = v0;
+		for (int k = 0; k < nPoly; k++)
+		{
+			lm_vec2 v1 = poly[k], x;
+			int side1 = lm_leftOf(clip[j], clip[i], v1);
+			if (side0 + side1 == 0 && side0 && lm_lineIntersection(clip[j], clip[i], v0, v1, &x))
+				res[nRes++] = x;
+			if (k == nPoly - 1)
+				break;
+			if (side1 != -dir)
+				res[nRes++] = v1;
+			v0 = v1;
+			side0 = side1;
+		}
+	}
+
+	return nRes;
+}
+
+struct lm_context
+{
+	struct
+	{
+		const float *modelMatrix;
+		float normalMatrix[9];
+
+		const unsigned char *positions;
+		lm_type positionsType;
+		int positionsStride;
+		const unsigned char *normals;
+		lm_type normalsType;
+		int normalsStride;
+		const unsigned char *uvs;
+		lm_type uvsType;
+		int uvsStride;
+		const unsigned char *indices;
+		lm_type indicesType;
+		unsigned int count;
+	} mesh;
+
+	struct
+	{
+		int pass;
+		int passCount;
+
+		struct
+		{
+			unsigned int baseIndex;
+			lm_vec3 p[3];
+			lm_vec3 n[3];
+			lm_vec2 uv[3];
+		} triangle;
+
+		struct
+		{
+			int minx, miny;
+			int maxx, maxy;
+			int x, y;
+		} rasterizer;
+
+		struct
+		{
+			lm_vec3 position;
+			lm_vec3 direction;
+			lm_vec3 up;
+		} sample;
+
+		struct
+		{
+			int side;
+		} hemisphere;
+	} meshPosition;
+
+	struct
+	{
+		int width;
+		int height;
+		int channels;
+		float *data;
+
+#ifdef LM_DEBUG_INTERPOLATION
+		unsigned char *debug;
+#endif
+	} lightmap;
+
+	struct
+	{
+		unsigned int size;
+		float zNear, zFar;
+		float cameraToSurfaceDistanceModifier;
+		struct { float r, g, b; } clearColor;
+
+		unsigned int fbHemiCountX;
+		unsigned int fbHemiCountY;
+		unsigned int fbHemiIndex;
+		lm_ivec2 *fbHemiToLightmapLocation;
+		GLuint fbTexture[2];
+		GLuint fb[2];
+		GLuint fbDepth;
+		GLuint vao;
+		struct
+		{
+			GLuint programID;
+			GLuint hemispheresTextureID;
+			GLuint weightsTextureID;
+			GLuint weightsTexture;
+		} firstPass;
+		struct
+		{
+			GLuint programID;
+			GLuint hemispheresTextureID;
+		} downsamplePass;
+		struct
+		{
+			GLuint texture;
+			lm_ivec2 writePosition;
+			lm_ivec2 *toLightmapLocation;
+		} storage;
+	} hemisphere;
+
+	float interpolationThreshold;
+};
+
+// pass order of one 4x4 interpolation patch for two interpolation steps (and the next neighbors right of/below it)
+// 0 4 1 4 0
+// 5 6 5 6 5
+// 2 4 3 4 2
+// 5 6 5 6 5
+// 0 4 1 4 0
+
+static unsigned int lm_passStepSize(lm_context *ctx)
+{
+	unsigned int shift = ctx->meshPosition.passCount / 3 - (ctx->meshPosition.pass - 1) / 3;
+	unsigned int step = (1 << shift);
+	assert(step > 0);
+	return step;
+}
+
+static unsigned int lm_passOffsetX(lm_context *ctx)
+{
+	if (!ctx->meshPosition.pass)
+		return 0;
+	int passType = (ctx->meshPosition.pass - 1) % 3;
+	unsigned int halfStep = lm_passStepSize(ctx) >> 1;
+	return passType != 1 ? halfStep : 0;
+}
+
+static unsigned int lm_passOffsetY(lm_context *ctx)
+{
+	if (!ctx->meshPosition.pass)
+		return 0;
+	int passType = (ctx->meshPosition.pass - 1) % 3;
+	unsigned int halfStep = lm_passStepSize(ctx) >> 1;
+	return passType != 0 ? halfStep : 0;
+}
+
+static lm_bool lm_hasConservativeTriangleRasterizerFinished(lm_context *ctx)
+{
+	return ctx->meshPosition.rasterizer.y >= ctx->meshPosition.rasterizer.maxy;
+}
+
+static void lm_moveToNextPotentialConservativeTriangleRasterizerPosition(lm_context *ctx)
+{
+	unsigned int step = lm_passStepSize(ctx);
+	ctx->meshPosition.rasterizer.x += step;
+	while (ctx->meshPosition.rasterizer.x >= ctx->meshPosition.rasterizer.maxx)
+	{
+		ctx->meshPosition.rasterizer.x = ctx->meshPosition.rasterizer.minx + lm_passOffsetX(ctx);
+		ctx->meshPosition.rasterizer.y += step;
+		if (lm_hasConservativeTriangleRasterizerFinished(ctx))
+			break;
+	}
+}
+
+static float *lm_getLightmapPixel(lm_context *ctx, int x, int y)
+{
+	assert(x >= 0 && x < ctx->lightmap.width && y >= 0 && y < ctx->lightmap.height);
+	return ctx->lightmap.data + (y * ctx->lightmap.width + x) * ctx->lightmap.channels;
+}
+
+static void lm_setLightmapPixel(lm_context *ctx, int x, int y, float *in)
+{
+	assert(x >= 0 && x < ctx->lightmap.width && y >= 0 && y < ctx->lightmap.height);
+	float *p = ctx->lightmap.data + (y * ctx->lightmap.width + x) * ctx->lightmap.channels;
+	for (int j = 0; j < ctx->lightmap.channels; j++)
+		*p++ = *in++;
+}
+
+#define lm_baseAngle 0.1f
+static const float lm_baseAngles[3][3] = {
+	{ lm_baseAngle, lm_baseAngle + 1.0f / 3.0f, lm_baseAngle + 2.0f / 3.0f },
+	{ lm_baseAngle + 1.0f / 3.0f, lm_baseAngle + 2.0f / 3.0f, lm_baseAngle },
+	{ lm_baseAngle + 2.0f / 3.0f, lm_baseAngle, lm_baseAngle + 1.0f / 3.0f }
+};
+
+static lm_bool lm_trySamplingConservativeTriangleRasterizerPosition(lm_context *ctx)
+{
+	if (lm_hasConservativeTriangleRasterizerFinished(ctx))
+		return LM_FALSE;
+
+	// check if lightmap pixel was already set
+	float *pixelValue = lm_getLightmapPixel(ctx, ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y);
+	for (int j = 0; j < ctx->lightmap.channels; j++)
+		if (pixelValue[j] != 0.0f)
+			return LM_FALSE;
+
+	// try calculating centroid by clipping the pixel against the triangle
+	lm_vec2 pixel[16];
+	pixel[0] = lm_v2i(ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y);
+	pixel[1] = lm_v2i(ctx->meshPosition.rasterizer.x + 1, ctx->meshPosition.rasterizer.y);
+	pixel[2] = lm_v2i(ctx->meshPosition.rasterizer.x + 1, ctx->meshPosition.rasterizer.y + 1);
+	pixel[3] = lm_v2i(ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y + 1);
+
+	lm_vec2 res[16];
+	int nRes = lm_convexClip(pixel, 4, ctx->meshPosition.triangle.uv, 3, res);
+	if (nRes == 0)
+		return LM_FALSE; // nothing left
+
+	// calculate centroid position and area
+	lm_vec2 centroid = res[0];
+	float area = res[nRes - 1].x * res[0].y - res[nRes - 1].y * res[0].x;
+	for (int i = 1; i < nRes; i++)
+	{
+		centroid = lm_add2(centroid, res[i]);
+		area += res[i - 1].x * res[i].y - res[i - 1].y * res[i].x;
+	}
+	centroid = lm_div2(centroid, (float)nRes);
+	area = lm_absf(area / 2.0f);
+
+	if (area <= 0.0f)
+		return LM_FALSE; // no area left
+
+	// calculate barycentric coords
+	lm_vec2 uv = lm_toBarycentric(
+		ctx->meshPosition.triangle.uv[0],
+		ctx->meshPosition.triangle.uv[1],
+		ctx->meshPosition.triangle.uv[2],
+		centroid);
+
+	if (!lm_finite2(uv))
+		return LM_FALSE; // degenerate
+
+	// try to interpolate color from neighbors:
+	if (ctx->meshPosition.pass > 0)
+	{
+		float *neighbors[4];
+		int neighborCount = 0;
+		int neighborsExpected = 0;
+		int d = (int)lm_passStepSize(ctx) / 2;
+		int dirs = ((ctx->meshPosition.pass - 1) % 3) + 1;
+		if (dirs & 1) // check x-neighbors with distance d
+		{
+			neighborsExpected += 2;
+			if (ctx->meshPosition.rasterizer.x - d >= ctx->meshPosition.rasterizer.minx &&
+				ctx->meshPosition.rasterizer.x + d <= ctx->meshPosition.rasterizer.maxx)
+			{
+				neighbors[neighborCount++] = lm_getLightmapPixel(ctx, ctx->meshPosition.rasterizer.x - d, ctx->meshPosition.rasterizer.y);
+				neighbors[neighborCount++] = lm_getLightmapPixel(ctx, ctx->meshPosition.rasterizer.x + d, ctx->meshPosition.rasterizer.y);
+			}
+		}
+		if (dirs & 2) // check y-neighbors with distance d
+		{
+			neighborsExpected += 2;
+			if (ctx->meshPosition.rasterizer.y - d >= ctx->meshPosition.rasterizer.miny &&
+				ctx->meshPosition.rasterizer.y + d <= ctx->meshPosition.rasterizer.maxy)
+			{
+				neighbors[neighborCount++] = lm_getLightmapPixel(ctx, ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y - d);
+				neighbors[neighborCount++] = lm_getLightmapPixel(ctx, ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y + d);
+			}
+		}
+		if (neighborCount == neighborsExpected) // are all interpolation neighbors available?
+		{
+			// calculate average neighbor pixel value
+			float avg[4] = { 0 };
+			for (int i = 0; i < neighborCount; i++)
+				for (int j = 0; j < ctx->lightmap.channels; j++)
+					avg[j] += neighbors[i][j];
+			float ni = 1.0f / neighborCount;
+			for (int j = 0; j < ctx->lightmap.channels; j++)
+				avg[j] *= ni;
+
+			// check if error from average pixel to neighbors is above the interpolation threshold
+			lm_bool interpolate = LM_TRUE;
+			for (int i = 0; i < neighborCount; i++)
+			{
+				lm_bool zero = LM_TRUE;
+				for (int j = 0; j < ctx->lightmap.channels; j++)
+				{
+					if (neighbors[i][j] != 0.0f)
+						zero = LM_FALSE;
+					if (fabs(neighbors[i][j] - avg[j]) > ctx->interpolationThreshold)
+						interpolate = LM_FALSE;
+				}
+				if (zero)
+					interpolate = LM_FALSE;
+				if (!interpolate)
+					break;
+			}
+
+			// set interpolated value and return if interpolation is acceptable
+			if (interpolate)
+			{
+				lm_setLightmapPixel(ctx, ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y, avg);
+#ifdef LM_DEBUG_INTERPOLATION
+				// set interpolated pixel to green in debug output
+				ctx->lightmap.debug[(ctx->meshPosition.rasterizer.y * ctx->lightmap.width + ctx->meshPosition.rasterizer.x) * 3 + 1] = 255;
+#endif
+				return LM_FALSE;
+			}
+		}
+	}
+
+	// could not interpolate. must render a hemisphere.
+	// calculate 3D sample position and orientation
+	lm_vec3 p0 = ctx->meshPosition.triangle.p[0];
+	lm_vec3 p1 = ctx->meshPosition.triangle.p[1];
+	lm_vec3 p2 = ctx->meshPosition.triangle.p[2];
+	lm_vec3 v1 = lm_sub3(p1, p0);
+	lm_vec3 v2 = lm_sub3(p2, p0);
+	ctx->meshPosition.sample.position = lm_add3(p0, lm_add3(lm_scale3(v2, uv.x), lm_scale3(v1, uv.y)));
+
+	lm_vec3 n0 = ctx->meshPosition.triangle.n[0];
+	lm_vec3 n1 = ctx->meshPosition.triangle.n[1];
+	lm_vec3 n2 = ctx->meshPosition.triangle.n[2];
+	lm_vec3 nv1 = lm_sub3(n1, n0);
+	lm_vec3 nv2 = lm_sub3(n2, n0);
+	ctx->meshPosition.sample.direction = lm_normalize3(lm_add3(n0, lm_add3(lm_scale3(nv2, uv.x), lm_scale3(nv1, uv.y))));
+	ctx->meshPosition.sample.direction = lm_normalize3(ctx->meshPosition.sample.direction);
+	float cameraToSurfaceDistance = (1.0f + ctx->hemisphere.cameraToSurfaceDistanceModifier) * ctx->hemisphere.zNear * sqrtf(2.0f);
+	ctx->meshPosition.sample.position = lm_add3(ctx->meshPosition.sample.position, lm_scale3(ctx->meshPosition.sample.direction, cameraToSurfaceDistance));
+
+	if (!lm_finite3(ctx->meshPosition.sample.position) ||
+		!lm_finite3(ctx->meshPosition.sample.direction) ||
+		lm_length3sq(ctx->meshPosition.sample.direction) < 0.5f) // don't allow 0.0f. should always be ~1.0f
+		return LM_FALSE;
+
+	lm_vec3 up = lm_v3(0.0f, 1.0f, 0.0f);
+	if (lm_absf(lm_dot3(up, ctx->meshPosition.sample.direction)) > 0.8f)
+		up = lm_v3(0.0f, 0.0f, 1.0f);
+
+#if 0
+	// triangle-consistent up vector
+	ctx->meshPosition.sample.up = lm_normalize3(lm_cross3(up, ctx->meshPosition.sample.direction));
+	return LM_TRUE;
+#else
+	// "randomized" rotation with pattern
+	lm_vec3 side = lm_normalize3(lm_cross3(up, ctx->meshPosition.sample.direction));
+	up = lm_normalize3(lm_cross3(side, ctx->meshPosition.sample.direction));
+	int rx = ctx->meshPosition.rasterizer.x % 3;
+	int ry = ctx->meshPosition.rasterizer.y % 3;
+	static const float lm_pi = 3.14159265358979f;
+	float phi = 2.0f * lm_pi * lm_baseAngles[ry][rx] + 0.1f * ((float)rand() / (float)RAND_MAX);
+	ctx->meshPosition.sample.up = lm_normalize3(lm_add3(lm_scale3(side, cosf(phi)), lm_scale3(up, sinf(phi))));
+	return LM_TRUE;
+#endif
+}
+
+// returns true if a sampling position was found and
+// false if we finished rasterizing the current triangle
+static lm_bool lm_findFirstConservativeTriangleRasterizerPosition(lm_context *ctx)
+{
+	while (!lm_trySamplingConservativeTriangleRasterizerPosition(ctx))
+	{
+		lm_moveToNextPotentialConservativeTriangleRasterizerPosition(ctx);
+		if (lm_hasConservativeTriangleRasterizerFinished(ctx))
+			return LM_FALSE;
+	}
+	return LM_TRUE;
+}
+
+static lm_bool lm_findNextConservativeTriangleRasterizerPosition(lm_context *ctx)
+{
+	lm_moveToNextPotentialConservativeTriangleRasterizerPosition(ctx);
+	return lm_findFirstConservativeTriangleRasterizerPosition(ctx);
+}
+
+static void lm_integrateHemisphereBatch(lm_context *ctx)
+{
+	if (!ctx->hemisphere.fbHemiIndex)
+		return; // nothing to do
+
+	glDisable(GL_DEPTH_TEST);
+	glBindVertexArray(ctx->hemisphere.vao);
+
+	int fbRead = 0;
+	int fbWrite = 1;
+
+	// weighted downsampling pass
+	int outHemiSize = ctx->hemisphere.size / 2;
+	glBindFramebuffer(GL_FRAMEBUFFER, ctx->hemisphere.fb[fbWrite]);
+	glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, ctx->hemisphere.fbTexture[fbWrite], 0);
+	glViewport(0, 0, outHemiSize * ctx->hemisphere.fbHemiCountX, outHemiSize * ctx->hemisphere.fbHemiCountY);
+	glUseProgram(ctx->hemisphere.firstPass.programID);
+	glUniform1i(ctx->hemisphere.firstPass.hemispheresTextureID, 0);
+	glActiveTexture(GL_TEXTURE0);
+	glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.fbTexture[fbRead]);
+	glUniform1i(ctx->hemisphere.firstPass.weightsTextureID, 1);
+	glActiveTexture(GL_TEXTURE1);
+	glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.firstPass.weightsTexture);
+	glActiveTexture(GL_TEXTURE0);
+	glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+	//glBindTexture(GL_TEXTURE_2D, 0);
+
+#if 0
+	// debug output
+	int w = outHemiSize * ctx->hemisphere.fbHemiCountX, h = outHemiSize * ctx->hemisphere.fbHemiCountY;
+	glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
+	glBindFramebuffer(GL_READ_FRAMEBUFFER, ctx->hemisphere.fb[fbWrite]);
+	glReadBuffer(GL_COLOR_ATTACHMENT0);
+	float *image = new float[3 * w * h];
+	glReadPixels(0, 0, w, h, GL_RGB, GL_FLOAT, image);
+	lmImageSaveTGAf("firstpass.png", image, w, h, 3);
+	delete[] image;
+#endif
+
+	// downsampling passes
+	glUseProgram(ctx->hemisphere.downsamplePass.programID);
+	glUniform1i(ctx->hemisphere.downsamplePass.hemispheresTextureID, 0);
+	while (outHemiSize > 1)
+	{
+		LM_SWAP(int, fbRead, fbWrite);
+		outHemiSize /= 2;
+		glBindFramebuffer(GL_FRAMEBUFFER, ctx->hemisphere.fb[fbWrite]);
+		glViewport(0, 0, outHemiSize * ctx->hemisphere.fbHemiCountX, outHemiSize * ctx->hemisphere.fbHemiCountY);
+		glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.fbTexture[fbRead]);
+		glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+		//glBindTexture(GL_TEXTURE_2D, 0);
+	}
+
+	// copy results to storage texture
+	glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.storage.texture);
+	glCopyTexSubImage2D(GL_TEXTURE_2D, 0,
+		ctx->hemisphere.storage.writePosition.x, ctx->hemisphere.storage.writePosition.y,
+		0, 0, ctx->hemisphere.fbHemiCountX, ctx->hemisphere.fbHemiCountY);
+	glBindTexture(GL_TEXTURE_2D, 0);
+	glBindFramebuffer(GL_FRAMEBUFFER, 0);
+	glBindVertexArray(0);
+	glEnable(GL_DEPTH_TEST);
+
+	// copy position mapping to storage
+	for (unsigned int y = 0; y < ctx->hemisphere.fbHemiCountY; y++)
+	{
+		int sy = ctx->hemisphere.storage.writePosition.y + y;
+		for (unsigned int x = 0; x < ctx->hemisphere.fbHemiCountX; x++)
+		{
+			int sx = ctx->hemisphere.storage.writePosition.x + x;
+			unsigned int hemiIndex = y * ctx->hemisphere.fbHemiCountX + x;
+			if (hemiIndex >= ctx->hemisphere.fbHemiIndex)
+				ctx->hemisphere.storage.toLightmapLocation[sy * ctx->lightmap.width + sx] = lm_i2(-1, -1);
+			else
+				ctx->hemisphere.storage.toLightmapLocation[sy * ctx->lightmap.width + sx] = ctx->hemisphere.fbHemiToLightmapLocation[hemiIndex];
+		}
+	}
+
+	// advance storage texture write position
+	ctx->hemisphere.storage.writePosition.x += ctx->hemisphere.fbHemiCountX;
+	if (ctx->hemisphere.storage.writePosition.x + (int)ctx->hemisphere.fbHemiCountX > ctx->lightmap.width)
+	{
+		ctx->hemisphere.storage.writePosition.x = 0;
+		ctx->hemisphere.storage.writePosition.y += ctx->hemisphere.fbHemiCountY;
+		assert(ctx->hemisphere.storage.writePosition.y + (int)ctx->hemisphere.fbHemiCountY < ctx->lightmap.height);
+	}
+
+	ctx->hemisphere.fbHemiIndex = 0;
+}
+
+static void lm_writeResultsToLightmap(lm_context *ctx)
+{
+	// do the GPU->CPU transfer of downsampled hemispheres
+	float *hemi = (float*)LM_CALLOC(ctx->lightmap.width * ctx->lightmap.height, 4 * sizeof(float));
+	glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.storage.texture);
+	glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA, GL_FLOAT, hemi);
+
+	// write results to lightmap texture
+	for (int y = 0; y < ctx->hemisphere.storage.writePosition.y + (int)ctx->hemisphere.fbHemiCountY; y++)
+	{
+		for (int x = 0; x < ctx->lightmap.width; x++)
+		{
+			lm_ivec2 lmUV = ctx->hemisphere.storage.toLightmapLocation[y * ctx->lightmap.width + x];
+			if (lmUV.x >= 0)
+			{
+				float *c = hemi + (y * ctx->lightmap.width + x) * 4;
+				float validity = c[3];
+				float *lm = ctx->lightmap.data + (lmUV.y * ctx->lightmap.width + lmUV.x) * ctx->lightmap.channels;
+				if (!lm[0] && validity > 0.9)
+				{
+					float scale = 1.0f / validity;
+					switch (ctx->lightmap.channels)
+					{
+					case 1:
+						lm[0] = lm_maxf((c[0] + c[1] + c[2]) * scale / 3.0f, FLT_MIN);
+						break;
+					case 2:
+						lm[0] = lm_maxf((c[0] + c[1] + c[2]) * scale / 3.0f, FLT_MIN);
+						lm[1] = 1.0f; // do we want to support this format?
+						break;
+					case 3:
+						lm[0] = lm_maxf(c[0] * scale, FLT_MIN);
+						lm[1] = lm_maxf(c[1] * scale, FLT_MIN);
+						lm[2] = lm_maxf(c[2] * scale, FLT_MIN);
+						break;
+					case 4:
+						lm[0] = lm_maxf(c[0] * scale, FLT_MIN);
+						lm[1] = lm_maxf(c[1] * scale, FLT_MIN);
+						lm[2] = lm_maxf(c[2] * scale, FLT_MIN);
+						lm[3] = 1.0f;
+						break;
+					default:
+						assert(LM_FALSE);
+						break;
+					}
+
+#ifdef LM_DEBUG_INTERPOLATION
+					// set sampled pixel to red in debug output
+					ctx->lightmap.debug[(lmUV.y * ctx->lightmap.width + lmUV.x) * 3 + 0] = 255;
+#endif
+				}
+			}
+			ctx->hemisphere.storage.toLightmapLocation[y * ctx->lightmap.width + x].x = -1; // reset
+		}
+	}
+
+	LM_FREE(hemi);
+	ctx->hemisphere.storage.writePosition = lm_i2(0, 0);
+}
+
+static void lm_setView(
+	int* viewport, int x, int y, int w, int h,
+	float* view,   lm_vec3 pos, lm_vec3 dir, lm_vec3 up,
+	float* proj,   float l, float r, float b, float t, float n, float f)
+{
+	// viewport
+	viewport[0] = x; viewport[1] = y; viewport[2] = w; viewport[3] = h;
+
+	// view matrix: lookAt(pos, pos + dir, up)
+	lm_vec3 side = lm_cross3(dir, up);
+	//up = cross(side, dir);
+	dir = lm_negate3(dir); pos = lm_negate3(pos);
+	view[ 0] = side.x;             view[ 1] = up.x;             view[ 2] = dir.x;             view[ 3] = 0.0f;
+	view[ 4] = side.y;             view[ 5] = up.y;             view[ 6] = dir.y;             view[ 7] = 0.0f;
+	view[ 8] = side.z;             view[ 9] = up.z;             view[10] = dir.z;             view[11] = 0.0f;
+	view[12] = lm_dot3(side, pos); view[13] = lm_dot3(up, pos); view[14] = lm_dot3(dir, pos); view[15] = 1.0f;
+
+	// projection matrix: frustum(l, r, b, t, n, f)
+	float ilr = 1.0f / (r - l), ibt = 1.0f / (t - b), ninf = -1.0f / (f - n), n2 = 2.0f * n;
+	proj[ 0] = n2 * ilr;      proj[ 1] = 0.0f;          proj[ 2] = 0.0f;           proj[ 3] = 0.0f;
+	proj[ 4] = 0.0f;          proj[ 5] = n2 * ibt;      proj[ 6] = 0.0f;           proj[ 7] = 0.0f;
+	proj[ 8] = (r + l) * ilr; proj[ 9] = (t + b) * ibt; proj[10] = (f + n) * ninf; proj[11] = -1.0f;
+	proj[12] = 0.0f;          proj[13] = 0.0f;          proj[14] = f * n2 * ninf;  proj[15] = 0.0f;
+}
+
+// returns true if a hemisphere side was prepared for rendering and
+// false if we finished the current hemisphere
+static lm_bool lm_beginSampleHemisphere(lm_context *ctx, int* viewport, float* view, float* proj)
+{
+	if (ctx->meshPosition.hemisphere.side >= 5)
+		return LM_FALSE;
+
+	if (ctx->meshPosition.hemisphere.side == 0)
+	{
+		// prepare hemisphere
+		glBindFramebuffer(GL_FRAMEBUFFER, ctx->hemisphere.fb[0]);
+		if (ctx->hemisphere.fbHemiIndex == 0)
+		{
+			// prepare hemisphere batch
+			glClearColor( // clear to valid background pixels!
+				ctx->hemisphere.clearColor.r,
+				ctx->hemisphere.clearColor.g,
+				ctx->hemisphere.clearColor.b, 1.0f);
+			glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+		}
+		ctx->hemisphere.fbHemiToLightmapLocation[ctx->hemisphere.fbHemiIndex] =
+			lm_i2(ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y);
+	}
+
+	// find the target position in the batch
+	int x = (ctx->hemisphere.fbHemiIndex % ctx->hemisphere.fbHemiCountX) * ctx->hemisphere.size * 3;
+	int y = (ctx->hemisphere.fbHemiIndex / ctx->hemisphere.fbHemiCountX) * ctx->hemisphere.size;
+
+	int size = ctx->hemisphere.size;
+	float zNear = ctx->hemisphere.zNear;
+	float zFar = ctx->hemisphere.zFar;
+
+	lm_vec3 pos = ctx->meshPosition.sample.position;
+	lm_vec3 dir = ctx->meshPosition.sample.direction;
+	lm_vec3 up = ctx->meshPosition.sample.up;
+	lm_vec3 right = lm_cross3(dir, up);
+
+	// find the view parameters of the hemisphere side that we will render next
+	// hemisphere layout in the framebuffer:
+	//       +-------+---+---+-------+
+	//       |       |   |   |   D   |
+	//       |   C   | R | L +-------+
+	//       |       |   |   |   U   |
+	//       +-------+---+---+-------+
+	switch (ctx->meshPosition.hemisphere.side)
+	{
+	case 0: // center
+		lm_setView(viewport, x, y, size, size,
+				   view,     pos, dir, up,
+				   proj,     -zNear, zNear, -zNear, zNear, zNear, zFar);
+		break;
+	case 1: // right
+		lm_setView(viewport, size + x, y, size / 2, size,
+				   view,     pos, right, up,
+				   proj,     -zNear, 0.0f, -zNear, zNear, zNear, zFar);
+		break;
+	case 2: // left
+		lm_setView(viewport, size + x + size / 2, y, size / 2, size,
+				   view,     pos, lm_negate3(right), up,
+				   proj,     0.0f, zNear, -zNear, zNear, zNear, zFar);
+		break;
+	case 3: // down
+		lm_setView(viewport, 2 * size + x, y + size / 2, size, size / 2,
+				   view,     pos, lm_negate3(up), dir,
+				   proj,     -zNear, zNear, 0.0f, zNear, zNear, zFar);
+		break;
+	case 4: // up
+		lm_setView(viewport, 2 * size + x, y, size, size / 2,
+				   view,     pos, up, lm_negate3(dir),
+				   proj,     -zNear, zNear, -zNear, 0.0f, zNear, zFar);
+		break;
+	default:
+		assert(LM_FALSE);
+		break;
+	}
+
+	return LM_TRUE;
+}
+
+static void lm_endSampleHemisphere(lm_context *ctx)
+{
+	if (++ctx->meshPosition.hemisphere.side == 5)
+	{
+		// finish hemisphere
+		glBindFramebuffer(GL_FRAMEBUFFER, 0);
+		if (++ctx->hemisphere.fbHemiIndex == ctx->hemisphere.fbHemiCountX * ctx->hemisphere.fbHemiCountY)
+		{
+			// downsample new hemisphere batch and store the results
+			lm_integrateHemisphereBatch(ctx);
+		}
+	}
+}
+
+static void lm_inverseTranspose(const float *m44, float *n33)
+{
+	if (!m44)
+	{
+		n33[0] = 1.0f; n33[1] = 0.0f; n33[2] = 0.0f;
+		n33[3] = 0.0f; n33[4] = 1.0f; n33[5] = 0.0f;
+		n33[6] = 0.0f; n33[7] = 0.0f; n33[8] = 1.0f;
+		return;
+	}
+
+	float determinant = m44[ 0] * (m44[ 5] * m44[10] - m44[ 9] * m44[ 6])
+					  - m44[ 1] * (m44[ 4] * m44[10] - m44[ 6] * m44[ 8])
+					  + m44[ 2] * (m44[ 4] * m44[ 9] - m44[ 5] * m44[ 8]);
+
+	assert(fabs(determinant) > FLT_EPSILON);
+	float rcpDeterminant = 1.0f / determinant;
+
+	n33[0] =  (m44[ 5] * m44[10] - m44[ 9] * m44[ 6]) * rcpDeterminant;
+	n33[3] = -(m44[ 1] * m44[10] - m44[ 2] * m44[ 9]) * rcpDeterminant;
+	n33[6] =  (m44[ 1] * m44[ 6] - m44[ 2] * m44[ 5]) * rcpDeterminant;
+	n33[1] = -(m44[ 4] * m44[10] - m44[ 6] * m44[ 8]) * rcpDeterminant;
+	n33[4] =  (m44[ 0] * m44[10] - m44[ 2] * m44[ 8]) * rcpDeterminant;
+	n33[7] = -(m44[ 0] * m44[ 6] - m44[ 4] * m44[ 2]) * rcpDeterminant;
+	n33[2] =  (m44[ 4] * m44[ 9] - m44[ 8] * m44[ 5]) * rcpDeterminant;
+	n33[5] = -(m44[ 0] * m44[ 9] - m44[ 8] * m44[ 1]) * rcpDeterminant;
+	n33[8] =  (m44[ 0] * m44[ 5] - m44[ 4] * m44[ 1]) * rcpDeterminant;
+}
+
+static lm_vec3 lm_transformNormal(const float *m, lm_vec3 n)
+{
+	lm_vec3 r;
+	r.x = m[0] * n.x + m[3] * n.y + m[6] * n.z;
+	r.y = m[1] * n.x + m[4] * n.y + m[7] * n.z;
+	r.z = m[2] * n.x + m[5] * n.y + m[8] * n.z;
+	return r;
+}
+
+static lm_vec3 lm_transformPosition(const float *m, lm_vec3 v)
+{
+	if (!m)
+		return v;
+	lm_vec3 r;
+	r.x =     m[0] * v.x + m[4] * v.y + m[ 8] * v.z + m[12];
+	r.y =     m[1] * v.x + m[5] * v.y + m[ 9] * v.z + m[13];
+	r.z =     m[2] * v.x + m[6] * v.y + m[10] * v.z + m[14];
+	float d = m[3] * v.x + m[7] * v.y + m[11] * v.z + m[15];
+	assert(lm_absf(d - 1.0f) < 0.00001f); // could divide by d, but this shouldn't be a projection transform!
+	return r;
+}
+
+static void lm_setMeshPosition(lm_context *ctx, unsigned int indicesTriangleBaseIndex)
+{
+	// fetch triangle at the specified indicesTriangleBaseIndex
+	ctx->meshPosition.triangle.baseIndex = indicesTriangleBaseIndex;
+
+	// load and transform triangle to process next
+	lm_vec2 uvMin = lm_v2(FLT_MAX, FLT_MAX), uvMax = lm_v2(-FLT_MAX, -FLT_MAX);
+	lm_vec2 uvScale = lm_v2i(ctx->lightmap.width, ctx->lightmap.height);
+	unsigned int vIndices[3];
+	for (int i = 0; i < 3; i++)
+	{
+		// decode index
+		unsigned int vIndex;
+		switch (ctx->mesh.indicesType)
+		{
+		case LM_NONE:
+			vIndex = ctx->meshPosition.triangle.baseIndex + i;
+			break;
+		case LM_UNSIGNED_BYTE:
+			vIndex = ((const unsigned char*)ctx->mesh.indices + ctx->meshPosition.triangle.baseIndex)[i];
+			break;
+		case LM_UNSIGNED_SHORT:
+			vIndex = ((const unsigned short*)ctx->mesh.indices + ctx->meshPosition.triangle.baseIndex)[i];
+			break;
+		case LM_UNSIGNED_INT:
+			vIndex = ((const unsigned int*)ctx->mesh.indices + ctx->meshPosition.triangle.baseIndex)[i];
+			break;
+		default:
+			assert(LM_FALSE);
+			break;
+		}
+		vIndices[i] = vIndex;
+
+		// decode and pre-transform vertex position
+		const void *pPtr = ctx->mesh.positions + vIndex * ctx->mesh.positionsStride;
+		lm_vec3 p;
+		switch (ctx->mesh.positionsType)
+		{
+		// TODO: signed formats
+		case LM_UNSIGNED_BYTE: {
+			const unsigned char *uc = (const unsigned char*)pPtr;
+			p = lm_v3(uc[0], uc[1], uc[2]);
+		} break;
+		case LM_UNSIGNED_SHORT: {
+			const unsigned short *us = (const unsigned short*)pPtr;
+			p = lm_v3(us[0], us[1], us[2]);
+		} break;
+		case LM_UNSIGNED_INT: {
+			const unsigned int *ui = (const unsigned int*)pPtr;
+			p = lm_v3((float)ui[0], (float)ui[1], (float)ui[2]);
+		} break;
+		case LM_FLOAT: {
+			p = *(const lm_vec3*)pPtr;
+		} break;
+		default: {
+			assert(LM_FALSE);
+		} break;
+		}
+		ctx->meshPosition.triangle.p[i] = lm_transformPosition(ctx->mesh.modelMatrix, p);
+
+		// decode and scale (to lightmap resolution) vertex lightmap texture coords
+		const void *uvPtr = ctx->mesh.uvs + vIndex * ctx->mesh.uvsStride;
+		lm_vec2 uv;
+		switch (ctx->mesh.uvsType)
+		{
+		case LM_UNSIGNED_BYTE: {
+			const unsigned char *uc = (const unsigned char*)uvPtr;
+			uv = lm_v2(uc[0] / (float)UCHAR_MAX, uc[1] / (float)UCHAR_MAX);
+		} break;
+		case LM_UNSIGNED_SHORT: {
+			const unsigned short *us = (const unsigned short*)uvPtr;
+			uv = lm_v2(us[0] / (float)USHRT_MAX, us[1] / (float)USHRT_MAX);
+		} break;
+		case LM_UNSIGNED_INT: {
+			const unsigned int *ui = (const unsigned int*)uvPtr;
+			uv = lm_v2(ui[0] / (float)UINT_MAX, ui[1] / (float)UINT_MAX);
+		} break;
+		case LM_FLOAT: {
+			uv = *(const lm_vec2*)uvPtr;
+		} break;
+		default: {
+			assert(LM_FALSE);
+		} break;
+		}
+
+		ctx->meshPosition.triangle.uv[i] = lm_mul2(lm_pmod2(uv, 1.0f), uvScale); // maybe clamp to 0.0-1.0 instead of pmod?
+
+		// update bounds on lightmap
+		uvMin = lm_min2(uvMin, ctx->meshPosition.triangle.uv[i]);
+		uvMax = lm_max2(uvMax, ctx->meshPosition.triangle.uv[i]);
+	}
+
+	lm_vec3 flatNormal = lm_cross3(
+		lm_sub3(ctx->meshPosition.triangle.p[1], ctx->meshPosition.triangle.p[0]),
+		lm_sub3(ctx->meshPosition.triangle.p[2], ctx->meshPosition.triangle.p[0]));
+
+	for (int i = 0; i < 3; i++)
+	{
+		// decode and pre-transform vertex normal
+		const void *nPtr = ctx->mesh.normals + vIndices[i] * ctx->mesh.normalsStride;
+		lm_vec3 n;
+		switch (ctx->mesh.normalsType)
+		{
+		// TODO: signed formats
+		case LM_FLOAT: {
+			n = *(const lm_vec3*)nPtr;
+		} break;
+		case LM_NONE: {
+			n = flatNormal;
+		} break;
+		default: {
+			assert(LM_FALSE);
+		} break;
+		}
+		ctx->meshPosition.triangle.n[i] = lm_normalize3(lm_transformNormal(ctx->mesh.normalMatrix, n));
+	}
+
+	// calculate area of interest (on lightmap) for conservative rasterization
+	lm_vec2 bbMin = lm_floor2(uvMin);
+	lm_vec2 bbMax = lm_ceil2 (uvMax);
+	ctx->meshPosition.rasterizer.minx = lm_maxi((int)bbMin.x - 1, 0);
+	ctx->meshPosition.rasterizer.miny = lm_maxi((int)bbMin.y - 1, 0);
+	ctx->meshPosition.rasterizer.maxx = lm_mini((int)bbMax.x + 1, ctx->lightmap.width - 1);
+	ctx->meshPosition.rasterizer.maxy = lm_mini((int)bbMax.y + 1, ctx->lightmap.height - 1);
+	assert(ctx->meshPosition.rasterizer.minx <= ctx->meshPosition.rasterizer.maxx &&
+		   ctx->meshPosition.rasterizer.miny <= ctx->meshPosition.rasterizer.maxy);
+	ctx->meshPosition.rasterizer.x = ctx->meshPosition.rasterizer.minx + lm_passOffsetX(ctx);
+	ctx->meshPosition.rasterizer.y = ctx->meshPosition.rasterizer.miny + lm_passOffsetY(ctx);
+
+	// try moving to first valid sample position
+	if (ctx->meshPosition.rasterizer.x <= ctx->meshPosition.rasterizer.maxx &&
+		ctx->meshPosition.rasterizer.y <= ctx->meshPosition.rasterizer.maxy &&
+		lm_findFirstConservativeTriangleRasterizerPosition(ctx))
+		ctx->meshPosition.hemisphere.side = 0; // we can start sampling the hemisphere
+	else
+		ctx->meshPosition.hemisphere.side = 5; // no samples on this triangle! put hemisphere sampler into finished state
+}
+
+static GLuint lm_LoadShader(GLenum type, const char *source)
+{
+	GLuint shader = glCreateShader(type);
+	if (shader == 0)
+	{
+		fprintf(stderr, "Could not create shader!\n");
+		return 0;
+	}
+	glShaderSource(shader, 1, &source, NULL);
+	glCompileShader(shader);
+	GLint compiled;
+	glGetShaderiv(shader, GL_COMPILE_STATUS, &compiled);
+	if (!compiled)
+	{
+		fprintf(stderr, "Could not compile shader!\n");
+		GLint infoLen = 0;
+		glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &infoLen);
+		if (infoLen)
+		{
+			char* infoLog = (char*)malloc(infoLen);
+			glGetShaderInfoLog(shader, infoLen, NULL, infoLog);
+			fprintf(stderr, "%s\n", infoLog);
+			free(infoLog);
+		}
+		glDeleteShader(shader);
+		return 0;
+	}
+	return shader;
+}
+
+static GLuint lm_LoadProgram(const char *vp, const char *fp)
+{
+	GLuint program = glCreateProgram();
+	if (program == 0)
+	{
+		fprintf(stderr, "Could not create program!\n");
+		return 0;
+	}
+	GLuint vertexShader = lm_LoadShader(GL_VERTEX_SHADER, vp);
+	GLuint fragmentShader = lm_LoadShader(GL_FRAGMENT_SHADER, fp);
+	glAttachShader(program, vertexShader);
+	glAttachShader(program, fragmentShader);
+	glLinkProgram(program);
+	glDeleteShader(vertexShader);
+	glDeleteShader(fragmentShader);
+	GLint linked;
+	glGetProgramiv(program, GL_LINK_STATUS, &linked);
+	if (!linked)
+	{
+		fprintf(stderr, "Could not link program!\n");
+		GLint infoLen = 0;
+		glGetProgramiv(program, GL_INFO_LOG_LENGTH, &infoLen);
+		if (infoLen)
+		{
+			char* infoLog = (char*)malloc(sizeof(char) * infoLen);
+			glGetProgramInfoLog(program, infoLen, NULL, infoLog);
+			fprintf(stderr, "%s\n", infoLog);
+			free(infoLog);
+		}
+		glDeleteProgram(program);
+		return 0;
+	}
+	return program;
+}
+
+static float lm_defaultWeights(float cos_theta, void *userdata)
+{
+	return 1.0f;
+}
+
+lm_context *lmCreate(int hemisphereSize, float zNear, float zFar,
+	float clearR, float clearG, float clearB,
+	int interpolationPasses, float interpolationThreshold,
+	float cameraToSurfaceDistanceModifier)
+{
+	assert(hemisphereSize == 512 || hemisphereSize == 256 || hemisphereSize == 128 ||
+		   hemisphereSize ==  64 || hemisphereSize ==  32 || hemisphereSize ==  16);
+	assert(zNear < zFar && zNear > 0.0f);
+	assert(cameraToSurfaceDistanceModifier >= -1.0f);
+	assert(interpolationPasses >= 0 && interpolationPasses <= 8);
+	assert(interpolationThreshold >= 0.0f);
+
+	lm_context *ctx = (lm_context*)LM_CALLOC(1, sizeof(lm_context));
+
+	ctx->meshPosition.passCount = 1 + 3 * interpolationPasses;
+	ctx->interpolationThreshold = interpolationThreshold;
+	ctx->hemisphere.size = hemisphereSize;
+	ctx->hemisphere.zNear = zNear;
+	ctx->hemisphere.zFar = zFar;
+	ctx->hemisphere.cameraToSurfaceDistanceModifier = cameraToSurfaceDistanceModifier;
+	ctx->hemisphere.clearColor.r = clearR;
+	ctx->hemisphere.clearColor.g = clearG;
+	ctx->hemisphere.clearColor.b = clearB;
+
+	// calculate hemisphere batch size
+	ctx->hemisphere.fbHemiCountX = 1536 / (3 * ctx->hemisphere.size);
+	ctx->hemisphere.fbHemiCountY = 512 / ctx->hemisphere.size;
+
+	// hemisphere batch framebuffers
+	unsigned int w[] = {
+		ctx->hemisphere.fbHemiCountX * ctx->hemisphere.size * 3,
+		ctx->hemisphere.fbHemiCountX * ctx->hemisphere.size / 2 };
+	unsigned int h[] = {
+		ctx->hemisphere.fbHemiCountY * ctx->hemisphere.size,
+		ctx->hemisphere.fbHemiCountY * ctx->hemisphere.size / 2 };
+
+	glGenTextures(2, ctx->hemisphere.fbTexture);
+	glGenFramebuffers(2, ctx->hemisphere.fb);
+	glGenRenderbuffers(1, &ctx->hemisphere.fbDepth);
+
+	glBindRenderbuffer(GL_RENDERBUFFER, ctx->hemisphere.fbDepth);
+	glRenderbufferStorage(GL_RENDERBUFFER, GL_DEPTH_COMPONENT24, w[0], h[0]);
+	glBindFramebuffer(GL_FRAMEBUFFER, ctx->hemisphere.fb[0]);
+	glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_RENDERBUFFER, ctx->hemisphere.fbDepth);
+	for (int i = 0; i < 2; i++)
+	{
+		glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.fbTexture[i]);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+		glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F, w[i], h[i], 0, GL_RGBA, GL_FLOAT, 0);
+
+		glBindFramebuffer(GL_FRAMEBUFFER, ctx->hemisphere.fb[i]);
+		glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, ctx->hemisphere.fbTexture[i], 0);
+		GLenum status = glCheckFramebufferStatus(GL_FRAMEBUFFER);
+		if (status != GL_FRAMEBUFFER_COMPLETE)
+		{
+			fprintf(stderr, "Could not create framebuffer!\n");
+			glDeleteRenderbuffers(1, &ctx->hemisphere.fbDepth);
+			glDeleteFramebuffers(2, ctx->hemisphere.fb);
+			glDeleteTextures(2, ctx->hemisphere.fbTexture);
+			LM_FREE(ctx);
+			return NULL;
+		}
+	}
+	glBindFramebuffer(GL_FRAMEBUFFER, 0);
+
+	// dummy vao for fullscreen quad rendering
+	glGenVertexArrays(1, &ctx->hemisphere.vao);
+
+	// hemisphere shader (weighted downsampling of the 3x1 hemisphere layout to a 0.5x0.5 square)
+	{
+		const char *vs =
+			"#version 150 core\n"
+			"const vec2 ps[4] = vec2[](vec2(1, -1), vec2(1, 1), vec2(-1, -1), vec2(-1, 1));\n"
+			"void main()\n"
+			"{\n"
+				"gl_Position = vec4(ps[gl_VertexID], 0, 1);\n"
+			"}\n";
+		const char *fs =
+			"#version 150 core\n"
+			"uniform sampler2D hemispheres;\n"
+			"uniform sampler2D weights;\n"
+
+			"layout(pixel_center_integer) in vec4 gl_FragCoord;\n" // whole integer values represent pixel centers, GL_ARB_fragment_coord_conventions
+
+			"out vec4 outColor;\n"
+
+			"vec4 weightedSample(ivec2 h_uv, ivec2 w_uv, ivec2 quadrant)\n"
+			"{\n"
+				"vec4 sample = texelFetch(hemispheres, h_uv + quadrant, 0);\n"
+				"vec2 weight = texelFetch(weights, w_uv + quadrant, 0).rg;\n"
+				"return vec4(sample.rgb * weight.r, sample.a * weight.g);\n"
+			"}\n"
+
+			"vec4 threeWeightedSamples(ivec2 h_uv, ivec2 w_uv, ivec2 offset)\n"
+			"{\n" // horizontal triple sum
+				"vec4 sum = weightedSample(h_uv, w_uv, offset);\n"
+				"sum += weightedSample(h_uv, w_uv, offset + ivec2(2, 0));\n"
+				"sum += weightedSample(h_uv, w_uv, offset + ivec2(4, 0));\n"
+				"return sum;\n"
+			"}\n"
+
+			"void main()\n"
+			"{\n" // this is a weighted sum downsampling pass (alpha component contains the weighted valid sample count)
+				"vec2 in_uv = gl_FragCoord.xy * vec2(6.0, 2.0) + vec2(0.5);\n"
+				"ivec2 h_uv = ivec2(in_uv);\n"
+				"ivec2 w_uv = ivec2(mod(in_uv, vec2(textureSize(weights, 0))));\n" // there's no integer modulo :(
+				"vec4 lb = threeWeightedSamples(h_uv, w_uv, ivec2(0, 0));\n"
+				"vec4 rb = threeWeightedSamples(h_uv, w_uv, ivec2(1, 0));\n"
+				"vec4 lt = threeWeightedSamples(h_uv, w_uv, ivec2(0, 1));\n"
+				"vec4 rt = threeWeightedSamples(h_uv, w_uv, ivec2(1, 1));\n"
+				"outColor = lb + rb + lt + rt;\n"
+			"}\n";
+		ctx->hemisphere.firstPass.programID = lm_LoadProgram(vs, fs);
+		if (!ctx->hemisphere.firstPass.programID)
+		{
+			fprintf(stderr, "Error loading the hemisphere first pass shader program... leaving!\n");
+			glDeleteVertexArrays(1, &ctx->hemisphere.vao);
+			glDeleteRenderbuffers(1, &ctx->hemisphere.fbDepth);
+			glDeleteFramebuffers(2, ctx->hemisphere.fb);
+			glDeleteTextures(2, ctx->hemisphere.fbTexture);
+			LM_FREE(ctx);
+			return NULL;
+		}
+		ctx->hemisphere.firstPass.hemispheresTextureID = glGetUniformLocation(ctx->hemisphere.firstPass.programID, "hemispheres");
+		ctx->hemisphere.firstPass.weightsTextureID = glGetUniformLocation(ctx->hemisphere.firstPass.programID, "weights");
+	}
+
+	// downsample shader
+	{
+		const char *vs =
+			"#version 150 core\n"
+			"const vec2 ps[4] = vec2[](vec2(1, -1), vec2(1, 1), vec2(-1, -1), vec2(-1, 1));\n"
+			"void main()\n"
+			"{\n"
+				"gl_Position = vec4(ps[gl_VertexID], 0, 1);\n"
+			"}\n";
+		const char *fs =
+			"#version 150 core\n"
+			"uniform sampler2D hemispheres;\n"
+
+			"layout(pixel_center_integer) in vec4 gl_FragCoord;\n" // whole integer values represent pixel centers, GL_ARB_fragment_coord_conventions
+
+			"out vec4 outColor;\n"
+
+			"void main()\n"
+			"{\n" // this is a sum downsampling pass (alpha component contains the weighted valid sample count)
+				"ivec2 h_uv = ivec2(gl_FragCoord.xy) * 2;\n"
+				"vec4 lb = texelFetch(hemispheres, h_uv + ivec2(0, 0), 0);\n"
+				"vec4 rb = texelFetch(hemispheres, h_uv + ivec2(1, 0), 0);\n"
+				"vec4 lt = texelFetch(hemispheres, h_uv + ivec2(0, 1), 0);\n"
+				"vec4 rt = texelFetch(hemispheres, h_uv + ivec2(1, 1), 0);\n"
+				"outColor = lb + rb + lt + rt;\n"
+			"}\n";
+		ctx->hemisphere.downsamplePass.programID = lm_LoadProgram(vs, fs);
+		if (!ctx->hemisphere.downsamplePass.programID)
+		{
+			fprintf(stderr, "Error loading the hemisphere downsample pass shader program... leaving!\n");
+			glDeleteProgram(ctx->hemisphere.firstPass.programID);
+			glDeleteVertexArrays(1, &ctx->hemisphere.vao);
+			glDeleteRenderbuffers(1, &ctx->hemisphere.fbDepth);
+			glDeleteFramebuffers(2, ctx->hemisphere.fb);
+			glDeleteTextures(2, ctx->hemisphere.fbTexture);
+			LM_FREE(ctx);
+			return NULL;
+		}
+		ctx->hemisphere.downsamplePass.hemispheresTextureID = glGetUniformLocation(ctx->hemisphere.downsamplePass.programID, "hemispheres");
+	}
+
+	// hemisphere weights texture
+	glGenTextures(1, &ctx->hemisphere.firstPass.weightsTexture);
+	lmSetHemisphereWeights(ctx, lm_defaultWeights, 0);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
+
+	// allocate batchPosition-to-lightmapPosition map
+	ctx->hemisphere.fbHemiToLightmapLocation = (lm_ivec2*)LM_CALLOC(ctx->hemisphere.fbHemiCountX * ctx->hemisphere.fbHemiCountY, sizeof(lm_ivec2));
+
+	return ctx;
+}
+
+void lmDestroy(lm_context *ctx)
+{
+	// reset state
+	glUseProgram(0);
+	glBindTexture(GL_TEXTURE_2D, 0);
+	glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
+	glBindVertexArray(0);
+	glBindFramebuffer(GL_FRAMEBUFFER, 0);
+	glBindFramebuffer(GL_READ_FRAMEBUFFER, 0);
+	glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
+
+	// delete gl objects
+	glDeleteTextures(1, &ctx->hemisphere.firstPass.weightsTexture);
+	glDeleteTextures(1, &ctx->hemisphere.storage.texture);
+	glDeleteProgram(ctx->hemisphere.downsamplePass.programID);
+	glDeleteProgram(ctx->hemisphere.firstPass.programID);
+	glDeleteVertexArrays(1, &ctx->hemisphere.vao);
+	glDeleteRenderbuffers(1, &ctx->hemisphere.fbDepth);
+	glDeleteFramebuffers(2, ctx->hemisphere.fb);
+	glDeleteTextures(2, ctx->hemisphere.fbTexture);
+	glDeleteTextures(1, &ctx->hemisphere.storage.texture);
+
+	// free memory
+	LM_FREE(ctx->hemisphere.storage.toLightmapLocation);
+	LM_FREE(ctx->hemisphere.fbHemiToLightmapLocation);
+#ifdef LM_DEBUG_INTERPOLATION
+	LM_FREE(ctx->lightmap.debug);
+#endif
+	LM_FREE(ctx);
+}
+
+void lmSetHemisphereWeights(lm_context *ctx, lm_weight_func f, void *userdata)
+{
+	// hemisphere weights texture. bakes in material dependent attenuation behaviour.
+	float *weights = (float*)LM_CALLOC(2 * 3 * ctx->hemisphere.size * ctx->hemisphere.size, sizeof(float));
+	float center = (ctx->hemisphere.size - 1) * 0.5f;
+	double sum = 0.0;
+	for (unsigned int y = 0; y < ctx->hemisphere.size; y++)
+	{
+		float dy = 2.0f * (y - center) / (float)ctx->hemisphere.size;
+		for (unsigned int x = 0; x < ctx->hemisphere.size; x++)
+		{
+			float dx = 2.0f * (x - center) / (float)ctx->hemisphere.size;
+			lm_vec3 v = lm_normalize3(lm_v3(dx, dy, 1.0f));
+
+			float solidAngle = v.z * v.z * v.z;
+
+			float *w0 = weights + 2 * (y * (3 * ctx->hemisphere.size) + x);
+			float *w1 = w0 + 2 * ctx->hemisphere.size;
+			float *w2 = w1 + 2 * ctx->hemisphere.size;
+
+			// center weights
+			w0[0] = solidAngle * f(v.z, userdata);
+			w0[1] = solidAngle;
+
+			// left/right side weights
+			w1[0] = solidAngle * f(lm_absf(v.x), userdata);
+			w1[1] = solidAngle;
+
+			// up/down side weights
+			w2[0] = solidAngle * f(lm_absf(v.y), userdata);
+			w2[1] = solidAngle;
+
+			sum += 3.0 * (double)solidAngle;
+		}
+	}
+
+	// normalize weights
+	float weightScale = (float)(1.0 / sum);
+	for (unsigned int i = 0; i < 2 * 3 * ctx->hemisphere.size * ctx->hemisphere.size; i++)
+		weights[i] *= weightScale;
+
+	// upload weight texture
+	glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.firstPass.weightsTexture);
+	glTexImage2D(GL_TEXTURE_2D, 0, GL_RG32F, 3 * ctx->hemisphere.size, ctx->hemisphere.size, 0, GL_RG, GL_FLOAT, weights);
+	LM_FREE(weights);
+}
+
+void lmSetTargetLightmap(lm_context *ctx, float *outLightmap, int w, int h, int c)
+{
+	ctx->lightmap.data = outLightmap;
+	ctx->lightmap.width = w;
+	ctx->lightmap.height = h;
+	ctx->lightmap.channels = c;
+
+	// allocate storage texture
+	if (!ctx->hemisphere.storage.texture)
+		glGenTextures(1, &ctx->hemisphere.storage.texture);
+	glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.storage.texture);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+	glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F, w, h, 0, GL_RGBA, GL_FLOAT, 0);
+
+	// allocate storage position to lightmap position map
+	if (ctx->hemisphere.storage.toLightmapLocation)
+		LM_FREE(ctx->hemisphere.storage.toLightmapLocation);
+	ctx->hemisphere.storage.toLightmapLocation = (lm_ivec2*)LM_CALLOC(w * h, sizeof(lm_ivec2));
+	// invalidate all positions
+	for (int i = 0; i < w * h; i++)
+		ctx->hemisphere.storage.toLightmapLocation[i].x = -1;
+
+#ifdef LM_DEBUG_INTERPOLATION
+	if (ctx->lightmap.debug)
+		LM_FREE(ctx->lightmap.debug);
+	ctx->lightmap.debug = (unsigned char*)LM_CALLOC(ctx->lightmap.width * ctx->lightmap.height, 3);
+#endif
+}
+
+void lmSetGeometry(lm_context *ctx,
+	const float *transformationMatrix,
+	lm_type positionsType, const void *positionsXYZ, int positionsStride,
+	lm_type normalsType, const void *normalsXYZ, int normalsStride,
+	lm_type lightmapCoordsType, const void *lightmapCoordsUV, int lightmapCoordsStride,
+	int count, lm_type indicesType, const void *indices)
+{
+	ctx->mesh.modelMatrix = transformationMatrix;
+	ctx->mesh.positions = (const unsigned char*)positionsXYZ;
+	ctx->mesh.positionsType = positionsType;
+	ctx->mesh.positionsStride = positionsStride == 0 ? sizeof(lm_vec3) : positionsStride;
+	ctx->mesh.normals = (const unsigned char*)normalsXYZ;
+	ctx->mesh.normalsType = normalsType;
+	ctx->mesh.normalsStride = normalsStride == 0 ? sizeof(lm_vec3) : normalsStride;
+	ctx->mesh.uvs = (const unsigned char*)lightmapCoordsUV;
+	ctx->mesh.uvsType = lightmapCoordsType;
+	ctx->mesh.uvsStride = lightmapCoordsStride == 0 ? sizeof(lm_vec2) : lightmapCoordsStride;
+	ctx->mesh.indicesType = indicesType;
+	ctx->mesh.indices = (const unsigned char*)indices;
+	ctx->mesh.count = count;
+
+	lm_inverseTranspose(transformationMatrix, ctx->mesh.normalMatrix);
+
+	ctx->meshPosition.pass = 0;
+	lm_setMeshPosition(ctx, 0);
+}
+
+lm_bool lmBegin(lm_context *ctx, int* outViewport4, float* outView4x4, float* outProjection4x4)
+{
+	assert(ctx->meshPosition.triangle.baseIndex < ctx->mesh.count);
+	while (!lm_beginSampleHemisphere(ctx, outViewport4, outView4x4, outProjection4x4))
+	{ // as long as there are no hemisphere sides to render...
+		// try moving to the next rasterizer position
+		if (lm_findNextConservativeTriangleRasterizerPosition(ctx))
+		{ // if we successfully moved to the next sample position on the current triangle...
+			ctx->meshPosition.hemisphere.side = 0; // start sampling a hemisphere there
+		}
+		else
+		{ // if there are no valid sample positions on the current triangle...
+			if (ctx->meshPosition.triangle.baseIndex + 3 < ctx->mesh.count)
+			{ // ...and there are triangles left: move to the next triangle and continue sampling.
+				lm_setMeshPosition(ctx, ctx->meshPosition.triangle.baseIndex + 3);
+			}
+			else
+			{ // ...and there are no triangles left: finish
+				lm_integrateHemisphereBatch(ctx); // integrate and store last batch
+				lm_writeResultsToLightmap(ctx); // read storage data from gpu memory and write it to the lightmap
+
+				if (++ctx->meshPosition.pass == ctx->meshPosition.passCount)
+				{
+					ctx->meshPosition.pass = 0;
+					ctx->meshPosition.triangle.baseIndex = ctx->mesh.count; // set end condition (in case someone accidentally calls lmBegin again)
+
+#ifdef LM_DEBUG_INTERPOLATION
+					lmImageSaveTGAub("debug_interpolation.tga", ctx->lightmap.debug, ctx->lightmap.width, ctx->lightmap.height, 3);
+
+					// lightmap texel statistics
+					int rendered = 0, interpolated = 0, wasted = 0;
+					for (int y = 0; y < ctx->lightmap.height; y++)
+					{
+						for (int x = 0; x < ctx->lightmap.width; x++)
+						{
+							if (ctx->lightmap.debug[(y * ctx->lightmap.width + x) * 3 + 0])
+								rendered++;
+							else if (ctx->lightmap.debug[(y * ctx->lightmap.width + x) * 3 + 1])
+								interpolated++;
+							else
+								wasted++;
+						}
+					}
+					int used = rendered + interpolated;
+					int total = used + wasted;
+					printf("\n#######################################################################\n");
+					printf("%10d %6.2f%% rendered hemicubes integrated to lightmap texels.\n", rendered, 100.0f * (float)rendered / (float)total);
+					printf("%10d %6.2f%% interpolated lightmap texels.\n", interpolated, 100.0f * (float)interpolated / (float)total);
+					printf("%10d %6.2f%% wasted lightmap texels.\n", wasted, 100.0f * (float)wasted / (float)total);
+					printf("\n%17.2f%% of used texels were rendered.\n", 100.0f * (float)rendered / (float)used);
+					printf("#######################################################################\n");
+#endif
+
+					return LM_FALSE;
+				}
+
+				lm_setMeshPosition(ctx, 0); // start over with the next pass
+			}
+		}
+	}
+	return LM_TRUE;
+}
+
+float lmProgress(lm_context *ctx)
+{
+	float passProgress = (float)ctx->meshPosition.triangle.baseIndex / (float)ctx->mesh.count;
+	return ((float)ctx->meshPosition.pass + passProgress) / (float)ctx->meshPosition.passCount;
+}
+
+void lmEnd(lm_context *ctx)
+{
+	lm_endSampleHemisphere(ctx);
+}
+
+// these are not performance tuned since their impact on the whole lightmapping duration is insignificant
+float lmImageMin(const float *image, int w, int h, int c, int m)
+{
+	assert(c > 0 && m);
+	float minValue = FLT_MAX;
+	for (int i = 0; i < w * h; i++)
+		for (int j = 0; j < c; j++)
+			if (m & (1 << j))
+				minValue = lm_minf(minValue, image[i * c + j]);
+	return minValue;
+}
+
+float lmImageMax(const float *image, int w, int h, int c, int m)
+{
+	assert(c > 0 && m);
+	float maxValue = 0.0f;
+	for (int i = 0; i < w * h; i++)
+		for (int j = 0; j < c; j++)
+			if (m & (1 << j))
+				maxValue = lm_maxf(maxValue, image[i * c + j]);
+	return maxValue;
+}
+
+void lmImageAdd(float *image, int w, int h, int c, float value, int m)
+{
+	assert(c > 0 && m);
+	for (int i = 0; i < w * h; i++)
+		for (int j = 0; j < c; j++)
+			if (m & (1 << j))
+				image[i * c + j] += value;
+}
+
+void lmImageScale(float *image, int w, int h, int c, float factor, int m)
+{
+	assert(c > 0 && m);
+	for (int i = 0; i < w * h; i++)
+		for (int j = 0; j < c; j++)
+			if (m & (1 << j))
+				image[i * c + j] *= factor;
+}
+
+void lmImagePower(float *image, int w, int h, int c, float exponent, int m)
+{
+	assert(c > 0 && m);
+	for (int i = 0; i < w * h; i++)
+		for (int j = 0; j < c; j++)
+			if (m & (1 << j))
+				image[i * c + j] = powf(image[i * c + j], exponent);
+}
+
+void lmImageDilate(const float *image, float *outImage, int w, int h, int c)
+{
+	assert(c > 0 && c <= 4);
+	for (int y = 0; y < h; y++)
+	{
+		for (int x = 0; x < w; x++)
+		{
+			float color[4];
+			lm_bool valid = LM_FALSE;
+			for (int i = 0; i < c; i++)
+			{
+				color[i] = image[(y * w + x) * c + i];
+				valid |= color[i] > 0.0f;
+			}
+			if (!valid)
+			{
+				int n = 0;
+				const int dx[] = { -1, 0, 1,  0 };
+				const int dy[] = {  0, 1, 0, -1 };
+				for (int d = 0; d < 4; d++)
+				{
+					int cx = x + dx[d];
+					int cy = y + dy[d];
+					if (cx >= 0 && cx < w && cy >= 0 && cy < h)
+					{
+						float dcolor[4];
+						lm_bool dvalid = LM_FALSE;
+						for (int i = 0; i < c; i++)
+						{
+							dcolor[i] = image[(cy * w + cx) * c + i];
+							dvalid |= dcolor[i] > 0.0f;
+						}
+						if (dvalid)
+						{
+							for (int i = 0; i < c; i++)
+								color[i] += dcolor[i];
+							n++;
+						}
+					}
+				}
+				if (n)
+				{
+					float in = 1.0f / n;
+					for (int i = 0; i < c; i++)
+						color[i] *= in;
+				}
+			}
+			for (int i = 0; i < c; i++)
+				outImage[(y * w + x) * c + i] = color[i];
+		}
+	}
+}
+
+void lmImageSmooth(const float *image, float *outImage, int w, int h, int c)
+{
+	assert(c > 0 && c <= 4);
+	for (int y = 0; y < h; y++)
+	{
+		for (int x = 0; x < w; x++)
+		{
+			float color[4] = {0};
+			int n = 0;
+			for (int dy = -1; dy <= 1; dy++)
+			{
+				int cy = y + dy;
+				for (int dx = -1; dx <= 1; dx++)
+				{
+					int cx = x + dx;
+					if (cx >= 0 && cx < w && cy >= 0 && cy < h)
+					{
+						lm_bool valid = LM_FALSE;
+						for (int i = 0; i < c; i++)
+							valid |= image[(cy * w + cx) * c + i] > 0.0f;
+						if (valid)
+						{
+							for (int i = 0; i < c; i++)
+								color[i] += image[(cy * w + cx) * c + i];
+							n++;
+						}
+					}
+				}
+			}
+			for (int i = 0; i < c; i++)
+				outImage[(y * w + x) * c + i] = n ? color[i] / n : 0.0f;
+		}
+	}
+}
+
+void lmImageDownsample(const float *image, float *outImage, int w, int h, int c)
+{
+	assert(c > 0 && c <= 4);
+	for (int y = 0; y < h / 2; y++)
+	{
+		for (int x = 0; x < w / 2; x++)
+		{
+			int p0 = 2 * (y * w + x) * c;
+			int p1 = p0 + w * c;
+			int valid[2][2] = {0};
+			float sums[4] = {0};
+			for (int i = 0; i < c; i++)
+			{
+				valid[0][0] |= image[p0     + i] != 0.0f ? 1 : 0;
+				valid[0][1] |= image[p0 + c + i] != 0.0f ? 1 : 0;
+				valid[1][0] |= image[p1     + i] != 0.0f ? 1 : 0;
+				valid[1][1] |= image[p1 + c + i] != 0.0f ? 1 : 0;
+				sums[i] += image[p0 + i] + image[p0 + c + i] + image[p1 + i] + image[p1 + c + i];
+			}
+			int n = valid[0][0] + valid[0][1] + valid[1][0] + valid[1][1];
+			int p = (y * w / 2 + x) * c;
+			for (int i = 0; i < c; i++)
+				outImage[p + i] = n ? sums[i] / n : 0.0f;
+		}
+	}
+}
+
+void lmImageFtoUB(const float *image, unsigned char *outImage, int w, int h, int c, float max)
+{
+	assert(c > 0);
+	float scale = 255.0f / (max != 0.0f ? max : lmImageMax(image, w, h, c, LM_ALL_CHANNELS));
+	for (int i = 0; i < w * h * c; i++)
+		outImage[i] = (unsigned char)lm_minf(lm_maxf(image[i] * scale, 0.0f), 255.0f);
+}
+
+// TGA output helpers
+static void lm_swapRandBub(unsigned char *image, int w, int h, int c)
+{
+	assert(c >= 3);
+	for (int i = 0; i < w * h * c; i += c)
+		LM_SWAP(unsigned char, image[i], image[i + 2]);
+}
+
+lm_bool lmImageSaveTGAub(const char *filename, const unsigned char *image, int w, int h, int c)
+{
+	assert(c == 1 || c == 3 || c == 4);
+	lm_bool isGreyscale = c == 1;
+	lm_bool hasAlpha = c == 4;
+	unsigned char header[18] = {
+		0, 0, (unsigned char)(isGreyscale ? 3 : 2), 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		(unsigned char)(w & 0xff), (unsigned char)((w >> 8) & 0xff), (unsigned char)(h & 0xff), (unsigned char)((h >> 8) & 0xff),
+		(unsigned char)(8 * c), (unsigned char)(hasAlpha ? 8 : 0)
+	};
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+	FILE *file;
+	if (fopen_s(&file, filename, "wb") != 0) return LM_FALSE;
+#else
+	FILE *file = fopen(filename, "wb");
+	if (!file) return LM_FALSE;
+#endif
+	fwrite(header, 1, sizeof(header), file);
+
+	// we make sure to swap it back! trust me. :)
+	if (!isGreyscale)
+		lm_swapRandBub((unsigned char*)image, w, h, c);
+	fwrite(image, 1, w * h * c , file);
+	if (!isGreyscale)
+		lm_swapRandBub((unsigned char*)image, w, h, c);
+
+	fclose(file);
+	return LM_TRUE;
+}
+
+lm_bool lmImageSaveTGAf(const char *filename, const float *image, int w, int h, int c, float max)
+{
+	unsigned char *temp = (unsigned char*)LM_CALLOC(w * h * c, sizeof(unsigned char));
+	lmImageFtoUB(image, temp, w, h, c, max);
+	lm_bool success = lmImageSaveTGAub(filename, temp, w, h, c);
+	LM_FREE(temp);
+	return success;
+}
+
+#endif // LIGHTMAPPER_IMPLEMENTATION
diff --git a/demos/99-gui.c b/demos/99-gui.c
index 4933586..ddae7d4 100644
--- a/demos/99-gui.c
+++ b/demos/99-gui.c
@@ -45,13 +45,13 @@ int main() {
             gui_rect(vec4(40,240, 240, 20*skinned->scale), "vial");
             gui_rect(vec4(40,240, 160, 14*skinned->scale), "mp");
 
-            vec2 badge_size = gui_getskinsize("badge");
+            vec2 badge_size = gui_getskinsize("badge", 0);
             badge_size.x += 2; // padding
             gui_rect(vec4(60+badge_size.x*0,320, 1, 1), "badge");
             gui_rect(vec4(60+badge_size.x*1,320, 1, 1), "badge");
             gui_rect(vec4(60+badge_size.x*2,320, 1, 1), "badge_empty");
 
-            vec2 slider_size = gui_getskinsize("slider");
+            vec2 slider_size = gui_getskinsize("slider", 0);
             gui_slider(vec4(60, 480, 80*skinned->scale, 1), 0, 0.0f, 15.0f, 1.0f, &testval);
             gui_slider_label(va(FONT_H1 "%.02f", testval2), vec4(60, 480+slider_size.y+10, 120*skinned->scale, 1), 0, -5.0f, 20.0f, 0.0f, &testval2);
         gui_panel_end();
diff --git a/demos/99-lmap.c b/demos/99-lmap.c
new file mode 100644
index 0000000..306c2d2
--- /dev/null
+++ b/demos/99-lmap.c
@@ -0,0 +1,407 @@
+#include "v4k.h"
+
+#define LIGHTMAPPER_IMPLEMENTATION
+// #define LM_DEBUG_INTERPOLATION
+#include "3rd_lightmapper.h"
+
+#define scene_t scene_t2
+
+typedef struct {
+    float p[3];
+    float t[2];
+} vertex_t;
+
+typedef struct
+{
+    GLuint program;
+    GLint u_lightmap;
+    GLint u_projection;
+    GLint u_view;
+
+    GLuint lightmap;
+    int w, h;
+
+    GLuint vao, vbo, ibo;
+    vertex_t *vertices;
+    unsigned short *indices;
+    unsigned int vertexCount, indexCount;
+} scene_t;
+
+static int initScene(scene_t *scene);
+static void drawScene(scene_t *scene, float *view, float *projection);
+static void destroyScene(scene_t *scene);
+
+static int bake(scene_t *scene)
+{
+    lm_context *ctx = lmCreate(
+        64,               // hemisphere resolution (power of two, max=512)
+        0.001f, 100.0f,   // zNear, zFar of hemisphere cameras
+        1.0f, 1.0f, 1.0f, // background color (white for ambient occlusion)
+        2, 0.01f,         // lightmap interpolation threshold (small differences are interpolated rather than sampled)
+                          // check debug_interpolation.tga for an overview of sampled (red) vs interpolated (green) pixels.
+        0.0f);            // modifier for camera-to-surface distance for hemisphere rendering.
+                          // tweak this to trade-off between interpolated normals quality and other artifacts (see declaration).
+
+    if (!ctx)
+    {
+        fprintf(stderr, "Error: Could not initialize lightmapper.\n");
+        return 0;
+    }
+
+    int w = scene->w, h = scene->h;
+    float *data = CALLOC(w * h * 4, sizeof(float));
+    for (int b = 0; b < 1; b++) {
+        memset(data, 0, w*h*4);
+        lmSetTargetLightmap(ctx, data, w, h, 4);
+
+        lmSetGeometry(ctx, NULL,                                                                 // no transformation in this example
+            LM_FLOAT, (unsigned char*)scene->vertices + offsetof(vertex_t, p), sizeof(vertex_t),
+            LM_NONE , NULL                                                   , 0               , // no interpolated normals in this example
+            LM_FLOAT, (unsigned char*)scene->vertices + offsetof(vertex_t, t), sizeof(vertex_t),
+            scene->indexCount, LM_UNSIGNED_SHORT, scene->indices);
+
+        glDisable(GL_BLEND);
+
+        int vp[4];
+        float view[16], projection[16];
+        double lastUpdateTime = 0.0;
+        while (lmBegin(ctx, vp, view, projection))
+        {
+            // render to lightmapper framebuffer
+            glViewport(vp[0], vp[1], vp[2], vp[3]);
+            drawScene(scene, view, projection);
+
+            // display progress every second (printf is expensive)
+            double time = time_ms() / 1000.0;
+            if (time - lastUpdateTime > 1.0)
+            {
+                lastUpdateTime = time;
+                printf("\r%6.2f%%", lmProgress(ctx) * 100.0f);
+                fflush(stdout);
+            }
+
+            lmEnd(ctx);
+    //      window_swap();
+        }
+        printf("\rFinished baking %d triangles.\n", scene->indexCount / 3);
+    }
+
+    lmDestroy(ctx);
+
+    // postprocess texture
+    float *temp = CALLOC(w * h * 4, sizeof(float));
+    for (int i = 0; i < 16; i++)
+    {
+        lmImageDilate(data, temp, w, h, 4);
+        lmImageDilate(temp, data, w, h, 4);
+    }
+    lmImageSmooth(data, temp, w, h, 4);
+    lmImageDilate(temp, data, w, h, 4);
+    lmImagePower(data, w, h, 4, 1.0f / 2.2f, 0x7); // gamma correct color channels
+    FREE(temp);
+
+    // save result to a file
+    if (lmImageSaveTGAf("result.tga", data, w, h, 4, 1.0f))
+        printf("Saved result.tga\n");
+
+    // upload result
+    glBindTexture(GL_TEXTURE_2D, scene->lightmap);
+    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, w, h, 0, GL_RGBA, GL_FLOAT, data);
+    FREE(data);
+
+    return 1;
+}
+
+static void fpsCameraViewMatrix(float *view);
+
+static void mainLoop(scene_t *scene)
+{
+    glViewport(0, 0, window_width(), window_height());
+
+    // camera for glfw window
+    float view[16], projection[16];
+    fpsCameraViewMatrix(view);
+    perspective44(projection, 45.0f, window_aspect(), 0.01f, 100.0f);
+
+    // draw to screen with a blueish sky
+    glClearColor(0.6f, 0.8f, 1.0f, 1.0f);
+    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+    drawScene(scene, view, projection);
+}
+
+int main()
+{
+    window_create(0.5, WINDOW_VSYNC_DISABLED);
+
+    scene_t scene = {0};
+    if (!initScene(&scene))
+    {
+        fprintf(stderr, "Could not initialize scene.\n");
+        return EXIT_FAILURE;
+    }
+
+    window_title("AO Baking demo");
+
+    while (window_swap())
+    {
+        mainLoop(&scene);
+
+        if( ui_panel("Lightmapper", PANEL_OPEN) ) {
+            ui_label2("Freecam", "Mouse + W/A/S/D/E/Q keys");
+            ui_label("Warning " ICON_MD_WARNING "@This will take a few seconds and bake a lightmap illuminated by: The mesh itself (initially black) + A white sky (1.0f, 1.0f, 1.0f)");
+            if( ui_button("Bake 1 light bounce") ) {
+                bake(&scene);
+            }
+            ui_panel_end();
+        }
+    }
+
+    destroyScene(&scene);
+}
+
+// helpers ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+static int loadSimpleObjFile(const char *filename, vertex_t **vertices, unsigned int *vertexCount, unsigned short **indices, unsigned int *indexCount);
+
+static int initScene(scene_t *scene)
+{
+    // load mesh
+    if (!loadSimpleObjFile("demos/art/meshes/gazebo.obj", &scene->vertices, &scene->vertexCount, &scene->indices, &scene->indexCount))
+    {
+        fprintf(stderr, "Error loading obj file\n");
+        return 0;
+    }
+
+    glGenVertexArrays(1, &scene->vao);
+    glBindVertexArray(scene->vao);
+
+    glGenBuffers(1, &scene->vbo);
+    glBindBuffer(GL_ARRAY_BUFFER, scene->vbo);
+    glBufferData(GL_ARRAY_BUFFER, scene->vertexCount * sizeof(vertex_t), scene->vertices, GL_STATIC_DRAW);
+
+    glGenBuffers(1, &scene->ibo);
+    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, scene->ibo);
+    glBufferData(GL_ELEMENT_ARRAY_BUFFER, scene->indexCount * sizeof(unsigned short), scene->indices, GL_STATIC_DRAW);
+
+    glEnableVertexAttribArray(0);
+    glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE, sizeof(vertex_t), (void*)offsetof(vertex_t, p));
+    glEnableVertexAttribArray(1);
+    glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, sizeof(vertex_t), (void*)offsetof(vertex_t, t));
+
+    // create lightmap texture
+    scene->w = 654;
+    scene->h = 654;
+    glGenTextures(1, &scene->lightmap);
+    glBindTexture(GL_TEXTURE_2D, scene->lightmap);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+    unsigned char emissive[] = { 0, 0, 0, 255 };
+    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, 1, 1, 0, GL_RGBA, GL_UNSIGNED_BYTE, emissive);
+
+    // load shader
+    const char *vp =
+        "in vec3 a_position;\n"
+        "in vec2 a_texcoord;\n"
+        "uniform mat4 u_view;\n"
+        "uniform mat4 u_projection;\n"
+        "out vec2 v_texcoord;\n"
+
+        "void main()\n"
+        "{\n"
+        "gl_Position = u_projection * (u_view * vec4(a_position, 1.0));\n"
+        "v_texcoord = a_texcoord;\n"
+        "}\n";
+
+    const char *fp =
+        "in vec2 v_texcoord;\n"
+        "uniform sampler2D u_lightmap;\n"
+        "out vec4 o_color;\n"
+
+        "void main()\n"
+        "{\n"
+        "o_color = vec4(texture(u_lightmap, v_texcoord).rgb, gl_FrontFacing ? 1.0 : 0.0);\n"
+        "}\n";
+
+    scene->program = shader(vp, fp, "a_position,a_texcoord", "o_color", NULL);
+    if (!scene->program)
+    {
+        fprintf(stderr, "Error loading shader\n");
+        return 0;
+    }
+    scene->u_view = glGetUniformLocation(scene->program, "u_view");
+    scene->u_projection = glGetUniformLocation(scene->program, "u_projection");
+    scene->u_lightmap = glGetUniformLocation(scene->program, "u_lightmap");
+
+    return 1;
+}
+
+static void drawScene(scene_t *scene, float *view, float *projection)
+{
+    glEnable(GL_DEPTH_TEST);
+
+    glUseProgram(scene->program);
+    glUniform1i(scene->u_lightmap, 0);
+    glUniformMatrix4fv(scene->u_projection, 1, GL_FALSE, projection);
+    glUniformMatrix4fv(scene->u_view, 1, GL_FALSE, view);
+
+    glBindTexture(GL_TEXTURE_2D, scene->lightmap);
+
+    glBindVertexArray(scene->vao);
+    glDrawElements(GL_TRIANGLES, scene->indexCount, GL_UNSIGNED_SHORT, 0);
+}
+
+static void destroyScene(scene_t *scene)
+{
+    FREE(scene->vertices);
+    FREE(scene->indices);
+    glDeleteVertexArrays(1, &scene->vao);
+    glDeleteBuffers(1, &scene->vbo);
+    glDeleteBuffers(1, &scene->ibo);
+    glDeleteTextures(1, &scene->lightmap);
+    glDeleteProgram(scene->program);
+}
+
+static int loadSimpleObjFile(const char *filename, vertex_t **vertices, unsigned int *vertexCount, unsigned short **indices, unsigned int *indexCount)
+{
+    FILE *file = fopen(filename, "rt");
+    if (!file)
+        return 0;
+    char line[1024];
+
+    // first pass
+    unsigned int np = 0, nn = 0, nt = 0, nf = 0;
+    while (!feof(file))
+    {
+        fgets(line, 1024, file);
+        if (line[0] == '#') continue;
+        if (line[0] == 'v')
+        {
+            if (line[1] == ' ') { np++; continue; }
+            if (line[1] == 'n') { nn++; continue; }
+            if (line[1] == 't') { nt++; continue; }
+            assert(!"unknown vertex attribute");
+        }
+        if (line[0] == 'f') { nf++; continue; }
+        assert(!"unknown identifier");
+    }
+    assert(np && np == nn && np == nt && nf); // only supports obj files without separately indexed vertex attributes
+
+    // allocate memory
+    *vertexCount = np;
+    *vertices = CALLOC(np, sizeof(vertex_t));
+    *indexCount = nf * 3;
+    *indices = CALLOC(nf * 3, sizeof(unsigned short));
+
+    // second pass
+    fseek(file, 0, SEEK_SET);
+    unsigned int cp = 0, cn = 0, ct = 0, cf = 0;
+    while (!feof(file))
+    {
+        fgets(line, 1024, file);
+        if (line[0] == '#') continue;
+        if (line[0] == 'v')
+        {
+            if (line[1] == ' ') { float *p = (*vertices)[cp++].p; char *e1, *e2; p[0] = (float)strtod(line + 2, &e1); p[1] = (float)strtod(e1, &e2); p[2] = (float)strtod(e2, 0); continue; }
+            if (line[1] == 'n') { /*float *n = (*vertices)[cn++].n; char *e1, *e2; n[0] = (float)strtod(line + 3, &e1); n[1] = (float)strtod(e1, &e2); n[2] = (float)strtod(e2, 0);*/ continue; } // no normals needed
+            if (line[1] == 't') { float *t = (*vertices)[ct++].t; char *e1;      t[0] = (float)strtod(line + 3, &e1); t[1] = (float)strtod(e1, 0);                                continue; }
+            assert(!"unknown vertex attribute");
+        }
+        if (line[0] == 'f')
+        {
+            unsigned short *tri = (*indices) + cf;
+            cf += 3;
+            char *e1, *e2, *e3 = line + 1;
+            for (int i = 0; i < 3; i++)
+            {
+                unsigned long pi = strtoul(e3 + 1, &e1, 10);
+                assert(e1[0] == '/');
+                unsigned long ti = strtoul(e1 + 1, &e2, 10);
+                assert(e2[0] == '/');
+                unsigned long ni = strtoul(e2 + 1, &e3, 10);
+                assert(pi == ti && pi == ni);
+                tri[i] = (unsigned short)(pi - 1);
+            }
+            continue;
+        }
+        assert(!"unknown identifier");
+    }
+
+    fclose(file);
+    return 1;
+}
+
+
+
+static void fpsCameraViewMatrix(float *view)
+{
+    // initial camera config
+    static float position[] = { 0.0f, 0.3f, 1.5f };
+    static float rotation[] = { 0.0f, 0.0f };
+
+    // mouse look
+    static double lastMouse[] = { 0.0, 0.0 };
+    double mouse_coord[2];
+    mouse_coord[0] = input(MOUSE_X);
+    mouse_coord[1] = input(MOUSE_Y);
+    if (input(MOUSE_L))
+    {
+        rotation[0] += (float)(mouse_coord[1] - lastMouse[1]) * -0.2f;
+        rotation[1] += (float)(mouse_coord[0] - lastMouse[0]) * -0.2f;
+    }
+    lastMouse[0] = mouse_coord[0];
+    lastMouse[1] = mouse_coord[1];
+
+    float rotationY[16], rotationX[16], rotationYX[16];
+    rotation44(rotationX, rotation[0], 1.0f, 0.0f, 0.0f);
+    rotation44(rotationY, rotation[1], 0.0f, 1.0f, 0.0f);
+    multiply44x2(rotationYX, rotationY, rotationX);
+
+    // keyboard movement (WSADEQ)
+    float speed = input(KEY_SHIFT) ? 0.1f : 0.01f;
+    vec3 movement = {0};
+    if (input(KEY_W)) movement.z -= speed;
+    if (input(KEY_S)) movement.z += speed;
+    if (input(KEY_A)) movement.x -= speed;
+    if (input(KEY_D)) movement.x += speed;
+    if (input(KEY_E)) movement.y -= speed;
+    if (input(KEY_Q)) movement.y += speed;
+
+    vec3 worldMovement = transform344(rotationYX, movement);
+    position[0] += worldMovement.x;
+    position[1] += worldMovement.y;
+    position[2] += worldMovement.z;
+
+    // construct view matrix
+    float inverseRotation[16], inverseTranslation[16];
+    transpose44(inverseRotation, rotationYX);
+    translation44(inverseTranslation, -position[0], -position[1], -position[2]);
+    multiply44x2(view, inverseRotation, inverseTranslation); // = inverse(translation(position) * rotationYX);
+}
+
+#if 0
+
+#######################################################################
+     76702  17.93% rendered hemicubes integrated to lightmap texels.
+    179388  41.94% interpolated lightmap texels.
+    171626  40.13% wasted lightmap texels.
+
+            29.95% of used texels were rendered.
+#######################################################################
+Finished baking 731 triangles.
+Saved result.tga
+
+vs
+
+#######################################################################
+       124   0.05% rendered hemicubes integrated to lightmap texels.
+       201   0.08% interpolated lightmap texels.
+    261947  99.88% wasted lightmap texels.
+
+            38.15% of used texels were rendered.
+#######################################################################
+Finished baking 731 triangles.
+Saved result.tga
+
+#endif
+
diff --git a/demos/99-tls.c b/demos/99-tls.c
deleted file mode 100644
index 47fab1b..0000000
--- a/demos/99-tls.c
+++ /dev/null
@@ -1,15 +0,0 @@
-#include "v4k.h" // Minimal C sample
-
-__thread int foo=0; 
-
-int worker(void *ud) {
-    printf("thread:%d\n", foo);
-    return 0;
-}
-
-int main() {
-    foo=1;
-    printf("main:%d\n", foo);
-    thread_destroy(thread(worker, NULL));
-    return 0;
-}
\ No newline at end of file
diff --git a/engine/joint/v4k.h b/engine/joint/v4k.h
index c6eac10..48dc983 100644
--- a/engine/joint/v4k.h
+++ b/engine/joint/v4k.h
@@ -17571,6 +17571,13 @@ typedef struct anims_t {
 
 API anims_t animations(const char *pathfile, int flags);
 
+// -----------------------------------------------------------------------------
+// lightmapping utils
+
+typedef struct lightmap_t {
+    struct lm_context *ctx; // private
+} lightmap_t;
+
 // -----------------------------------------------------------------------------
 // skyboxes
 
@@ -18117,18 +18124,19 @@ API void     sprite_setanim(sprite_t *s, unsigned name);
 // game ui
 
 typedef struct guiskin_t {
-    void (*drawrect)(void* userdata, const char *skin, vec4 rect);
-    void (*getskinsize)(void* userdata, const char *skin, vec2 *size);
-    void (*getscissorrect)(void* userdata, const char *skin, vec4 rect, vec4 *dims);
-    bool (*ismouseinrect)(void* userdata, const char *skin, vec4 rect);
+    void (*drawrect)(void* userdata, const char *skin, const char *fallback, vec4 rect);
+    void (*getskinsize)(void* userdata, const char *skin, const char *fallback, vec2 *size);
+    void (*getscissorrect)(void* userdata, const char *skin, const char *fallback, vec4 rect, vec4 *dims);
+    bool (*ismouseinrect)(void* userdata, const char *skin, const char *fallback, vec4 rect);
     void (*free)(void* userdata);
     void *userdata;
 } guiskin_t;
 
 API void    gui_pushskin(guiskin_t skin);
 API void*       gui_userdata();
-API vec2        gui_getskinsize(const char *skin);
-API bool        gui_ismouseinrect(const char *skin, vec4 rect);
+API vec2        gui_getskinsize(const char *skin, const char *fallback);
+API bool        gui_ismouseinrect(const char *skin, const char *fallback, vec4 rect);
+API vec4        gui_getscissorrect(const char *skin, const char *fallback, vec4 rect);
 // --
 API void        gui_panel_id(int id, vec4 rect, const char *skin);
 API void            gui_rect_id(int id, vec4 rect, const char *skin);
@@ -32793,6 +32801,7 @@ int gladLoadGL( GLADloadfunc load) {
 #define BQ_PLATFORM_IMPLEMENTATION            // websocket
 #define BQ_WEBSOCKET_IMPLEMENTATION           // websocket
 #define XML_C                                 // xml
+#define LIGHTMAPPER_IMPLEMENTATION            // lightmapper
 #ifdef __APPLE__
 #define MA_NO_RUNTIME_LINKING                 // miniaudio osx
 #define _GLFW_COCOA                           // glfw osx
@@ -347855,6 +347864,1770 @@ void lt_tick(struct lua_State *L) {
 }
 #line 0
 
+#line 1 "3rd_lightmapper.h"
+/***********************************************************
+* A single header file OpenGL lightmapping library         *
+* https://github.com/ands/lightmapper                      *
+* no warranty implied | use at your own risk               *
+* author: Andreas Mantler (ands) | last change: 10.05.2018 *
+*                                                          *
+* License:                                                 *
+* This software is in the public domain.                   *
+* Where that dedication is not recognized,                 *
+* you are granted a perpetual, irrevocable license to copy *
+* and modify this file however you want.                   *
+***********************************************************/
+
+#ifndef LIGHTMAPPER_H
+#define LIGHTMAPPER_H
+
+#ifdef __cplusplus
+#define LM_DEFAULT_VALUE(value) = value
+#else
+#define LM_DEFAULT_VALUE(value)
+#endif
+
+#ifndef LM_CALLOC
+#define LM_CALLOC(count, size) calloc(count, size)
+#endif
+
+#ifndef LM_FREE
+#define LM_FREE(ptr) free(ptr)
+#endif
+
+typedef int lm_bool;
+#define LM_FALSE 0
+#define LM_TRUE  1
+
+typedef int lm_type;
+#define LM_NONE           0
+#define LM_UNSIGNED_BYTE  GL_UNSIGNED_BYTE
+#define LM_UNSIGNED_SHORT GL_UNSIGNED_SHORT
+#define LM_UNSIGNED_INT   GL_UNSIGNED_INT
+#define LM_FLOAT          GL_FLOAT
+
+typedef struct lm_context lm_context;
+
+// creates a lightmapper instance. it can be used to render multiple lightmaps.
+lm_context *lmCreate(
+	int hemisphereSize,                                                                                // hemisphereSize: resolution of the hemisphere renderings. must be a power of two! typical: 64.
+	float zNear, float zFar,                                                                           // zNear/zFar: hemisphere min/max draw distances.
+	float clearR, float clearG, float clearB,                                                          // clear color / background color / sky color.
+	int interpolationPasses, float interpolationThreshold,                                             // passes: hierarchical selective interpolation passes (0-8; initial step size = 2^passes).
+	                                                                                                   // threshold: error value below which lightmap pixels are interpolated instead of rendered.
+	                                                                                                   // use output image from LM_DEBUG_INTERPOLATION to determine a good value.
+	                                                                                                   // values around and below 0.01 are probably ok.
+	                                                                                                   // the lower the value, the more hemispheres are rendered -> slower, but possibly better quality.
+	float cameraToSurfaceDistanceModifier LM_DEFAULT_VALUE(0.0f));                                     // modifier for the height of the rendered hemispheres above the surface
+	                                                                                                   // -1.0f => stick to surface, 0.0f => minimum height for interpolated surface normals,
+	                                                                                                   // > 0.0f => improves gradients on surfaces with interpolated normals due to the flat surface horizon,
+	                                                                                                   // but may introduce other artifacts.
+
+// optional: set material characteristics by specifying cos(theta)-dependent weights for incoming light.
+typedef float (*lm_weight_func)(float cos_theta, void *userdata);
+void lmSetHemisphereWeights(lm_context *ctx, lm_weight_func f, void *userdata);                        // precalculates weights for incoming light depending on its angle. (default: all weights are 1.0f)
+
+// specify an output lightmap image buffer with w * h * c * sizeof(float) bytes of memory.
+void lmSetTargetLightmap(lm_context *ctx, float *outLightmap, int w, int h, int c);                    // output HDR lightmap (linear 32bit float channels; c: 1->Greyscale, 2->Greyscale+Alpha, 3->RGB, 4->RGBA).
+
+// set the geometry to map to the currently set target lightmap (set the target lightmap before calling this!).
+void lmSetGeometry(lm_context *ctx,
+	const float *transformationMatrix,                                                                 // 4x4 object-to-world transform for the geometry or NULL (no transformation).
+	lm_type positionsType, const void *positionsXYZ, int positionsStride,                              // triangle mesh in object space.
+	lm_type normalsType, const void *normalsXYZ, int normalsStride,                                    // optional normals for the mesh in object space (Use LM_NONE type in case you only need flat surfaces).
+	lm_type lightmapCoordsType, const void *lightmapCoordsUV, int lightmapCoordsStride,                // lightmap atlas texture coordinates for the mesh [0..1]x[0..1] (integer types are normalized to 0..1 range).
+	int count, lm_type indicesType LM_DEFAULT_VALUE(LM_NONE), const void *indices LM_DEFAULT_VALUE(0));// if mesh indices are used, count = number of indices else count = number of vertices.
+
+
+// as long as lmBegin returns true, the scene has to be rendered with the
+// returned camera and view parameters to the currently bound framebuffer.
+// if lmBegin returns true, it must be followed by lmEnd after rendering!
+lm_bool lmBegin(lm_context *ctx,
+	int* outViewport4,                                                                                 // output of the current viewport: { x, y, w, h }. use these to call glViewport()!
+	float* outView4x4,                                                                                 // output of the current camera view matrix.
+	float* outProjection4x4);                                                                          // output of the current camera projection matrix.
+
+float lmProgress(lm_context *ctx);                                                                     // should only be called between lmBegin/lmEnd!
+                                                                                                       // provides the light mapping progress as a value increasing from 0.0 to 1.0.
+
+void lmEnd(lm_context *ctx);
+
+// destroys the lightmapper instance. should be called to free resources.
+void lmDestroy(lm_context *ctx);
+
+
+// image based post processing (c is the number of color channels in the image, m a channel mask for the operation)
+#define LM_ALL_CHANNELS 0x0f
+float lmImageMin(const float *image, int w, int h, int c, int m LM_DEFAULT_VALUE(LM_ALL_CHANNELS));                    // find the minimum value (across the specified channels)
+float lmImageMax(const float *image, int w, int h, int c, int m LM_DEFAULT_VALUE(LM_ALL_CHANNELS));                    // find the maximum value (across the specified channels)
+void lmImageAdd(float *image, int w, int h, int c, float value, int m LM_DEFAULT_VALUE(LM_ALL_CHANNELS));              // in-place add to the specified channels
+void lmImageScale(float *image, int w, int h, int c, float factor, int m LM_DEFAULT_VALUE(LM_ALL_CHANNELS));           // in-place scaling of the specified channels
+void lmImagePower(float *image, int w, int h, int c, float exponent, int m LM_DEFAULT_VALUE(LM_ALL_CHANNELS));         // in-place powf(v, exponent) of the specified channels (for gamma)
+void lmImageDilate(const float *image, float *outImage, int w, int h, int c);                                          // widen the populated non-zero areas by 1 pixel.
+void lmImageSmooth(const float *image, float *outImage, int w, int h, int c);                                          // simple box filter on only the non-zero values.
+void lmImageDownsample(const float *image, float *outImage, int w, int h, int c);                                      // downsamples [0..w]x[0..h] to [0..w/2]x[0..h/2] by avereging only the non-zero values
+void lmImageFtoUB(const float *image, unsigned char *outImage, int w, int h, int c, float max LM_DEFAULT_VALUE(0.0f)); // casts a floating point image to an 8bit/channel image
+
+// TGA file output helpers
+lm_bool lmImageSaveTGAub(const char *filename, const unsigned char *image, int w, int h, int c);
+lm_bool lmImageSaveTGAf(const char *filename, const float *image, int w, int h, int c, float max LM_DEFAULT_VALUE(0.0f));
+
+#endif
+////////////////////// END OF HEADER //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+#ifdef LIGHTMAPPER_IMPLEMENTATION
+#undef LIGHTMAPPER_IMPLEMENTATION
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <float.h>
+#include <assert.h>
+#include <limits.h>
+
+#define LM_SWAP(type, a, b) { type tmp = (a); (a) = (b); (b) = tmp; }
+
+#if defined(_MSC_VER) && !defined(__cplusplus)
+#define inline __inline
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER <= 1700)
+static inline lm_bool lm_finite(float a) { return _finite(a); }
+#else
+static inline lm_bool lm_finite(float a) { return isfinite(a); }
+#endif
+
+static inline int      lm_mini      (int     a, int     b) { return a < b ? a : b; }
+static inline int      lm_maxi      (int     a, int     b) { return a > b ? a : b; }
+static inline int      lm_absi      (int     a           ) { return a < 0 ? -a : a; }
+static inline float    lm_minf      (float   a, float   b) { return a < b ? a : b; }
+static inline float    lm_maxf      (float   a, float   b) { return a > b ? a : b; }
+static inline float    lm_absf      (float   a           ) { return a < 0.0f ? -a : a; }
+static inline float    lm_pmodf     (float   a, float   b) { return (a < 0.0f ? 1.0f : 0.0f) + (float)fmod(a, b); } // positive mod
+
+typedef struct lm_ivec2 { int x, y; } lm_ivec2;
+static inline lm_ivec2 lm_i2        (int     x, int     y) { lm_ivec2 v = { x, y }; return v; }
+
+typedef struct lm_vec2 { float x, y; } lm_vec2;
+static inline lm_vec2  lm_v2i       (int     x, int     y) { lm_vec2 v = { (float)x, (float)y }; return v; }
+static inline lm_vec2  lm_v2        (float   x, float   y) { lm_vec2 v = { x, y }; return v; }
+static inline lm_vec2  lm_negate2   (lm_vec2 a           ) { return lm_v2(-a.x, -a.y); }
+static inline lm_vec2  lm_add2      (lm_vec2 a, lm_vec2 b) { return lm_v2(a.x + b.x, a.y + b.y); }
+static inline lm_vec2  lm_sub2      (lm_vec2 a, lm_vec2 b) { return lm_v2(a.x - b.x, a.y - b.y); }
+static inline lm_vec2  lm_mul2      (lm_vec2 a, lm_vec2 b) { return lm_v2(a.x * b.x, a.y * b.y); }
+static inline lm_vec2  lm_scale2    (lm_vec2 a, float   b) { return lm_v2(a.x * b, a.y * b); }
+static inline lm_vec2  lm_div2      (lm_vec2 a, float   b) { return lm_scale2(a, 1.0f / b); }
+static inline lm_vec2  lm_pmod2     (lm_vec2 a, float   b) { return lm_v2(lm_pmodf(a.x, b), lm_pmodf(a.y, b)); }
+static inline lm_vec2  lm_min2      (lm_vec2 a, lm_vec2 b) { return lm_v2(lm_minf(a.x, b.x), lm_minf(a.y, b.y)); }
+static inline lm_vec2  lm_max2      (lm_vec2 a, lm_vec2 b) { return lm_v2(lm_maxf(a.x, b.x), lm_maxf(a.y, b.y)); }
+static inline lm_vec2  lm_abs2      (lm_vec2 a           ) { return lm_v2(lm_absf(a.x), lm_absf(a.y)); }
+static inline lm_vec2  lm_floor2    (lm_vec2 a           ) { return lm_v2(floorf(a.x), floorf(a.y)); }
+static inline lm_vec2  lm_ceil2     (lm_vec2 a           ) { return lm_v2(ceilf (a.x), ceilf (a.y)); }
+static inline float    lm_dot2      (lm_vec2 a, lm_vec2 b) { return a.x * b.x + a.y * b.y; }
+static inline float    lm_cross2    (lm_vec2 a, lm_vec2 b) { return a.x * b.y - a.y * b.x; } // pseudo cross product
+static inline float    lm_length2sq (lm_vec2 a           ) { return a.x * a.x + a.y * a.y; }
+static inline float    lm_length2   (lm_vec2 a           ) { return sqrtf(lm_length2sq(a)); }
+static inline lm_vec2  lm_normalize2(lm_vec2 a           ) { return lm_div2(a, lm_length2(a)); }
+static inline lm_bool  lm_finite2   (lm_vec2 a           ) { return lm_finite(a.x) && lm_finite(a.y); }
+
+typedef struct lm_vec3 { float x, y, z; } lm_vec3;
+static inline lm_vec3  lm_v3        (float   x, float   y, float   z) { lm_vec3 v = { x, y, z }; return v; }
+static inline lm_vec3  lm_negate3   (lm_vec3 a           ) { return lm_v3(-a.x, -a.y, -a.z); }
+static inline lm_vec3  lm_add3      (lm_vec3 a, lm_vec3 b) { return lm_v3(a.x + b.x, a.y + b.y, a.z + b.z); }
+static inline lm_vec3  lm_sub3      (lm_vec3 a, lm_vec3 b) { return lm_v3(a.x - b.x, a.y - b.y, a.z - b.z); }
+static inline lm_vec3  lm_mul3      (lm_vec3 a, lm_vec3 b) { return lm_v3(a.x * b.x, a.y * b.y, a.z * b.z); }
+static inline lm_vec3  lm_scale3    (lm_vec3 a, float   b) { return lm_v3(a.x * b, a.y * b, a.z * b); }
+static inline lm_vec3  lm_div3      (lm_vec3 a, float   b) { return lm_scale3(a, 1.0f / b); }
+static inline lm_vec3  lm_pmod3     (lm_vec3 a, float   b) { return lm_v3(lm_pmodf(a.x, b), lm_pmodf(a.y, b), lm_pmodf(a.z, b)); }
+static inline lm_vec3  lm_min3      (lm_vec3 a, lm_vec3 b) { return lm_v3(lm_minf(a.x, b.x), lm_minf(a.y, b.y), lm_minf(a.z, b.z)); }
+static inline lm_vec3  lm_max3      (lm_vec3 a, lm_vec3 b) { return lm_v3(lm_maxf(a.x, b.x), lm_maxf(a.y, b.y), lm_maxf(a.z, b.z)); }
+static inline lm_vec3  lm_abs3      (lm_vec3 a           ) { return lm_v3(lm_absf(a.x), lm_absf(a.y), lm_absf(a.z)); }
+static inline lm_vec3  lm_floor3    (lm_vec3 a           ) { return lm_v3(floorf(a.x), floorf(a.y), floorf(a.z)); }
+static inline lm_vec3  lm_ceil3     (lm_vec3 a           ) { return lm_v3(ceilf (a.x), ceilf (a.y), ceilf (a.z)); }
+static inline float    lm_dot3      (lm_vec3 a, lm_vec3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; }
+static inline lm_vec3  lm_cross3    (lm_vec3 a, lm_vec3 b) { return lm_v3(a.y * b.z - b.y * a.z, a.z * b.x - b.z * a.x, a.x * b.y - b.x * a.y); }
+static inline float    lm_length3sq (lm_vec3 a           ) { return a.x * a.x + a.y * a.y + a.z * a.z; }
+static inline float    lm_length3   (lm_vec3 a           ) { return sqrtf(lm_length3sq(a)); }
+static inline lm_vec3  lm_normalize3(lm_vec3 a           ) { return lm_div3(a, lm_length3(a)); }
+static inline lm_bool  lm_finite3   (lm_vec3 a           ) { return lm_finite(a.x) && lm_finite(a.y) && lm_finite(a.z); }
+
+static lm_vec2 lm_toBarycentric(lm_vec2 p1, lm_vec2 p2, lm_vec2 p3, lm_vec2 p)
+{
+	// http://www.blackpawn.com/texts/pointinpoly/
+	// Compute vectors
+	lm_vec2 v0 = lm_sub2(p3, p1);
+	lm_vec2 v1 = lm_sub2(p2, p1);
+	lm_vec2 v2 = lm_sub2(p, p1);
+	// Compute dot products
+	float dot00 = lm_dot2(v0, v0);
+	float dot01 = lm_dot2(v0, v1);
+	float dot02 = lm_dot2(v0, v2);
+	float dot11 = lm_dot2(v1, v1);
+	float dot12 = lm_dot2(v1, v2);
+	// Compute barycentric coordinates
+	float invDenom = 1.0f / (dot00 * dot11 - dot01 * dot01);
+	float u = (dot11 * dot02 - dot01 * dot12) * invDenom;
+	float v = (dot00 * dot12 - dot01 * dot02) * invDenom;
+	return lm_v2(u, v);
+}
+
+static inline int lm_leftOf(lm_vec2 a, lm_vec2 b, lm_vec2 c)
+{
+	float x = lm_cross2(lm_sub2(b, a), lm_sub2(c, b));
+	return x < 0 ? -1 : x > 0;
+}
+
+static lm_bool lm_lineIntersection(lm_vec2 x0, lm_vec2 x1, lm_vec2 y0, lm_vec2 y1, lm_vec2* res)
+{
+	lm_vec2 dx = lm_sub2(x1, x0);
+	lm_vec2 dy = lm_sub2(y1, y0);
+	lm_vec2 d = lm_sub2(x0, y0);
+	float dyx = lm_cross2(dy, dx);
+	if (dyx == 0.0f)
+		return LM_FALSE;
+	dyx = lm_cross2(d, dx) / dyx;
+	if (dyx <= 0 || dyx >= 1)
+		return LM_FALSE;
+	res->x = y0.x + dyx * dy.x;
+	res->y = y0.y + dyx * dy.y;
+	return LM_TRUE;
+}
+
+// this modifies the poly array! the poly array must be big enough to hold the result!
+// res must be big enough to hold the result!
+static int lm_convexClip(lm_vec2 *poly, int nPoly, const lm_vec2 *clip, int nClip, lm_vec2 *res)
+{
+	int nRes = nPoly;
+	int dir = lm_leftOf(clip[0], clip[1], clip[2]);
+	for (int i = 0, j = nClip - 1; i < nClip && nRes; j = i++)
+	{
+		if (i != 0)
+			for (nPoly = 0; nPoly < nRes; nPoly++)
+				poly[nPoly] = res[nPoly];
+		nRes = 0;
+		lm_vec2 v0 = poly[nPoly - 1];
+		int side0 = lm_leftOf(clip[j], clip[i], v0);
+		if (side0 != -dir)
+			res[nRes++] = v0;
+		for (int k = 0; k < nPoly; k++)
+		{
+			lm_vec2 v1 = poly[k], x;
+			int side1 = lm_leftOf(clip[j], clip[i], v1);
+			if (side0 + side1 == 0 && side0 && lm_lineIntersection(clip[j], clip[i], v0, v1, &x))
+				res[nRes++] = x;
+			if (k == nPoly - 1)
+				break;
+			if (side1 != -dir)
+				res[nRes++] = v1;
+			v0 = v1;
+			side0 = side1;
+		}
+	}
+
+	return nRes;
+}
+
+struct lm_context
+{
+	struct
+	{
+		const float *modelMatrix;
+		float normalMatrix[9];
+
+		const unsigned char *positions;
+		lm_type positionsType;
+		int positionsStride;
+		const unsigned char *normals;
+		lm_type normalsType;
+		int normalsStride;
+		const unsigned char *uvs;
+		lm_type uvsType;
+		int uvsStride;
+		const unsigned char *indices;
+		lm_type indicesType;
+		unsigned int count;
+	} mesh;
+
+	struct
+	{
+		int pass;
+		int passCount;
+
+		struct
+		{
+			unsigned int baseIndex;
+			lm_vec3 p[3];
+			lm_vec3 n[3];
+			lm_vec2 uv[3];
+		} triangle;
+
+		struct
+		{
+			int minx, miny;
+			int maxx, maxy;
+			int x, y;
+		} rasterizer;
+
+		struct
+		{
+			lm_vec3 position;
+			lm_vec3 direction;
+			lm_vec3 up;
+		} sample;
+
+		struct
+		{
+			int side;
+		} hemisphere;
+	} meshPosition;
+
+	struct
+	{
+		int width;
+		int height;
+		int channels;
+		float *data;
+
+#ifdef LM_DEBUG_INTERPOLATION
+		unsigned char *debug;
+#endif
+	} lightmap;
+
+	struct
+	{
+		unsigned int size;
+		float zNear, zFar;
+		float cameraToSurfaceDistanceModifier;
+		struct { float r, g, b; } clearColor;
+
+		unsigned int fbHemiCountX;
+		unsigned int fbHemiCountY;
+		unsigned int fbHemiIndex;
+		lm_ivec2 *fbHemiToLightmapLocation;
+		GLuint fbTexture[2];
+		GLuint fb[2];
+		GLuint fbDepth;
+		GLuint vao;
+		struct
+		{
+			GLuint programID;
+			GLuint hemispheresTextureID;
+			GLuint weightsTextureID;
+			GLuint weightsTexture;
+		} firstPass;
+		struct
+		{
+			GLuint programID;
+			GLuint hemispheresTextureID;
+		} downsamplePass;
+		struct
+		{
+			GLuint texture;
+			lm_ivec2 writePosition;
+			lm_ivec2 *toLightmapLocation;
+		} storage;
+	} hemisphere;
+
+	float interpolationThreshold;
+};
+
+// pass order of one 4x4 interpolation patch for two interpolation steps (and the next neighbors right of/below it)
+// 0 4 1 4 0
+// 5 6 5 6 5
+// 2 4 3 4 2
+// 5 6 5 6 5
+// 0 4 1 4 0
+
+static unsigned int lm_passStepSize(lm_context *ctx)
+{
+	unsigned int shift = ctx->meshPosition.passCount / 3 - (ctx->meshPosition.pass - 1) / 3;
+	unsigned int step = (1 << shift);
+	assert(step > 0);
+	return step;
+}
+
+static unsigned int lm_passOffsetX(lm_context *ctx)
+{
+	if (!ctx->meshPosition.pass)
+		return 0;
+	int passType = (ctx->meshPosition.pass - 1) % 3;
+	unsigned int halfStep = lm_passStepSize(ctx) >> 1;
+	return passType != 1 ? halfStep : 0;
+}
+
+static unsigned int lm_passOffsetY(lm_context *ctx)
+{
+	if (!ctx->meshPosition.pass)
+		return 0;
+	int passType = (ctx->meshPosition.pass - 1) % 3;
+	unsigned int halfStep = lm_passStepSize(ctx) >> 1;
+	return passType != 0 ? halfStep : 0;
+}
+
+static lm_bool lm_hasConservativeTriangleRasterizerFinished(lm_context *ctx)
+{
+	return ctx->meshPosition.rasterizer.y >= ctx->meshPosition.rasterizer.maxy;
+}
+
+static void lm_moveToNextPotentialConservativeTriangleRasterizerPosition(lm_context *ctx)
+{
+	unsigned int step = lm_passStepSize(ctx);
+	ctx->meshPosition.rasterizer.x += step;
+	while (ctx->meshPosition.rasterizer.x >= ctx->meshPosition.rasterizer.maxx)
+	{
+		ctx->meshPosition.rasterizer.x = ctx->meshPosition.rasterizer.minx + lm_passOffsetX(ctx);
+		ctx->meshPosition.rasterizer.y += step;
+		if (lm_hasConservativeTriangleRasterizerFinished(ctx))
+			break;
+	}
+}
+
+static float *lm_getLightmapPixel(lm_context *ctx, int x, int y)
+{
+	assert(x >= 0 && x < ctx->lightmap.width && y >= 0 && y < ctx->lightmap.height);
+	return ctx->lightmap.data + (y * ctx->lightmap.width + x) * ctx->lightmap.channels;
+}
+
+static void lm_setLightmapPixel(lm_context *ctx, int x, int y, float *in)
+{
+	assert(x >= 0 && x < ctx->lightmap.width && y >= 0 && y < ctx->lightmap.height);
+	float *p = ctx->lightmap.data + (y * ctx->lightmap.width + x) * ctx->lightmap.channels;
+	for (int j = 0; j < ctx->lightmap.channels; j++)
+		*p++ = *in++;
+}
+
+#define lm_baseAngle 0.1f
+static const float lm_baseAngles[3][3] = {
+	{ lm_baseAngle, lm_baseAngle + 1.0f / 3.0f, lm_baseAngle + 2.0f / 3.0f },
+	{ lm_baseAngle + 1.0f / 3.0f, lm_baseAngle + 2.0f / 3.0f, lm_baseAngle },
+	{ lm_baseAngle + 2.0f / 3.0f, lm_baseAngle, lm_baseAngle + 1.0f / 3.0f }
+};
+
+static lm_bool lm_trySamplingConservativeTriangleRasterizerPosition(lm_context *ctx)
+{
+	if (lm_hasConservativeTriangleRasterizerFinished(ctx))
+		return LM_FALSE;
+
+	// check if lightmap pixel was already set
+	float *pixelValue = lm_getLightmapPixel(ctx, ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y);
+	for (int j = 0; j < ctx->lightmap.channels; j++)
+		if (pixelValue[j] != 0.0f)
+			return LM_FALSE;
+
+	// try calculating centroid by clipping the pixel against the triangle
+	lm_vec2 pixel[16];
+	pixel[0] = lm_v2i(ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y);
+	pixel[1] = lm_v2i(ctx->meshPosition.rasterizer.x + 1, ctx->meshPosition.rasterizer.y);
+	pixel[2] = lm_v2i(ctx->meshPosition.rasterizer.x + 1, ctx->meshPosition.rasterizer.y + 1);
+	pixel[3] = lm_v2i(ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y + 1);
+
+	lm_vec2 res[16];
+	int nRes = lm_convexClip(pixel, 4, ctx->meshPosition.triangle.uv, 3, res);
+	if (nRes == 0)
+		return LM_FALSE; // nothing left
+
+	// calculate centroid position and area
+	lm_vec2 centroid = res[0];
+	float area = res[nRes - 1].x * res[0].y - res[nRes - 1].y * res[0].x;
+	for (int i = 1; i < nRes; i++)
+	{
+		centroid = lm_add2(centroid, res[i]);
+		area += res[i - 1].x * res[i].y - res[i - 1].y * res[i].x;
+	}
+	centroid = lm_div2(centroid, (float)nRes);
+	area = lm_absf(area / 2.0f);
+
+	if (area <= 0.0f)
+		return LM_FALSE; // no area left
+
+	// calculate barycentric coords
+	lm_vec2 uv = lm_toBarycentric(
+		ctx->meshPosition.triangle.uv[0],
+		ctx->meshPosition.triangle.uv[1],
+		ctx->meshPosition.triangle.uv[2],
+		centroid);
+
+	if (!lm_finite2(uv))
+		return LM_FALSE; // degenerate
+
+	// try to interpolate color from neighbors:
+	if (ctx->meshPosition.pass > 0)
+	{
+		float *neighbors[4];
+		int neighborCount = 0;
+		int neighborsExpected = 0;
+		int d = (int)lm_passStepSize(ctx) / 2;
+		int dirs = ((ctx->meshPosition.pass - 1) % 3) + 1;
+		if (dirs & 1) // check x-neighbors with distance d
+		{
+			neighborsExpected += 2;
+			if (ctx->meshPosition.rasterizer.x - d >= ctx->meshPosition.rasterizer.minx &&
+				ctx->meshPosition.rasterizer.x + d <= ctx->meshPosition.rasterizer.maxx)
+			{
+				neighbors[neighborCount++] = lm_getLightmapPixel(ctx, ctx->meshPosition.rasterizer.x - d, ctx->meshPosition.rasterizer.y);
+				neighbors[neighborCount++] = lm_getLightmapPixel(ctx, ctx->meshPosition.rasterizer.x + d, ctx->meshPosition.rasterizer.y);
+			}
+		}
+		if (dirs & 2) // check y-neighbors with distance d
+		{
+			neighborsExpected += 2;
+			if (ctx->meshPosition.rasterizer.y - d >= ctx->meshPosition.rasterizer.miny &&
+				ctx->meshPosition.rasterizer.y + d <= ctx->meshPosition.rasterizer.maxy)
+			{
+				neighbors[neighborCount++] = lm_getLightmapPixel(ctx, ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y - d);
+				neighbors[neighborCount++] = lm_getLightmapPixel(ctx, ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y + d);
+			}
+		}
+		if (neighborCount == neighborsExpected) // are all interpolation neighbors available?
+		{
+			// calculate average neighbor pixel value
+			float avg[4] = { 0 };
+			for (int i = 0; i < neighborCount; i++)
+				for (int j = 0; j < ctx->lightmap.channels; j++)
+					avg[j] += neighbors[i][j];
+			float ni = 1.0f / neighborCount;
+			for (int j = 0; j < ctx->lightmap.channels; j++)
+				avg[j] *= ni;
+
+			// check if error from average pixel to neighbors is above the interpolation threshold
+			lm_bool interpolate = LM_TRUE;
+			for (int i = 0; i < neighborCount; i++)
+			{
+				lm_bool zero = LM_TRUE;
+				for (int j = 0; j < ctx->lightmap.channels; j++)
+				{
+					if (neighbors[i][j] != 0.0f)
+						zero = LM_FALSE;
+					if (fabs(neighbors[i][j] - avg[j]) > ctx->interpolationThreshold)
+						interpolate = LM_FALSE;
+				}
+				if (zero)
+					interpolate = LM_FALSE;
+				if (!interpolate)
+					break;
+			}
+
+			// set interpolated value and return if interpolation is acceptable
+			if (interpolate)
+			{
+				lm_setLightmapPixel(ctx, ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y, avg);
+#ifdef LM_DEBUG_INTERPOLATION
+				// set interpolated pixel to green in debug output
+				ctx->lightmap.debug[(ctx->meshPosition.rasterizer.y * ctx->lightmap.width + ctx->meshPosition.rasterizer.x) * 3 + 1] = 255;
+#endif
+				return LM_FALSE;
+			}
+		}
+	}
+
+	// could not interpolate. must render a hemisphere.
+	// calculate 3D sample position and orientation
+	lm_vec3 p0 = ctx->meshPosition.triangle.p[0];
+	lm_vec3 p1 = ctx->meshPosition.triangle.p[1];
+	lm_vec3 p2 = ctx->meshPosition.triangle.p[2];
+	lm_vec3 v1 = lm_sub3(p1, p0);
+	lm_vec3 v2 = lm_sub3(p2, p0);
+	ctx->meshPosition.sample.position = lm_add3(p0, lm_add3(lm_scale3(v2, uv.x), lm_scale3(v1, uv.y)));
+
+	lm_vec3 n0 = ctx->meshPosition.triangle.n[0];
+	lm_vec3 n1 = ctx->meshPosition.triangle.n[1];
+	lm_vec3 n2 = ctx->meshPosition.triangle.n[2];
+	lm_vec3 nv1 = lm_sub3(n1, n0);
+	lm_vec3 nv2 = lm_sub3(n2, n0);
+	ctx->meshPosition.sample.direction = lm_normalize3(lm_add3(n0, lm_add3(lm_scale3(nv2, uv.x), lm_scale3(nv1, uv.y))));
+	ctx->meshPosition.sample.direction = lm_normalize3(ctx->meshPosition.sample.direction);
+	float cameraToSurfaceDistance = (1.0f + ctx->hemisphere.cameraToSurfaceDistanceModifier) * ctx->hemisphere.zNear * sqrtf(2.0f);
+	ctx->meshPosition.sample.position = lm_add3(ctx->meshPosition.sample.position, lm_scale3(ctx->meshPosition.sample.direction, cameraToSurfaceDistance));
+
+	if (!lm_finite3(ctx->meshPosition.sample.position) ||
+		!lm_finite3(ctx->meshPosition.sample.direction) ||
+		lm_length3sq(ctx->meshPosition.sample.direction) < 0.5f) // don't allow 0.0f. should always be ~1.0f
+		return LM_FALSE;
+
+	lm_vec3 up = lm_v3(0.0f, 1.0f, 0.0f);
+	if (lm_absf(lm_dot3(up, ctx->meshPosition.sample.direction)) > 0.8f)
+		up = lm_v3(0.0f, 0.0f, 1.0f);
+
+#if 0
+	// triangle-consistent up vector
+	ctx->meshPosition.sample.up = lm_normalize3(lm_cross3(up, ctx->meshPosition.sample.direction));
+	return LM_TRUE;
+#else
+	// "randomized" rotation with pattern
+	lm_vec3 side = lm_normalize3(lm_cross3(up, ctx->meshPosition.sample.direction));
+	up = lm_normalize3(lm_cross3(side, ctx->meshPosition.sample.direction));
+	int rx = ctx->meshPosition.rasterizer.x % 3;
+	int ry = ctx->meshPosition.rasterizer.y % 3;
+	static const float lm_pi = 3.14159265358979f;
+	float phi = 2.0f * lm_pi * lm_baseAngles[ry][rx] + 0.1f * ((float)rand() / (float)RAND_MAX);
+	ctx->meshPosition.sample.up = lm_normalize3(lm_add3(lm_scale3(side, cosf(phi)), lm_scale3(up, sinf(phi))));
+	return LM_TRUE;
+#endif
+}
+
+// returns true if a sampling position was found and
+// false if we finished rasterizing the current triangle
+static lm_bool lm_findFirstConservativeTriangleRasterizerPosition(lm_context *ctx)
+{
+	while (!lm_trySamplingConservativeTriangleRasterizerPosition(ctx))
+	{
+		lm_moveToNextPotentialConservativeTriangleRasterizerPosition(ctx);
+		if (lm_hasConservativeTriangleRasterizerFinished(ctx))
+			return LM_FALSE;
+	}
+	return LM_TRUE;
+}
+
+static lm_bool lm_findNextConservativeTriangleRasterizerPosition(lm_context *ctx)
+{
+	lm_moveToNextPotentialConservativeTriangleRasterizerPosition(ctx);
+	return lm_findFirstConservativeTriangleRasterizerPosition(ctx);
+}
+
+static void lm_integrateHemisphereBatch(lm_context *ctx)
+{
+	if (!ctx->hemisphere.fbHemiIndex)
+		return; // nothing to do
+
+	glDisable(GL_DEPTH_TEST);
+	glBindVertexArray(ctx->hemisphere.vao);
+
+	int fbRead = 0;
+	int fbWrite = 1;
+
+	// weighted downsampling pass
+	int outHemiSize = ctx->hemisphere.size / 2;
+	glBindFramebuffer(GL_FRAMEBUFFER, ctx->hemisphere.fb[fbWrite]);
+	glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, ctx->hemisphere.fbTexture[fbWrite], 0);
+	glViewport(0, 0, outHemiSize * ctx->hemisphere.fbHemiCountX, outHemiSize * ctx->hemisphere.fbHemiCountY);
+	glUseProgram(ctx->hemisphere.firstPass.programID);
+	glUniform1i(ctx->hemisphere.firstPass.hemispheresTextureID, 0);
+	glActiveTexture(GL_TEXTURE0);
+	glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.fbTexture[fbRead]);
+	glUniform1i(ctx->hemisphere.firstPass.weightsTextureID, 1);
+	glActiveTexture(GL_TEXTURE1);
+	glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.firstPass.weightsTexture);
+	glActiveTexture(GL_TEXTURE0);
+	glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+	//glBindTexture(GL_TEXTURE_2D, 0);
+
+#if 0
+	// debug output
+	int w = outHemiSize * ctx->hemisphere.fbHemiCountX, h = outHemiSize * ctx->hemisphere.fbHemiCountY;
+	glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
+	glBindFramebuffer(GL_READ_FRAMEBUFFER, ctx->hemisphere.fb[fbWrite]);
+	glReadBuffer(GL_COLOR_ATTACHMENT0);
+	float *image = new float[3 * w * h];
+	glReadPixels(0, 0, w, h, GL_RGB, GL_FLOAT, image);
+	lmImageSaveTGAf("firstpass.png", image, w, h, 3);
+	delete[] image;
+#endif
+
+	// downsampling passes
+	glUseProgram(ctx->hemisphere.downsamplePass.programID);
+	glUniform1i(ctx->hemisphere.downsamplePass.hemispheresTextureID, 0);
+	while (outHemiSize > 1)
+	{
+		LM_SWAP(int, fbRead, fbWrite);
+		outHemiSize /= 2;
+		glBindFramebuffer(GL_FRAMEBUFFER, ctx->hemisphere.fb[fbWrite]);
+		glViewport(0, 0, outHemiSize * ctx->hemisphere.fbHemiCountX, outHemiSize * ctx->hemisphere.fbHemiCountY);
+		glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.fbTexture[fbRead]);
+		glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+		//glBindTexture(GL_TEXTURE_2D, 0);
+	}
+
+	// copy results to storage texture
+	glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.storage.texture);
+	glCopyTexSubImage2D(GL_TEXTURE_2D, 0,
+		ctx->hemisphere.storage.writePosition.x, ctx->hemisphere.storage.writePosition.y,
+		0, 0, ctx->hemisphere.fbHemiCountX, ctx->hemisphere.fbHemiCountY);
+	glBindTexture(GL_TEXTURE_2D, 0);
+	glBindFramebuffer(GL_FRAMEBUFFER, 0);
+	glBindVertexArray(0);
+	glEnable(GL_DEPTH_TEST);
+
+	// copy position mapping to storage
+	for (unsigned int y = 0; y < ctx->hemisphere.fbHemiCountY; y++)
+	{
+		int sy = ctx->hemisphere.storage.writePosition.y + y;
+		for (unsigned int x = 0; x < ctx->hemisphere.fbHemiCountX; x++)
+		{
+			int sx = ctx->hemisphere.storage.writePosition.x + x;
+			unsigned int hemiIndex = y * ctx->hemisphere.fbHemiCountX + x;
+			if (hemiIndex >= ctx->hemisphere.fbHemiIndex)
+				ctx->hemisphere.storage.toLightmapLocation[sy * ctx->lightmap.width + sx] = lm_i2(-1, -1);
+			else
+				ctx->hemisphere.storage.toLightmapLocation[sy * ctx->lightmap.width + sx] = ctx->hemisphere.fbHemiToLightmapLocation[hemiIndex];
+		}
+	}
+
+	// advance storage texture write position
+	ctx->hemisphere.storage.writePosition.x += ctx->hemisphere.fbHemiCountX;
+	if (ctx->hemisphere.storage.writePosition.x + (int)ctx->hemisphere.fbHemiCountX > ctx->lightmap.width)
+	{
+		ctx->hemisphere.storage.writePosition.x = 0;
+		ctx->hemisphere.storage.writePosition.y += ctx->hemisphere.fbHemiCountY;
+		assert(ctx->hemisphere.storage.writePosition.y + (int)ctx->hemisphere.fbHemiCountY < ctx->lightmap.height);
+	}
+
+	ctx->hemisphere.fbHemiIndex = 0;
+}
+
+static void lm_writeResultsToLightmap(lm_context *ctx)
+{
+	// do the GPU->CPU transfer of downsampled hemispheres
+	float *hemi = (float*)LM_CALLOC(ctx->lightmap.width * ctx->lightmap.height, 4 * sizeof(float));
+	glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.storage.texture);
+	glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA, GL_FLOAT, hemi);
+
+	// write results to lightmap texture
+	for (int y = 0; y < ctx->hemisphere.storage.writePosition.y + (int)ctx->hemisphere.fbHemiCountY; y++)
+	{
+		for (int x = 0; x < ctx->lightmap.width; x++)
+		{
+			lm_ivec2 lmUV = ctx->hemisphere.storage.toLightmapLocation[y * ctx->lightmap.width + x];
+			if (lmUV.x >= 0)
+			{
+				float *c = hemi + (y * ctx->lightmap.width + x) * 4;
+				float validity = c[3];
+				float *lm = ctx->lightmap.data + (lmUV.y * ctx->lightmap.width + lmUV.x) * ctx->lightmap.channels;
+				if (!lm[0] && validity > 0.9)
+				{
+					float scale = 1.0f / validity;
+					switch (ctx->lightmap.channels)
+					{
+					case 1:
+						lm[0] = lm_maxf((c[0] + c[1] + c[2]) * scale / 3.0f, FLT_MIN);
+						break;
+					case 2:
+						lm[0] = lm_maxf((c[0] + c[1] + c[2]) * scale / 3.0f, FLT_MIN);
+						lm[1] = 1.0f; // do we want to support this format?
+						break;
+					case 3:
+						lm[0] = lm_maxf(c[0] * scale, FLT_MIN);
+						lm[1] = lm_maxf(c[1] * scale, FLT_MIN);
+						lm[2] = lm_maxf(c[2] * scale, FLT_MIN);
+						break;
+					case 4:
+						lm[0] = lm_maxf(c[0] * scale, FLT_MIN);
+						lm[1] = lm_maxf(c[1] * scale, FLT_MIN);
+						lm[2] = lm_maxf(c[2] * scale, FLT_MIN);
+						lm[3] = 1.0f;
+						break;
+					default:
+						assert(LM_FALSE);
+						break;
+					}
+
+#ifdef LM_DEBUG_INTERPOLATION
+					// set sampled pixel to red in debug output
+					ctx->lightmap.debug[(lmUV.y * ctx->lightmap.width + lmUV.x) * 3 + 0] = 255;
+#endif
+				}
+			}
+			ctx->hemisphere.storage.toLightmapLocation[y * ctx->lightmap.width + x].x = -1; // reset
+		}
+	}
+
+	LM_FREE(hemi);
+	ctx->hemisphere.storage.writePosition = lm_i2(0, 0);
+}
+
+static void lm_setView(
+	int* viewport, int x, int y, int w, int h,
+	float* view,   lm_vec3 pos, lm_vec3 dir, lm_vec3 up,
+	float* proj,   float l, float r, float b, float t, float n, float f)
+{
+	// viewport
+	viewport[0] = x; viewport[1] = y; viewport[2] = w; viewport[3] = h;
+
+	// view matrix: lookAt(pos, pos + dir, up)
+	lm_vec3 side = lm_cross3(dir, up);
+	//up = cross(side, dir);
+	dir = lm_negate3(dir); pos = lm_negate3(pos);
+	view[ 0] = side.x;             view[ 1] = up.x;             view[ 2] = dir.x;             view[ 3] = 0.0f;
+	view[ 4] = side.y;             view[ 5] = up.y;             view[ 6] = dir.y;             view[ 7] = 0.0f;
+	view[ 8] = side.z;             view[ 9] = up.z;             view[10] = dir.z;             view[11] = 0.0f;
+	view[12] = lm_dot3(side, pos); view[13] = lm_dot3(up, pos); view[14] = lm_dot3(dir, pos); view[15] = 1.0f;
+
+	// projection matrix: frustum(l, r, b, t, n, f)
+	float ilr = 1.0f / (r - l), ibt = 1.0f / (t - b), ninf = -1.0f / (f - n), n2 = 2.0f * n;
+	proj[ 0] = n2 * ilr;      proj[ 1] = 0.0f;          proj[ 2] = 0.0f;           proj[ 3] = 0.0f;
+	proj[ 4] = 0.0f;          proj[ 5] = n2 * ibt;      proj[ 6] = 0.0f;           proj[ 7] = 0.0f;
+	proj[ 8] = (r + l) * ilr; proj[ 9] = (t + b) * ibt; proj[10] = (f + n) * ninf; proj[11] = -1.0f;
+	proj[12] = 0.0f;          proj[13] = 0.0f;          proj[14] = f * n2 * ninf;  proj[15] = 0.0f;
+}
+
+// returns true if a hemisphere side was prepared for rendering and
+// false if we finished the current hemisphere
+static lm_bool lm_beginSampleHemisphere(lm_context *ctx, int* viewport, float* view, float* proj)
+{
+	if (ctx->meshPosition.hemisphere.side >= 5)
+		return LM_FALSE;
+
+	if (ctx->meshPosition.hemisphere.side == 0)
+	{
+		// prepare hemisphere
+		glBindFramebuffer(GL_FRAMEBUFFER, ctx->hemisphere.fb[0]);
+		if (ctx->hemisphere.fbHemiIndex == 0)
+		{
+			// prepare hemisphere batch
+			glClearColor( // clear to valid background pixels!
+				ctx->hemisphere.clearColor.r,
+				ctx->hemisphere.clearColor.g,
+				ctx->hemisphere.clearColor.b, 1.0f);
+			glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+		}
+		ctx->hemisphere.fbHemiToLightmapLocation[ctx->hemisphere.fbHemiIndex] =
+			lm_i2(ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y);
+	}
+
+	// find the target position in the batch
+	int x = (ctx->hemisphere.fbHemiIndex % ctx->hemisphere.fbHemiCountX) * ctx->hemisphere.size * 3;
+	int y = (ctx->hemisphere.fbHemiIndex / ctx->hemisphere.fbHemiCountX) * ctx->hemisphere.size;
+
+	int size = ctx->hemisphere.size;
+	float zNear = ctx->hemisphere.zNear;
+	float zFar = ctx->hemisphere.zFar;
+
+	lm_vec3 pos = ctx->meshPosition.sample.position;
+	lm_vec3 dir = ctx->meshPosition.sample.direction;
+	lm_vec3 up = ctx->meshPosition.sample.up;
+	lm_vec3 right = lm_cross3(dir, up);
+
+	// find the view parameters of the hemisphere side that we will render next
+	// hemisphere layout in the framebuffer:
+	//       +-------+---+---+-------+
+	//       |       |   |   |   D   |
+	//       |   C   | R | L +-------+
+	//       |       |   |   |   U   |
+	//       +-------+---+---+-------+
+	switch (ctx->meshPosition.hemisphere.side)
+	{
+	case 0: // center
+		lm_setView(viewport, x, y, size, size,
+				   view,     pos, dir, up,
+				   proj,     -zNear, zNear, -zNear, zNear, zNear, zFar);
+		break;
+	case 1: // right
+		lm_setView(viewport, size + x, y, size / 2, size,
+				   view,     pos, right, up,
+				   proj,     -zNear, 0.0f, -zNear, zNear, zNear, zFar);
+		break;
+	case 2: // left
+		lm_setView(viewport, size + x + size / 2, y, size / 2, size,
+				   view,     pos, lm_negate3(right), up,
+				   proj,     0.0f, zNear, -zNear, zNear, zNear, zFar);
+		break;
+	case 3: // down
+		lm_setView(viewport, 2 * size + x, y + size / 2, size, size / 2,
+				   view,     pos, lm_negate3(up), dir,
+				   proj,     -zNear, zNear, 0.0f, zNear, zNear, zFar);
+		break;
+	case 4: // up
+		lm_setView(viewport, 2 * size + x, y, size, size / 2,
+				   view,     pos, up, lm_negate3(dir),
+				   proj,     -zNear, zNear, -zNear, 0.0f, zNear, zFar);
+		break;
+	default:
+		assert(LM_FALSE);
+		break;
+	}
+
+	return LM_TRUE;
+}
+
+static void lm_endSampleHemisphere(lm_context *ctx)
+{
+	if (++ctx->meshPosition.hemisphere.side == 5)
+	{
+		// finish hemisphere
+		glBindFramebuffer(GL_FRAMEBUFFER, 0);
+		if (++ctx->hemisphere.fbHemiIndex == ctx->hemisphere.fbHemiCountX * ctx->hemisphere.fbHemiCountY)
+		{
+			// downsample new hemisphere batch and store the results
+			lm_integrateHemisphereBatch(ctx);
+		}
+	}
+}
+
+static void lm_inverseTranspose(const float *m44, float *n33)
+{
+	if (!m44)
+	{
+		n33[0] = 1.0f; n33[1] = 0.0f; n33[2] = 0.0f;
+		n33[3] = 0.0f; n33[4] = 1.0f; n33[5] = 0.0f;
+		n33[6] = 0.0f; n33[7] = 0.0f; n33[8] = 1.0f;
+		return;
+	}
+
+	float determinant = m44[ 0] * (m44[ 5] * m44[10] - m44[ 9] * m44[ 6])
+					  - m44[ 1] * (m44[ 4] * m44[10] - m44[ 6] * m44[ 8])
+					  + m44[ 2] * (m44[ 4] * m44[ 9] - m44[ 5] * m44[ 8]);
+
+	assert(fabs(determinant) > FLT_EPSILON);
+	float rcpDeterminant = 1.0f / determinant;
+
+	n33[0] =  (m44[ 5] * m44[10] - m44[ 9] * m44[ 6]) * rcpDeterminant;
+	n33[3] = -(m44[ 1] * m44[10] - m44[ 2] * m44[ 9]) * rcpDeterminant;
+	n33[6] =  (m44[ 1] * m44[ 6] - m44[ 2] * m44[ 5]) * rcpDeterminant;
+	n33[1] = -(m44[ 4] * m44[10] - m44[ 6] * m44[ 8]) * rcpDeterminant;
+	n33[4] =  (m44[ 0] * m44[10] - m44[ 2] * m44[ 8]) * rcpDeterminant;
+	n33[7] = -(m44[ 0] * m44[ 6] - m44[ 4] * m44[ 2]) * rcpDeterminant;
+	n33[2] =  (m44[ 4] * m44[ 9] - m44[ 8] * m44[ 5]) * rcpDeterminant;
+	n33[5] = -(m44[ 0] * m44[ 9] - m44[ 8] * m44[ 1]) * rcpDeterminant;
+	n33[8] =  (m44[ 0] * m44[ 5] - m44[ 4] * m44[ 1]) * rcpDeterminant;
+}
+
+static lm_vec3 lm_transformNormal(const float *m, lm_vec3 n)
+{
+	lm_vec3 r;
+	r.x = m[0] * n.x + m[3] * n.y + m[6] * n.z;
+	r.y = m[1] * n.x + m[4] * n.y + m[7] * n.z;
+	r.z = m[2] * n.x + m[5] * n.y + m[8] * n.z;
+	return r;
+}
+
+static lm_vec3 lm_transformPosition(const float *m, lm_vec3 v)
+{
+	if (!m)
+		return v;
+	lm_vec3 r;
+	r.x =     m[0] * v.x + m[4] * v.y + m[ 8] * v.z + m[12];
+	r.y =     m[1] * v.x + m[5] * v.y + m[ 9] * v.z + m[13];
+	r.z =     m[2] * v.x + m[6] * v.y + m[10] * v.z + m[14];
+	float d = m[3] * v.x + m[7] * v.y + m[11] * v.z + m[15];
+	assert(lm_absf(d - 1.0f) < 0.00001f); // could divide by d, but this shouldn't be a projection transform!
+	return r;
+}
+
+static void lm_setMeshPosition(lm_context *ctx, unsigned int indicesTriangleBaseIndex)
+{
+	// fetch triangle at the specified indicesTriangleBaseIndex
+	ctx->meshPosition.triangle.baseIndex = indicesTriangleBaseIndex;
+
+	// load and transform triangle to process next
+	lm_vec2 uvMin = lm_v2(FLT_MAX, FLT_MAX), uvMax = lm_v2(-FLT_MAX, -FLT_MAX);
+	lm_vec2 uvScale = lm_v2i(ctx->lightmap.width, ctx->lightmap.height);
+	unsigned int vIndices[3];
+	for (int i = 0; i < 3; i++)
+	{
+		// decode index
+		unsigned int vIndex;
+		switch (ctx->mesh.indicesType)
+		{
+		case LM_NONE:
+			vIndex = ctx->meshPosition.triangle.baseIndex + i;
+			break;
+		case LM_UNSIGNED_BYTE:
+			vIndex = ((const unsigned char*)ctx->mesh.indices + ctx->meshPosition.triangle.baseIndex)[i];
+			break;
+		case LM_UNSIGNED_SHORT:
+			vIndex = ((const unsigned short*)ctx->mesh.indices + ctx->meshPosition.triangle.baseIndex)[i];
+			break;
+		case LM_UNSIGNED_INT:
+			vIndex = ((const unsigned int*)ctx->mesh.indices + ctx->meshPosition.triangle.baseIndex)[i];
+			break;
+		default:
+			assert(LM_FALSE);
+			break;
+		}
+		vIndices[i] = vIndex;
+
+		// decode and pre-transform vertex position
+		const void *pPtr = ctx->mesh.positions + vIndex * ctx->mesh.positionsStride;
+		lm_vec3 p;
+		switch (ctx->mesh.positionsType)
+		{
+		// TODO: signed formats
+		case LM_UNSIGNED_BYTE: {
+			const unsigned char *uc = (const unsigned char*)pPtr;
+			p = lm_v3(uc[0], uc[1], uc[2]);
+		} break;
+		case LM_UNSIGNED_SHORT: {
+			const unsigned short *us = (const unsigned short*)pPtr;
+			p = lm_v3(us[0], us[1], us[2]);
+		} break;
+		case LM_UNSIGNED_INT: {
+			const unsigned int *ui = (const unsigned int*)pPtr;
+			p = lm_v3((float)ui[0], (float)ui[1], (float)ui[2]);
+		} break;
+		case LM_FLOAT: {
+			p = *(const lm_vec3*)pPtr;
+		} break;
+		default: {
+			assert(LM_FALSE);
+		} break;
+		}
+		ctx->meshPosition.triangle.p[i] = lm_transformPosition(ctx->mesh.modelMatrix, p);
+
+		// decode and scale (to lightmap resolution) vertex lightmap texture coords
+		const void *uvPtr = ctx->mesh.uvs + vIndex * ctx->mesh.uvsStride;
+		lm_vec2 uv;
+		switch (ctx->mesh.uvsType)
+		{
+		case LM_UNSIGNED_BYTE: {
+			const unsigned char *uc = (const unsigned char*)uvPtr;
+			uv = lm_v2(uc[0] / (float)UCHAR_MAX, uc[1] / (float)UCHAR_MAX);
+		} break;
+		case LM_UNSIGNED_SHORT: {
+			const unsigned short *us = (const unsigned short*)uvPtr;
+			uv = lm_v2(us[0] / (float)USHRT_MAX, us[1] / (float)USHRT_MAX);
+		} break;
+		case LM_UNSIGNED_INT: {
+			const unsigned int *ui = (const unsigned int*)uvPtr;
+			uv = lm_v2(ui[0] / (float)UINT_MAX, ui[1] / (float)UINT_MAX);
+		} break;
+		case LM_FLOAT: {
+			uv = *(const lm_vec2*)uvPtr;
+		} break;
+		default: {
+			assert(LM_FALSE);
+		} break;
+		}
+
+		ctx->meshPosition.triangle.uv[i] = lm_mul2(lm_pmod2(uv, 1.0f), uvScale); // maybe clamp to 0.0-1.0 instead of pmod?
+
+		// update bounds on lightmap
+		uvMin = lm_min2(uvMin, ctx->meshPosition.triangle.uv[i]);
+		uvMax = lm_max2(uvMax, ctx->meshPosition.triangle.uv[i]);
+	}
+
+	lm_vec3 flatNormal = lm_cross3(
+		lm_sub3(ctx->meshPosition.triangle.p[1], ctx->meshPosition.triangle.p[0]),
+		lm_sub3(ctx->meshPosition.triangle.p[2], ctx->meshPosition.triangle.p[0]));
+
+	for (int i = 0; i < 3; i++)
+	{
+		// decode and pre-transform vertex normal
+		const void *nPtr = ctx->mesh.normals + vIndices[i] * ctx->mesh.normalsStride;
+		lm_vec3 n;
+		switch (ctx->mesh.normalsType)
+		{
+		// TODO: signed formats
+		case LM_FLOAT: {
+			n = *(const lm_vec3*)nPtr;
+		} break;
+		case LM_NONE: {
+			n = flatNormal;
+		} break;
+		default: {
+			assert(LM_FALSE);
+		} break;
+		}
+		ctx->meshPosition.triangle.n[i] = lm_normalize3(lm_transformNormal(ctx->mesh.normalMatrix, n));
+	}
+
+	// calculate area of interest (on lightmap) for conservative rasterization
+	lm_vec2 bbMin = lm_floor2(uvMin);
+	lm_vec2 bbMax = lm_ceil2 (uvMax);
+	ctx->meshPosition.rasterizer.minx = lm_maxi((int)bbMin.x - 1, 0);
+	ctx->meshPosition.rasterizer.miny = lm_maxi((int)bbMin.y - 1, 0);
+	ctx->meshPosition.rasterizer.maxx = lm_mini((int)bbMax.x + 1, ctx->lightmap.width - 1);
+	ctx->meshPosition.rasterizer.maxy = lm_mini((int)bbMax.y + 1, ctx->lightmap.height - 1);
+	assert(ctx->meshPosition.rasterizer.minx <= ctx->meshPosition.rasterizer.maxx &&
+		   ctx->meshPosition.rasterizer.miny <= ctx->meshPosition.rasterizer.maxy);
+	ctx->meshPosition.rasterizer.x = ctx->meshPosition.rasterizer.minx + lm_passOffsetX(ctx);
+	ctx->meshPosition.rasterizer.y = ctx->meshPosition.rasterizer.miny + lm_passOffsetY(ctx);
+
+	// try moving to first valid sample position
+	if (ctx->meshPosition.rasterizer.x <= ctx->meshPosition.rasterizer.maxx &&
+		ctx->meshPosition.rasterizer.y <= ctx->meshPosition.rasterizer.maxy &&
+		lm_findFirstConservativeTriangleRasterizerPosition(ctx))
+		ctx->meshPosition.hemisphere.side = 0; // we can start sampling the hemisphere
+	else
+		ctx->meshPosition.hemisphere.side = 5; // no samples on this triangle! put hemisphere sampler into finished state
+}
+
+static GLuint lm_LoadShader(GLenum type, const char *source)
+{
+	GLuint shader = glCreateShader(type);
+	if (shader == 0)
+	{
+		fprintf(stderr, "Could not create shader!\n");
+		return 0;
+	}
+	glShaderSource(shader, 1, &source, NULL);
+	glCompileShader(shader);
+	GLint compiled;
+	glGetShaderiv(shader, GL_COMPILE_STATUS, &compiled);
+	if (!compiled)
+	{
+		fprintf(stderr, "Could not compile shader!\n");
+		GLint infoLen = 0;
+		glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &infoLen);
+		if (infoLen)
+		{
+			char* infoLog = (char*)malloc(infoLen);
+			glGetShaderInfoLog(shader, infoLen, NULL, infoLog);
+			fprintf(stderr, "%s\n", infoLog);
+			free(infoLog);
+		}
+		glDeleteShader(shader);
+		return 0;
+	}
+	return shader;
+}
+
+static GLuint lm_LoadProgram(const char *vp, const char *fp)
+{
+	GLuint program = glCreateProgram();
+	if (program == 0)
+	{
+		fprintf(stderr, "Could not create program!\n");
+		return 0;
+	}
+	GLuint vertexShader = lm_LoadShader(GL_VERTEX_SHADER, vp);
+	GLuint fragmentShader = lm_LoadShader(GL_FRAGMENT_SHADER, fp);
+	glAttachShader(program, vertexShader);
+	glAttachShader(program, fragmentShader);
+	glLinkProgram(program);
+	glDeleteShader(vertexShader);
+	glDeleteShader(fragmentShader);
+	GLint linked;
+	glGetProgramiv(program, GL_LINK_STATUS, &linked);
+	if (!linked)
+	{
+		fprintf(stderr, "Could not link program!\n");
+		GLint infoLen = 0;
+		glGetProgramiv(program, GL_INFO_LOG_LENGTH, &infoLen);
+		if (infoLen)
+		{
+			char* infoLog = (char*)malloc(sizeof(char) * infoLen);
+			glGetProgramInfoLog(program, infoLen, NULL, infoLog);
+			fprintf(stderr, "%s\n", infoLog);
+			free(infoLog);
+		}
+		glDeleteProgram(program);
+		return 0;
+	}
+	return program;
+}
+
+static float lm_defaultWeights(float cos_theta, void *userdata)
+{
+	return 1.0f;
+}
+
+lm_context *lmCreate(int hemisphereSize, float zNear, float zFar,
+	float clearR, float clearG, float clearB,
+	int interpolationPasses, float interpolationThreshold,
+	float cameraToSurfaceDistanceModifier)
+{
+	assert(hemisphereSize == 512 || hemisphereSize == 256 || hemisphereSize == 128 ||
+		   hemisphereSize ==  64 || hemisphereSize ==  32 || hemisphereSize ==  16);
+	assert(zNear < zFar && zNear > 0.0f);
+	assert(cameraToSurfaceDistanceModifier >= -1.0f);
+	assert(interpolationPasses >= 0 && interpolationPasses <= 8);
+	assert(interpolationThreshold >= 0.0f);
+
+	lm_context *ctx = (lm_context*)LM_CALLOC(1, sizeof(lm_context));
+
+	ctx->meshPosition.passCount = 1 + 3 * interpolationPasses;
+	ctx->interpolationThreshold = interpolationThreshold;
+	ctx->hemisphere.size = hemisphereSize;
+	ctx->hemisphere.zNear = zNear;
+	ctx->hemisphere.zFar = zFar;
+	ctx->hemisphere.cameraToSurfaceDistanceModifier = cameraToSurfaceDistanceModifier;
+	ctx->hemisphere.clearColor.r = clearR;
+	ctx->hemisphere.clearColor.g = clearG;
+	ctx->hemisphere.clearColor.b = clearB;
+
+	// calculate hemisphere batch size
+	ctx->hemisphere.fbHemiCountX = 1536 / (3 * ctx->hemisphere.size);
+	ctx->hemisphere.fbHemiCountY = 512 / ctx->hemisphere.size;
+
+	// hemisphere batch framebuffers
+	unsigned int w[] = {
+		ctx->hemisphere.fbHemiCountX * ctx->hemisphere.size * 3,
+		ctx->hemisphere.fbHemiCountX * ctx->hemisphere.size / 2 };
+	unsigned int h[] = {
+		ctx->hemisphere.fbHemiCountY * ctx->hemisphere.size,
+		ctx->hemisphere.fbHemiCountY * ctx->hemisphere.size / 2 };
+
+	glGenTextures(2, ctx->hemisphere.fbTexture);
+	glGenFramebuffers(2, ctx->hemisphere.fb);
+	glGenRenderbuffers(1, &ctx->hemisphere.fbDepth);
+
+	glBindRenderbuffer(GL_RENDERBUFFER, ctx->hemisphere.fbDepth);
+	glRenderbufferStorage(GL_RENDERBUFFER, GL_DEPTH_COMPONENT24, w[0], h[0]);
+	glBindFramebuffer(GL_FRAMEBUFFER, ctx->hemisphere.fb[0]);
+	glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_RENDERBUFFER, ctx->hemisphere.fbDepth);
+	for (int i = 0; i < 2; i++)
+	{
+		glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.fbTexture[i]);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+		glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F, w[i], h[i], 0, GL_RGBA, GL_FLOAT, 0);
+
+		glBindFramebuffer(GL_FRAMEBUFFER, ctx->hemisphere.fb[i]);
+		glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, ctx->hemisphere.fbTexture[i], 0);
+		GLenum status = glCheckFramebufferStatus(GL_FRAMEBUFFER);
+		if (status != GL_FRAMEBUFFER_COMPLETE)
+		{
+			fprintf(stderr, "Could not create framebuffer!\n");
+			glDeleteRenderbuffers(1, &ctx->hemisphere.fbDepth);
+			glDeleteFramebuffers(2, ctx->hemisphere.fb);
+			glDeleteTextures(2, ctx->hemisphere.fbTexture);
+			LM_FREE(ctx);
+			return NULL;
+		}
+	}
+	glBindFramebuffer(GL_FRAMEBUFFER, 0);
+
+	// dummy vao for fullscreen quad rendering
+	glGenVertexArrays(1, &ctx->hemisphere.vao);
+
+	// hemisphere shader (weighted downsampling of the 3x1 hemisphere layout to a 0.5x0.5 square)
+	{
+		const char *vs =
+			"#version 150 core\n"
+			"const vec2 ps[4] = vec2[](vec2(1, -1), vec2(1, 1), vec2(-1, -1), vec2(-1, 1));\n"
+			"void main()\n"
+			"{\n"
+				"gl_Position = vec4(ps[gl_VertexID], 0, 1);\n"
+			"}\n";
+		const char *fs =
+			"#version 150 core\n"
+			"uniform sampler2D hemispheres;\n"
+			"uniform sampler2D weights;\n"
+
+			"layout(pixel_center_integer) in vec4 gl_FragCoord;\n" // whole integer values represent pixel centers, GL_ARB_fragment_coord_conventions
+
+			"out vec4 outColor;\n"
+
+			"vec4 weightedSample(ivec2 h_uv, ivec2 w_uv, ivec2 quadrant)\n"
+			"{\n"
+				"vec4 sample = texelFetch(hemispheres, h_uv + quadrant, 0);\n"
+				"vec2 weight = texelFetch(weights, w_uv + quadrant, 0).rg;\n"
+				"return vec4(sample.rgb * weight.r, sample.a * weight.g);\n"
+			"}\n"
+
+			"vec4 threeWeightedSamples(ivec2 h_uv, ivec2 w_uv, ivec2 offset)\n"
+			"{\n" // horizontal triple sum
+				"vec4 sum = weightedSample(h_uv, w_uv, offset);\n"
+				"sum += weightedSample(h_uv, w_uv, offset + ivec2(2, 0));\n"
+				"sum += weightedSample(h_uv, w_uv, offset + ivec2(4, 0));\n"
+				"return sum;\n"
+			"}\n"
+
+			"void main()\n"
+			"{\n" // this is a weighted sum downsampling pass (alpha component contains the weighted valid sample count)
+				"vec2 in_uv = gl_FragCoord.xy * vec2(6.0, 2.0) + vec2(0.5);\n"
+				"ivec2 h_uv = ivec2(in_uv);\n"
+				"ivec2 w_uv = ivec2(mod(in_uv, vec2(textureSize(weights, 0))));\n" // there's no integer modulo :(
+				"vec4 lb = threeWeightedSamples(h_uv, w_uv, ivec2(0, 0));\n"
+				"vec4 rb = threeWeightedSamples(h_uv, w_uv, ivec2(1, 0));\n"
+				"vec4 lt = threeWeightedSamples(h_uv, w_uv, ivec2(0, 1));\n"
+				"vec4 rt = threeWeightedSamples(h_uv, w_uv, ivec2(1, 1));\n"
+				"outColor = lb + rb + lt + rt;\n"
+			"}\n";
+		ctx->hemisphere.firstPass.programID = lm_LoadProgram(vs, fs);
+		if (!ctx->hemisphere.firstPass.programID)
+		{
+			fprintf(stderr, "Error loading the hemisphere first pass shader program... leaving!\n");
+			glDeleteVertexArrays(1, &ctx->hemisphere.vao);
+			glDeleteRenderbuffers(1, &ctx->hemisphere.fbDepth);
+			glDeleteFramebuffers(2, ctx->hemisphere.fb);
+			glDeleteTextures(2, ctx->hemisphere.fbTexture);
+			LM_FREE(ctx);
+			return NULL;
+		}
+		ctx->hemisphere.firstPass.hemispheresTextureID = glGetUniformLocation(ctx->hemisphere.firstPass.programID, "hemispheres");
+		ctx->hemisphere.firstPass.weightsTextureID = glGetUniformLocation(ctx->hemisphere.firstPass.programID, "weights");
+	}
+
+	// downsample shader
+	{
+		const char *vs =
+			"#version 150 core\n"
+			"const vec2 ps[4] = vec2[](vec2(1, -1), vec2(1, 1), vec2(-1, -1), vec2(-1, 1));\n"
+			"void main()\n"
+			"{\n"
+				"gl_Position = vec4(ps[gl_VertexID], 0, 1);\n"
+			"}\n";
+		const char *fs =
+			"#version 150 core\n"
+			"uniform sampler2D hemispheres;\n"
+
+			"layout(pixel_center_integer) in vec4 gl_FragCoord;\n" // whole integer values represent pixel centers, GL_ARB_fragment_coord_conventions
+
+			"out vec4 outColor;\n"
+
+			"void main()\n"
+			"{\n" // this is a sum downsampling pass (alpha component contains the weighted valid sample count)
+				"ivec2 h_uv = ivec2(gl_FragCoord.xy) * 2;\n"
+				"vec4 lb = texelFetch(hemispheres, h_uv + ivec2(0, 0), 0);\n"
+				"vec4 rb = texelFetch(hemispheres, h_uv + ivec2(1, 0), 0);\n"
+				"vec4 lt = texelFetch(hemispheres, h_uv + ivec2(0, 1), 0);\n"
+				"vec4 rt = texelFetch(hemispheres, h_uv + ivec2(1, 1), 0);\n"
+				"outColor = lb + rb + lt + rt;\n"
+			"}\n";
+		ctx->hemisphere.downsamplePass.programID = lm_LoadProgram(vs, fs);
+		if (!ctx->hemisphere.downsamplePass.programID)
+		{
+			fprintf(stderr, "Error loading the hemisphere downsample pass shader program... leaving!\n");
+			glDeleteProgram(ctx->hemisphere.firstPass.programID);
+			glDeleteVertexArrays(1, &ctx->hemisphere.vao);
+			glDeleteRenderbuffers(1, &ctx->hemisphere.fbDepth);
+			glDeleteFramebuffers(2, ctx->hemisphere.fb);
+			glDeleteTextures(2, ctx->hemisphere.fbTexture);
+			LM_FREE(ctx);
+			return NULL;
+		}
+		ctx->hemisphere.downsamplePass.hemispheresTextureID = glGetUniformLocation(ctx->hemisphere.downsamplePass.programID, "hemispheres");
+	}
+
+	// hemisphere weights texture
+	glGenTextures(1, &ctx->hemisphere.firstPass.weightsTexture);
+	lmSetHemisphereWeights(ctx, lm_defaultWeights, 0);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
+
+	// allocate batchPosition-to-lightmapPosition map
+	ctx->hemisphere.fbHemiToLightmapLocation = (lm_ivec2*)LM_CALLOC(ctx->hemisphere.fbHemiCountX * ctx->hemisphere.fbHemiCountY, sizeof(lm_ivec2));
+
+	return ctx;
+}
+
+void lmDestroy(lm_context *ctx)
+{
+	// reset state
+	glUseProgram(0);
+	glBindTexture(GL_TEXTURE_2D, 0);
+	glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
+	glBindVertexArray(0);
+	glBindFramebuffer(GL_FRAMEBUFFER, 0);
+	glBindFramebuffer(GL_READ_FRAMEBUFFER, 0);
+	glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
+
+	// delete gl objects
+	glDeleteTextures(1, &ctx->hemisphere.firstPass.weightsTexture);
+	glDeleteTextures(1, &ctx->hemisphere.storage.texture);
+	glDeleteProgram(ctx->hemisphere.downsamplePass.programID);
+	glDeleteProgram(ctx->hemisphere.firstPass.programID);
+	glDeleteVertexArrays(1, &ctx->hemisphere.vao);
+	glDeleteRenderbuffers(1, &ctx->hemisphere.fbDepth);
+	glDeleteFramebuffers(2, ctx->hemisphere.fb);
+	glDeleteTextures(2, ctx->hemisphere.fbTexture);
+	glDeleteTextures(1, &ctx->hemisphere.storage.texture);
+
+	// free memory
+	LM_FREE(ctx->hemisphere.storage.toLightmapLocation);
+	LM_FREE(ctx->hemisphere.fbHemiToLightmapLocation);
+#ifdef LM_DEBUG_INTERPOLATION
+	LM_FREE(ctx->lightmap.debug);
+#endif
+	LM_FREE(ctx);
+}
+
+void lmSetHemisphereWeights(lm_context *ctx, lm_weight_func f, void *userdata)
+{
+	// hemisphere weights texture. bakes in material dependent attenuation behaviour.
+	float *weights = (float*)LM_CALLOC(2 * 3 * ctx->hemisphere.size * ctx->hemisphere.size, sizeof(float));
+	float center = (ctx->hemisphere.size - 1) * 0.5f;
+	double sum = 0.0;
+	for (unsigned int y = 0; y < ctx->hemisphere.size; y++)
+	{
+		float dy = 2.0f * (y - center) / (float)ctx->hemisphere.size;
+		for (unsigned int x = 0; x < ctx->hemisphere.size; x++)
+		{
+			float dx = 2.0f * (x - center) / (float)ctx->hemisphere.size;
+			lm_vec3 v = lm_normalize3(lm_v3(dx, dy, 1.0f));
+
+			float solidAngle = v.z * v.z * v.z;
+
+			float *w0 = weights + 2 * (y * (3 * ctx->hemisphere.size) + x);
+			float *w1 = w0 + 2 * ctx->hemisphere.size;
+			float *w2 = w1 + 2 * ctx->hemisphere.size;
+
+			// center weights
+			w0[0] = solidAngle * f(v.z, userdata);
+			w0[1] = solidAngle;
+
+			// left/right side weights
+			w1[0] = solidAngle * f(lm_absf(v.x), userdata);
+			w1[1] = solidAngle;
+
+			// up/down side weights
+			w2[0] = solidAngle * f(lm_absf(v.y), userdata);
+			w2[1] = solidAngle;
+
+			sum += 3.0 * (double)solidAngle;
+		}
+	}
+
+	// normalize weights
+	float weightScale = (float)(1.0 / sum);
+	for (unsigned int i = 0; i < 2 * 3 * ctx->hemisphere.size * ctx->hemisphere.size; i++)
+		weights[i] *= weightScale;
+
+	// upload weight texture
+	glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.firstPass.weightsTexture);
+	glTexImage2D(GL_TEXTURE_2D, 0, GL_RG32F, 3 * ctx->hemisphere.size, ctx->hemisphere.size, 0, GL_RG, GL_FLOAT, weights);
+	LM_FREE(weights);
+}
+
+void lmSetTargetLightmap(lm_context *ctx, float *outLightmap, int w, int h, int c)
+{
+	ctx->lightmap.data = outLightmap;
+	ctx->lightmap.width = w;
+	ctx->lightmap.height = h;
+	ctx->lightmap.channels = c;
+
+	// allocate storage texture
+	if (!ctx->hemisphere.storage.texture)
+		glGenTextures(1, &ctx->hemisphere.storage.texture);
+	glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.storage.texture);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+	glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F, w, h, 0, GL_RGBA, GL_FLOAT, 0);
+
+	// allocate storage position to lightmap position map
+	if (ctx->hemisphere.storage.toLightmapLocation)
+		LM_FREE(ctx->hemisphere.storage.toLightmapLocation);
+	ctx->hemisphere.storage.toLightmapLocation = (lm_ivec2*)LM_CALLOC(w * h, sizeof(lm_ivec2));
+	// invalidate all positions
+	for (int i = 0; i < w * h; i++)
+		ctx->hemisphere.storage.toLightmapLocation[i].x = -1;
+
+#ifdef LM_DEBUG_INTERPOLATION
+	if (ctx->lightmap.debug)
+		LM_FREE(ctx->lightmap.debug);
+	ctx->lightmap.debug = (unsigned char*)LM_CALLOC(ctx->lightmap.width * ctx->lightmap.height, 3);
+#endif
+}
+
+void lmSetGeometry(lm_context *ctx,
+	const float *transformationMatrix,
+	lm_type positionsType, const void *positionsXYZ, int positionsStride,
+	lm_type normalsType, const void *normalsXYZ, int normalsStride,
+	lm_type lightmapCoordsType, const void *lightmapCoordsUV, int lightmapCoordsStride,
+	int count, lm_type indicesType, const void *indices)
+{
+	ctx->mesh.modelMatrix = transformationMatrix;
+	ctx->mesh.positions = (const unsigned char*)positionsXYZ;
+	ctx->mesh.positionsType = positionsType;
+	ctx->mesh.positionsStride = positionsStride == 0 ? sizeof(lm_vec3) : positionsStride;
+	ctx->mesh.normals = (const unsigned char*)normalsXYZ;
+	ctx->mesh.normalsType = normalsType;
+	ctx->mesh.normalsStride = normalsStride == 0 ? sizeof(lm_vec3) : normalsStride;
+	ctx->mesh.uvs = (const unsigned char*)lightmapCoordsUV;
+	ctx->mesh.uvsType = lightmapCoordsType;
+	ctx->mesh.uvsStride = lightmapCoordsStride == 0 ? sizeof(lm_vec2) : lightmapCoordsStride;
+	ctx->mesh.indicesType = indicesType;
+	ctx->mesh.indices = (const unsigned char*)indices;
+	ctx->mesh.count = count;
+
+	lm_inverseTranspose(transformationMatrix, ctx->mesh.normalMatrix);
+
+	ctx->meshPosition.pass = 0;
+	lm_setMeshPosition(ctx, 0);
+}
+
+lm_bool lmBegin(lm_context *ctx, int* outViewport4, float* outView4x4, float* outProjection4x4)
+{
+	assert(ctx->meshPosition.triangle.baseIndex < ctx->mesh.count);
+	while (!lm_beginSampleHemisphere(ctx, outViewport4, outView4x4, outProjection4x4))
+	{ // as long as there are no hemisphere sides to render...
+		// try moving to the next rasterizer position
+		if (lm_findNextConservativeTriangleRasterizerPosition(ctx))
+		{ // if we successfully moved to the next sample position on the current triangle...
+			ctx->meshPosition.hemisphere.side = 0; // start sampling a hemisphere there
+		}
+		else
+		{ // if there are no valid sample positions on the current triangle...
+			if (ctx->meshPosition.triangle.baseIndex + 3 < ctx->mesh.count)
+			{ // ...and there are triangles left: move to the next triangle and continue sampling.
+				lm_setMeshPosition(ctx, ctx->meshPosition.triangle.baseIndex + 3);
+			}
+			else
+			{ // ...and there are no triangles left: finish
+				lm_integrateHemisphereBatch(ctx); // integrate and store last batch
+				lm_writeResultsToLightmap(ctx); // read storage data from gpu memory and write it to the lightmap
+
+				if (++ctx->meshPosition.pass == ctx->meshPosition.passCount)
+				{
+					ctx->meshPosition.pass = 0;
+					ctx->meshPosition.triangle.baseIndex = ctx->mesh.count; // set end condition (in case someone accidentally calls lmBegin again)
+
+#ifdef LM_DEBUG_INTERPOLATION
+					lmImageSaveTGAub("debug_interpolation.tga", ctx->lightmap.debug, ctx->lightmap.width, ctx->lightmap.height, 3);
+
+					// lightmap texel statistics
+					int rendered = 0, interpolated = 0, wasted = 0;
+					for (int y = 0; y < ctx->lightmap.height; y++)
+					{
+						for (int x = 0; x < ctx->lightmap.width; x++)
+						{
+							if (ctx->lightmap.debug[(y * ctx->lightmap.width + x) * 3 + 0])
+								rendered++;
+							else if (ctx->lightmap.debug[(y * ctx->lightmap.width + x) * 3 + 1])
+								interpolated++;
+							else
+								wasted++;
+						}
+					}
+					int used = rendered + interpolated;
+					int total = used + wasted;
+					printf("\n#######################################################################\n");
+					printf("%10d %6.2f%% rendered hemicubes integrated to lightmap texels.\n", rendered, 100.0f * (float)rendered / (float)total);
+					printf("%10d %6.2f%% interpolated lightmap texels.\n", interpolated, 100.0f * (float)interpolated / (float)total);
+					printf("%10d %6.2f%% wasted lightmap texels.\n", wasted, 100.0f * (float)wasted / (float)total);
+					printf("\n%17.2f%% of used texels were rendered.\n", 100.0f * (float)rendered / (float)used);
+					printf("#######################################################################\n");
+#endif
+
+					return LM_FALSE;
+				}
+
+				lm_setMeshPosition(ctx, 0); // start over with the next pass
+			}
+		}
+	}
+	return LM_TRUE;
+}
+
+float lmProgress(lm_context *ctx)
+{
+	float passProgress = (float)ctx->meshPosition.triangle.baseIndex / (float)ctx->mesh.count;
+	return ((float)ctx->meshPosition.pass + passProgress) / (float)ctx->meshPosition.passCount;
+}
+
+void lmEnd(lm_context *ctx)
+{
+	lm_endSampleHemisphere(ctx);
+}
+
+// these are not performance tuned since their impact on the whole lightmapping duration is insignificant
+float lmImageMin(const float *image, int w, int h, int c, int m)
+{
+	assert(c > 0 && m);
+	float minValue = FLT_MAX;
+	for (int i = 0; i < w * h; i++)
+		for (int j = 0; j < c; j++)
+			if (m & (1 << j))
+				minValue = lm_minf(minValue, image[i * c + j]);
+	return minValue;
+}
+
+float lmImageMax(const float *image, int w, int h, int c, int m)
+{
+	assert(c > 0 && m);
+	float maxValue = 0.0f;
+	for (int i = 0; i < w * h; i++)
+		for (int j = 0; j < c; j++)
+			if (m & (1 << j))
+				maxValue = lm_maxf(maxValue, image[i * c + j]);
+	return maxValue;
+}
+
+void lmImageAdd(float *image, int w, int h, int c, float value, int m)
+{
+	assert(c > 0 && m);
+	for (int i = 0; i < w * h; i++)
+		for (int j = 0; j < c; j++)
+			if (m & (1 << j))
+				image[i * c + j] += value;
+}
+
+void lmImageScale(float *image, int w, int h, int c, float factor, int m)
+{
+	assert(c > 0 && m);
+	for (int i = 0; i < w * h; i++)
+		for (int j = 0; j < c; j++)
+			if (m & (1 << j))
+				image[i * c + j] *= factor;
+}
+
+void lmImagePower(float *image, int w, int h, int c, float exponent, int m)
+{
+	assert(c > 0 && m);
+	for (int i = 0; i < w * h; i++)
+		for (int j = 0; j < c; j++)
+			if (m & (1 << j))
+				image[i * c + j] = powf(image[i * c + j], exponent);
+}
+
+void lmImageDilate(const float *image, float *outImage, int w, int h, int c)
+{
+	assert(c > 0 && c <= 4);
+	for (int y = 0; y < h; y++)
+	{
+		for (int x = 0; x < w; x++)
+		{
+			float color[4];
+			lm_bool valid = LM_FALSE;
+			for (int i = 0; i < c; i++)
+			{
+				color[i] = image[(y * w + x) * c + i];
+				valid |= color[i] > 0.0f;
+			}
+			if (!valid)
+			{
+				int n = 0;
+				const int dx[] = { -1, 0, 1,  0 };
+				const int dy[] = {  0, 1, 0, -1 };
+				for (int d = 0; d < 4; d++)
+				{
+					int cx = x + dx[d];
+					int cy = y + dy[d];
+					if (cx >= 0 && cx < w && cy >= 0 && cy < h)
+					{
+						float dcolor[4];
+						lm_bool dvalid = LM_FALSE;
+						for (int i = 0; i < c; i++)
+						{
+							dcolor[i] = image[(cy * w + cx) * c + i];
+							dvalid |= dcolor[i] > 0.0f;
+						}
+						if (dvalid)
+						{
+							for (int i = 0; i < c; i++)
+								color[i] += dcolor[i];
+							n++;
+						}
+					}
+				}
+				if (n)
+				{
+					float in = 1.0f / n;
+					for (int i = 0; i < c; i++)
+						color[i] *= in;
+				}
+			}
+			for (int i = 0; i < c; i++)
+				outImage[(y * w + x) * c + i] = color[i];
+		}
+	}
+}
+
+void lmImageSmooth(const float *image, float *outImage, int w, int h, int c)
+{
+	assert(c > 0 && c <= 4);
+	for (int y = 0; y < h; y++)
+	{
+		for (int x = 0; x < w; x++)
+		{
+			float color[4] = {0};
+			int n = 0;
+			for (int dy = -1; dy <= 1; dy++)
+			{
+				int cy = y + dy;
+				for (int dx = -1; dx <= 1; dx++)
+				{
+					int cx = x + dx;
+					if (cx >= 0 && cx < w && cy >= 0 && cy < h)
+					{
+						lm_bool valid = LM_FALSE;
+						for (int i = 0; i < c; i++)
+							valid |= image[(cy * w + cx) * c + i] > 0.0f;
+						if (valid)
+						{
+							for (int i = 0; i < c; i++)
+								color[i] += image[(cy * w + cx) * c + i];
+							n++;
+						}
+					}
+				}
+			}
+			for (int i = 0; i < c; i++)
+				outImage[(y * w + x) * c + i] = n ? color[i] / n : 0.0f;
+		}
+	}
+}
+
+void lmImageDownsample(const float *image, float *outImage, int w, int h, int c)
+{
+	assert(c > 0 && c <= 4);
+	for (int y = 0; y < h / 2; y++)
+	{
+		for (int x = 0; x < w / 2; x++)
+		{
+			int p0 = 2 * (y * w + x) * c;
+			int p1 = p0 + w * c;
+			int valid[2][2] = {0};
+			float sums[4] = {0};
+			for (int i = 0; i < c; i++)
+			{
+				valid[0][0] |= image[p0     + i] != 0.0f ? 1 : 0;
+				valid[0][1] |= image[p0 + c + i] != 0.0f ? 1 : 0;
+				valid[1][0] |= image[p1     + i] != 0.0f ? 1 : 0;
+				valid[1][1] |= image[p1 + c + i] != 0.0f ? 1 : 0;
+				sums[i] += image[p0 + i] + image[p0 + c + i] + image[p1 + i] + image[p1 + c + i];
+			}
+			int n = valid[0][0] + valid[0][1] + valid[1][0] + valid[1][1];
+			int p = (y * w / 2 + x) * c;
+			for (int i = 0; i < c; i++)
+				outImage[p + i] = n ? sums[i] / n : 0.0f;
+		}
+	}
+}
+
+void lmImageFtoUB(const float *image, unsigned char *outImage, int w, int h, int c, float max)
+{
+	assert(c > 0);
+	float scale = 255.0f / (max != 0.0f ? max : lmImageMax(image, w, h, c, LM_ALL_CHANNELS));
+	for (int i = 0; i < w * h * c; i++)
+		outImage[i] = (unsigned char)lm_minf(lm_maxf(image[i] * scale, 0.0f), 255.0f);
+}
+
+// TGA output helpers
+static void lm_swapRandBub(unsigned char *image, int w, int h, int c)
+{
+	assert(c >= 3);
+	for (int i = 0; i < w * h * c; i += c)
+		LM_SWAP(unsigned char, image[i], image[i + 2]);
+}
+
+lm_bool lmImageSaveTGAub(const char *filename, const unsigned char *image, int w, int h, int c)
+{
+	assert(c == 1 || c == 3 || c == 4);
+	lm_bool isGreyscale = c == 1;
+	lm_bool hasAlpha = c == 4;
+	unsigned char header[18] = {
+		0, 0, (unsigned char)(isGreyscale ? 3 : 2), 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		(unsigned char)(w & 0xff), (unsigned char)((w >> 8) & 0xff), (unsigned char)(h & 0xff), (unsigned char)((h >> 8) & 0xff),
+		(unsigned char)(8 * c), (unsigned char)(hasAlpha ? 8 : 0)
+	};
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+	FILE *file;
+	if (fopen_s(&file, filename, "wb") != 0) return LM_FALSE;
+#else
+	FILE *file = fopen(filename, "wb");
+	if (!file) return LM_FALSE;
+#endif
+	fwrite(header, 1, sizeof(header), file);
+
+	// we make sure to swap it back! trust me. :)
+	if (!isGreyscale)
+		lm_swapRandBub((unsigned char*)image, w, h, c);
+	fwrite(image, 1, w * h * c , file);
+	if (!isGreyscale)
+		lm_swapRandBub((unsigned char*)image, w, h, c);
+
+	fclose(file);
+	return LM_TRUE;
+}
+
+lm_bool lmImageSaveTGAf(const char *filename, const float *image, int w, int h, int c, float max)
+{
+	unsigned char *temp = (unsigned char*)LM_CALLOC(w * h * c, sizeof(unsigned char));
+	lmImageFtoUB(image, temp, w, h, c, max);
+	lm_bool success = lmImageSaveTGAub(filename, temp, w, h, c);
+	LM_FREE(temp);
+	return success;
+}
+
+#endif // LIGHTMAPPER_IMPLEMENTATION
+#line 0
+
 #endif // V4K_3RD
 /* game framework.
 *  - rlyeh, public domain
@@ -358986,17 +360759,23 @@ void *gui_userdata() {
     return last_skin->userdata;
 }
 
-vec2 gui_getskinsize(const char *skin) {
+vec2 gui_getskinsize(const char *skin, const char *fallback) {
     vec2 size={0};
-    if (last_skin->getskinsize) last_skin->getskinsize(last_skin->userdata, skin, &size);
+    if (last_skin->getskinsize) last_skin->getskinsize(last_skin->userdata, skin, fallback, &size);
     return size;
 }
 
-bool gui_ismouseinrect(const char *skin, vec4 rect) {
-    if (last_skin->ismouseinrect) return last_skin->ismouseinrect(last_skin->userdata, skin, rect);
+bool gui_ismouseinrect(const char *skin, const char *fallback, vec4 rect) {
+    if (last_skin->ismouseinrect) return last_skin->ismouseinrect(last_skin->userdata, skin, fallback, rect);
     return false; 
 }
 
+vec4 gui_getscissorrect(const char *skin, const char *fallback, vec4 rect) {
+    vec4 scissor = rect;
+    if (last_skin->getscissorrect) last_skin->getscissorrect(last_skin->userdata, skin, fallback, rect, &scissor);
+    return scissor;
+}
+
 static
 gui_state_t *gui_getstate(int id) {
     if (!ctl_states) map_init(ctl_states, less_int, hash_int);
@@ -359006,8 +360785,8 @@ gui_state_t *gui_getstate(int id) {
 void gui_panel_id(int id, vec4 rect, const char *skin) {
     (void)id;
     vec4 scissor={0, 0, window_width(), window_height()};
-    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, skin, rect);
-    if (last_skin->getscissorrect) last_skin->getscissorrect(last_skin->userdata, skin, rect, &scissor);
+    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, skin, NULL, rect);
+    scissor = gui_getscissorrect(skin, NULL, rect);
 
     if (!array_count(scissor_rects))
         glEnable(GL_SCISSOR_TEST);
@@ -359032,7 +360811,7 @@ bool gui_button_id(int id, vec4 r, const char *skin) {
 
     skin=skin?skin:"button";
     char *btn = va("%s%s", skin, entry->held?"_press":entry->hover?"_hover":"");
-    if (gui_ismouseinrect(btn, r)) {
+    if (gui_ismouseinrect(btn, skin, r)) {
         if (input_up(MOUSE_L) && entry->held) {
             was_clicked=1;
         }
@@ -359042,24 +360821,27 @@ bool gui_button_id(int id, vec4 r, const char *skin) {
             entry->hover = true;
         }
     }
-    else if (input_up(MOUSE_L) && entry->held) {
-        entry->held = false;
-        any_widget_used = false;
-    }
     else {
         entry->hover = false;
     }
 
-    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, btn, r);
+    if (input_up(MOUSE_L) && entry->held) {
+        entry->held = false;
+        any_widget_used = false;
+    }
+
+    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, btn, skin, r);
 
     return was_clicked;
 }
 
 bool gui_button_label_id(int id, const char *text, vec4 rect, const char *skin) {
+    gui_state_t *entry = gui_getstate(id);
     bool state = gui_button_id(id, rect, skin);
     vec2 buttonsize={0};
     skin=skin?skin:"button";
-    if (last_skin->getskinsize) last_skin->getskinsize(last_skin->userdata, skin, &buttonsize);
+    char *btn = va("%s%s", skin, entry->held?"_press":entry->hover?"_hover":"");
+    buttonsize = gui_getskinsize(btn, skin);
 
     vec2 textsize = font_rect(text);
     vec2 pos;
@@ -359092,7 +360874,8 @@ bool gui_slider_id(int id, vec4 rect, const char *skin, float min, float max, fl
 
     skin = skin?skin:"slider";
     char *cursorskin = va("%s_cursor%s", skin, entry->held?"_press":entry->hover?"_hover":"");
-    if (gui_ismouseinrect(skin, rect) && !any_widget_used) {
+    char *fbcursor = va("%s_cursor", skin);
+    if (gui_ismouseinrect(skin, NULL, rect) && !any_widget_used) {
         any_widget_used = entry->held = input_held(MOUSE_L);
         entry->hover = true;
     }
@@ -359105,13 +360888,12 @@ bool gui_slider_id(int id, vec4 rect, const char *skin, float min, float max, fl
     }
 
     float old_value = *value;
-    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, skin, rect);
+    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, skin, NULL, rect);
 
     vec2 slidersize={0}, cursorsize={0};
-    vec4 usablerect=rect;
-    if (last_skin->getscissorrect) last_skin->getscissorrect(last_skin->userdata, skin, rect, &usablerect);
-    if (last_skin->getskinsize) last_skin->getskinsize(last_skin->userdata, skin, &slidersize);
-    if (last_skin->getskinsize) last_skin->getskinsize(last_skin->userdata, cursorskin, &cursorsize);
+    vec4 usablerect = gui_getscissorrect(skin, NULL, rect);
+    slidersize = gui_getskinsize(skin, NULL);
+    cursorsize = gui_getskinsize(cursorskin, fbcursor);
     if (entry->held) {
         *value = posx2slider(usablerect, min, max, input(MOUSE_X), step);
     }
@@ -359122,7 +360904,7 @@ bool gui_slider_id(int id, vec4 rect, const char *skin, float min, float max, fl
     cursorrect.y += cursorpos.y;
     cursorrect.z = cursorsize.x;
     cursorrect.w = cursorsize.y;
-    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, cursorskin, cursorrect);
+    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, cursorskin, fbcursor, cursorrect);
 
     return entry->held && (old_value!=*value);
 }
@@ -359131,7 +360913,7 @@ bool gui_slider_label_id(int id, const char *text, vec4 rect, const char *skin,
     bool state = gui_slider_id(id, rect, skin, min, max, step, value);
     vec2 slidersize={0};
     skin=skin?skin:"slider";
-    if (last_skin->getskinsize) last_skin->getskinsize(last_skin->userdata, skin, &slidersize);
+    slidersize = gui_getskinsize(skin, NULL);
 
     vec2 textsize = font_rect(text);
     vec2 pos;
@@ -359144,7 +360926,7 @@ bool gui_slider_label_id(int id, const char *text, vec4 rect, const char *skin,
 
 void gui_rect_id(int id, vec4 r, const char *skin) {
     (void)id;
-    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, skin, r);
+    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, skin, NULL, r);
 }
 
 void gui_label_id(int id, const char *text, vec4 rect) {
@@ -359168,7 +360950,7 @@ atlas_slice_frame_t *skinned_getsliceframe(atlas_t *a, const char *name) {
     for (int i = 0; i < array_count(a->slices); i++) 
         if (!strcmp(quark_string(&a->db, a->slices[i].name), name))
             return &a->slice_frames[a->slices[i].frames[0]];
-    PRINTF("slice name: '%s' is missing in atlas!\n", name);
+    // PRINTF("slice name: '%s' is missing in atlas!\n", name);
     return NULL;
 }
 
@@ -359179,9 +360961,10 @@ void skinned_draw_missing_rect(vec4 r) {
 }
 
 static
-bool skinned_ismouseinrect(void *userdata, const char *skin, vec4 r) {
+bool skinned_ismouseinrect(void *userdata, const char *skin, const char *fallback, vec4 r) {
     skinned_t *a = C_CAST(skinned_t*, userdata);
     atlas_slice_frame_t *f = skinned_getsliceframe(&a->atlas, skin);
+    if (!f && fallback) f = skinned_getsliceframe(&a->atlas, fallback);
     if (!f) return false;
 
     vec4 outer = f->bounds;
@@ -359264,18 +361047,20 @@ void skinned_draw_sprite(float scale, atlas_t *a, atlas_slice_frame_t *f, vec4 r
 }
 
 static
-void skinned_draw_rect(void* userdata, const char *skin, vec4 r) {
+void skinned_draw_rect(void* userdata, const char *skin, const char *fallback, vec4 r) {
     skinned_t *a = C_CAST(skinned_t*, userdata);
 
     atlas_slice_frame_t *f = skinned_getsliceframe(&a->atlas, skin);
+    if (!f && fallback) f = skinned_getsliceframe(&a->atlas, fallback);
     if (!f) skinned_draw_missing_rect(r);
     else skinned_draw_sprite(a->scale, &a->atlas, f, r);
 }
 
-void skinned_getskinsize(void *userdata, const char *skin, vec2 *size) {
+void skinned_getskinsize(void *userdata, const char *skin, const char *fallback, vec2 *size) {
     skinned_t *a = C_CAST(skinned_t*, userdata);
 
     atlas_slice_frame_t *f = skinned_getsliceframe(&a->atlas, skin);
+    if (!f && fallback) f = skinned_getsliceframe(&a->atlas, fallback);
     if (f) {
         size->x = (f->bounds.z-f->bounds.x)*a->scale;
         size->y = (f->bounds.w-f->bounds.y)*a->scale;
@@ -359283,9 +361068,10 @@ void skinned_getskinsize(void *userdata, const char *skin, vec2 *size) {
 }
 
 static
-void skinned_getscissorrect(void* userdata, const char *skin, vec4 rect, vec4 *dims) {
+void skinned_getscissorrect(void* userdata, const char *skin, const char *fallback, vec4 rect, vec4 *dims) {
     skinned_t *a = C_CAST(skinned_t*, userdata);
     atlas_slice_frame_t *f = skinned_getsliceframe(&a->atlas, skin);
+    if (!f && fallback) f = skinned_getsliceframe(&a->atlas, fallback);
     if (!f) return;
 
     *dims = rect;
diff --git a/engine/split/3rd_lightmapper.h b/engine/split/3rd_lightmapper.h
new file mode 100644
index 0000000..671db74
--- /dev/null
+++ b/engine/split/3rd_lightmapper.h
@@ -0,0 +1,1761 @@
+/***********************************************************
+* A single header file OpenGL lightmapping library         *
+* https://github.com/ands/lightmapper                      *
+* no warranty implied | use at your own risk               *
+* author: Andreas Mantler (ands) | last change: 10.05.2018 *
+*                                                          *
+* License:                                                 *
+* This software is in the public domain.                   *
+* Where that dedication is not recognized,                 *
+* you are granted a perpetual, irrevocable license to copy *
+* and modify this file however you want.                   *
+***********************************************************/
+
+#ifndef LIGHTMAPPER_H
+#define LIGHTMAPPER_H
+
+#ifdef __cplusplus
+#define LM_DEFAULT_VALUE(value) = value
+#else
+#define LM_DEFAULT_VALUE(value)
+#endif
+
+#ifndef LM_CALLOC
+#define LM_CALLOC(count, size) calloc(count, size)
+#endif
+
+#ifndef LM_FREE
+#define LM_FREE(ptr) free(ptr)
+#endif
+
+typedef int lm_bool;
+#define LM_FALSE 0
+#define LM_TRUE  1
+
+typedef int lm_type;
+#define LM_NONE           0
+#define LM_UNSIGNED_BYTE  GL_UNSIGNED_BYTE
+#define LM_UNSIGNED_SHORT GL_UNSIGNED_SHORT
+#define LM_UNSIGNED_INT   GL_UNSIGNED_INT
+#define LM_FLOAT          GL_FLOAT
+
+typedef struct lm_context lm_context;
+
+// creates a lightmapper instance. it can be used to render multiple lightmaps.
+lm_context *lmCreate(
+	int hemisphereSize,                                                                                // hemisphereSize: resolution of the hemisphere renderings. must be a power of two! typical: 64.
+	float zNear, float zFar,                                                                           // zNear/zFar: hemisphere min/max draw distances.
+	float clearR, float clearG, float clearB,                                                          // clear color / background color / sky color.
+	int interpolationPasses, float interpolationThreshold,                                             // passes: hierarchical selective interpolation passes (0-8; initial step size = 2^passes).
+	                                                                                                   // threshold: error value below which lightmap pixels are interpolated instead of rendered.
+	                                                                                                   // use output image from LM_DEBUG_INTERPOLATION to determine a good value.
+	                                                                                                   // values around and below 0.01 are probably ok.
+	                                                                                                   // the lower the value, the more hemispheres are rendered -> slower, but possibly better quality.
+	float cameraToSurfaceDistanceModifier LM_DEFAULT_VALUE(0.0f));                                     // modifier for the height of the rendered hemispheres above the surface
+	                                                                                                   // -1.0f => stick to surface, 0.0f => minimum height for interpolated surface normals,
+	                                                                                                   // > 0.0f => improves gradients on surfaces with interpolated normals due to the flat surface horizon,
+	                                                                                                   // but may introduce other artifacts.
+
+// optional: set material characteristics by specifying cos(theta)-dependent weights for incoming light.
+typedef float (*lm_weight_func)(float cos_theta, void *userdata);
+void lmSetHemisphereWeights(lm_context *ctx, lm_weight_func f, void *userdata);                        // precalculates weights for incoming light depending on its angle. (default: all weights are 1.0f)
+
+// specify an output lightmap image buffer with w * h * c * sizeof(float) bytes of memory.
+void lmSetTargetLightmap(lm_context *ctx, float *outLightmap, int w, int h, int c);                    // output HDR lightmap (linear 32bit float channels; c: 1->Greyscale, 2->Greyscale+Alpha, 3->RGB, 4->RGBA).
+
+// set the geometry to map to the currently set target lightmap (set the target lightmap before calling this!).
+void lmSetGeometry(lm_context *ctx,
+	const float *transformationMatrix,                                                                 // 4x4 object-to-world transform for the geometry or NULL (no transformation).
+	lm_type positionsType, const void *positionsXYZ, int positionsStride,                              // triangle mesh in object space.
+	lm_type normalsType, const void *normalsXYZ, int normalsStride,                                    // optional normals for the mesh in object space (Use LM_NONE type in case you only need flat surfaces).
+	lm_type lightmapCoordsType, const void *lightmapCoordsUV, int lightmapCoordsStride,                // lightmap atlas texture coordinates for the mesh [0..1]x[0..1] (integer types are normalized to 0..1 range).
+	int count, lm_type indicesType LM_DEFAULT_VALUE(LM_NONE), const void *indices LM_DEFAULT_VALUE(0));// if mesh indices are used, count = number of indices else count = number of vertices.
+
+
+// as long as lmBegin returns true, the scene has to be rendered with the
+// returned camera and view parameters to the currently bound framebuffer.
+// if lmBegin returns true, it must be followed by lmEnd after rendering!
+lm_bool lmBegin(lm_context *ctx,
+	int* outViewport4,                                                                                 // output of the current viewport: { x, y, w, h }. use these to call glViewport()!
+	float* outView4x4,                                                                                 // output of the current camera view matrix.
+	float* outProjection4x4);                                                                          // output of the current camera projection matrix.
+
+float lmProgress(lm_context *ctx);                                                                     // should only be called between lmBegin/lmEnd!
+                                                                                                       // provides the light mapping progress as a value increasing from 0.0 to 1.0.
+
+void lmEnd(lm_context *ctx);
+
+// destroys the lightmapper instance. should be called to free resources.
+void lmDestroy(lm_context *ctx);
+
+
+// image based post processing (c is the number of color channels in the image, m a channel mask for the operation)
+#define LM_ALL_CHANNELS 0x0f
+float lmImageMin(const float *image, int w, int h, int c, int m LM_DEFAULT_VALUE(LM_ALL_CHANNELS));                    // find the minimum value (across the specified channels)
+float lmImageMax(const float *image, int w, int h, int c, int m LM_DEFAULT_VALUE(LM_ALL_CHANNELS));                    // find the maximum value (across the specified channels)
+void lmImageAdd(float *image, int w, int h, int c, float value, int m LM_DEFAULT_VALUE(LM_ALL_CHANNELS));              // in-place add to the specified channels
+void lmImageScale(float *image, int w, int h, int c, float factor, int m LM_DEFAULT_VALUE(LM_ALL_CHANNELS));           // in-place scaling of the specified channels
+void lmImagePower(float *image, int w, int h, int c, float exponent, int m LM_DEFAULT_VALUE(LM_ALL_CHANNELS));         // in-place powf(v, exponent) of the specified channels (for gamma)
+void lmImageDilate(const float *image, float *outImage, int w, int h, int c);                                          // widen the populated non-zero areas by 1 pixel.
+void lmImageSmooth(const float *image, float *outImage, int w, int h, int c);                                          // simple box filter on only the non-zero values.
+void lmImageDownsample(const float *image, float *outImage, int w, int h, int c);                                      // downsamples [0..w]x[0..h] to [0..w/2]x[0..h/2] by avereging only the non-zero values
+void lmImageFtoUB(const float *image, unsigned char *outImage, int w, int h, int c, float max LM_DEFAULT_VALUE(0.0f)); // casts a floating point image to an 8bit/channel image
+
+// TGA file output helpers
+lm_bool lmImageSaveTGAub(const char *filename, const unsigned char *image, int w, int h, int c);
+lm_bool lmImageSaveTGAf(const char *filename, const float *image, int w, int h, int c, float max LM_DEFAULT_VALUE(0.0f));
+
+#endif
+////////////////////// END OF HEADER //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+#ifdef LIGHTMAPPER_IMPLEMENTATION
+#undef LIGHTMAPPER_IMPLEMENTATION
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <float.h>
+#include <assert.h>
+#include <limits.h>
+
+#define LM_SWAP(type, a, b) { type tmp = (a); (a) = (b); (b) = tmp; }
+
+#if defined(_MSC_VER) && !defined(__cplusplus)
+#define inline __inline
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER <= 1700)
+static inline lm_bool lm_finite(float a) { return _finite(a); }
+#else
+static inline lm_bool lm_finite(float a) { return isfinite(a); }
+#endif
+
+static inline int      lm_mini      (int     a, int     b) { return a < b ? a : b; }
+static inline int      lm_maxi      (int     a, int     b) { return a > b ? a : b; }
+static inline int      lm_absi      (int     a           ) { return a < 0 ? -a : a; }
+static inline float    lm_minf      (float   a, float   b) { return a < b ? a : b; }
+static inline float    lm_maxf      (float   a, float   b) { return a > b ? a : b; }
+static inline float    lm_absf      (float   a           ) { return a < 0.0f ? -a : a; }
+static inline float    lm_pmodf     (float   a, float   b) { return (a < 0.0f ? 1.0f : 0.0f) + (float)fmod(a, b); } // positive mod
+
+typedef struct lm_ivec2 { int x, y; } lm_ivec2;
+static inline lm_ivec2 lm_i2        (int     x, int     y) { lm_ivec2 v = { x, y }; return v; }
+
+typedef struct lm_vec2 { float x, y; } lm_vec2;
+static inline lm_vec2  lm_v2i       (int     x, int     y) { lm_vec2 v = { (float)x, (float)y }; return v; }
+static inline lm_vec2  lm_v2        (float   x, float   y) { lm_vec2 v = { x, y }; return v; }
+static inline lm_vec2  lm_negate2   (lm_vec2 a           ) { return lm_v2(-a.x, -a.y); }
+static inline lm_vec2  lm_add2      (lm_vec2 a, lm_vec2 b) { return lm_v2(a.x + b.x, a.y + b.y); }
+static inline lm_vec2  lm_sub2      (lm_vec2 a, lm_vec2 b) { return lm_v2(a.x - b.x, a.y - b.y); }
+static inline lm_vec2  lm_mul2      (lm_vec2 a, lm_vec2 b) { return lm_v2(a.x * b.x, a.y * b.y); }
+static inline lm_vec2  lm_scale2    (lm_vec2 a, float   b) { return lm_v2(a.x * b, a.y * b); }
+static inline lm_vec2  lm_div2      (lm_vec2 a, float   b) { return lm_scale2(a, 1.0f / b); }
+static inline lm_vec2  lm_pmod2     (lm_vec2 a, float   b) { return lm_v2(lm_pmodf(a.x, b), lm_pmodf(a.y, b)); }
+static inline lm_vec2  lm_min2      (lm_vec2 a, lm_vec2 b) { return lm_v2(lm_minf(a.x, b.x), lm_minf(a.y, b.y)); }
+static inline lm_vec2  lm_max2      (lm_vec2 a, lm_vec2 b) { return lm_v2(lm_maxf(a.x, b.x), lm_maxf(a.y, b.y)); }
+static inline lm_vec2  lm_abs2      (lm_vec2 a           ) { return lm_v2(lm_absf(a.x), lm_absf(a.y)); }
+static inline lm_vec2  lm_floor2    (lm_vec2 a           ) { return lm_v2(floorf(a.x), floorf(a.y)); }
+static inline lm_vec2  lm_ceil2     (lm_vec2 a           ) { return lm_v2(ceilf (a.x), ceilf (a.y)); }
+static inline float    lm_dot2      (lm_vec2 a, lm_vec2 b) { return a.x * b.x + a.y * b.y; }
+static inline float    lm_cross2    (lm_vec2 a, lm_vec2 b) { return a.x * b.y - a.y * b.x; } // pseudo cross product
+static inline float    lm_length2sq (lm_vec2 a           ) { return a.x * a.x + a.y * a.y; }
+static inline float    lm_length2   (lm_vec2 a           ) { return sqrtf(lm_length2sq(a)); }
+static inline lm_vec2  lm_normalize2(lm_vec2 a           ) { return lm_div2(a, lm_length2(a)); }
+static inline lm_bool  lm_finite2   (lm_vec2 a           ) { return lm_finite(a.x) && lm_finite(a.y); }
+
+typedef struct lm_vec3 { float x, y, z; } lm_vec3;
+static inline lm_vec3  lm_v3        (float   x, float   y, float   z) { lm_vec3 v = { x, y, z }; return v; }
+static inline lm_vec3  lm_negate3   (lm_vec3 a           ) { return lm_v3(-a.x, -a.y, -a.z); }
+static inline lm_vec3  lm_add3      (lm_vec3 a, lm_vec3 b) { return lm_v3(a.x + b.x, a.y + b.y, a.z + b.z); }
+static inline lm_vec3  lm_sub3      (lm_vec3 a, lm_vec3 b) { return lm_v3(a.x - b.x, a.y - b.y, a.z - b.z); }
+static inline lm_vec3  lm_mul3      (lm_vec3 a, lm_vec3 b) { return lm_v3(a.x * b.x, a.y * b.y, a.z * b.z); }
+static inline lm_vec3  lm_scale3    (lm_vec3 a, float   b) { return lm_v3(a.x * b, a.y * b, a.z * b); }
+static inline lm_vec3  lm_div3      (lm_vec3 a, float   b) { return lm_scale3(a, 1.0f / b); }
+static inline lm_vec3  lm_pmod3     (lm_vec3 a, float   b) { return lm_v3(lm_pmodf(a.x, b), lm_pmodf(a.y, b), lm_pmodf(a.z, b)); }
+static inline lm_vec3  lm_min3      (lm_vec3 a, lm_vec3 b) { return lm_v3(lm_minf(a.x, b.x), lm_minf(a.y, b.y), lm_minf(a.z, b.z)); }
+static inline lm_vec3  lm_max3      (lm_vec3 a, lm_vec3 b) { return lm_v3(lm_maxf(a.x, b.x), lm_maxf(a.y, b.y), lm_maxf(a.z, b.z)); }
+static inline lm_vec3  lm_abs3      (lm_vec3 a           ) { return lm_v3(lm_absf(a.x), lm_absf(a.y), lm_absf(a.z)); }
+static inline lm_vec3  lm_floor3    (lm_vec3 a           ) { return lm_v3(floorf(a.x), floorf(a.y), floorf(a.z)); }
+static inline lm_vec3  lm_ceil3     (lm_vec3 a           ) { return lm_v3(ceilf (a.x), ceilf (a.y), ceilf (a.z)); }
+static inline float    lm_dot3      (lm_vec3 a, lm_vec3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; }
+static inline lm_vec3  lm_cross3    (lm_vec3 a, lm_vec3 b) { return lm_v3(a.y * b.z - b.y * a.z, a.z * b.x - b.z * a.x, a.x * b.y - b.x * a.y); }
+static inline float    lm_length3sq (lm_vec3 a           ) { return a.x * a.x + a.y * a.y + a.z * a.z; }
+static inline float    lm_length3   (lm_vec3 a           ) { return sqrtf(lm_length3sq(a)); }
+static inline lm_vec3  lm_normalize3(lm_vec3 a           ) { return lm_div3(a, lm_length3(a)); }
+static inline lm_bool  lm_finite3   (lm_vec3 a           ) { return lm_finite(a.x) && lm_finite(a.y) && lm_finite(a.z); }
+
+static lm_vec2 lm_toBarycentric(lm_vec2 p1, lm_vec2 p2, lm_vec2 p3, lm_vec2 p)
+{
+	// http://www.blackpawn.com/texts/pointinpoly/
+	// Compute vectors
+	lm_vec2 v0 = lm_sub2(p3, p1);
+	lm_vec2 v1 = lm_sub2(p2, p1);
+	lm_vec2 v2 = lm_sub2(p, p1);
+	// Compute dot products
+	float dot00 = lm_dot2(v0, v0);
+	float dot01 = lm_dot2(v0, v1);
+	float dot02 = lm_dot2(v0, v2);
+	float dot11 = lm_dot2(v1, v1);
+	float dot12 = lm_dot2(v1, v2);
+	// Compute barycentric coordinates
+	float invDenom = 1.0f / (dot00 * dot11 - dot01 * dot01);
+	float u = (dot11 * dot02 - dot01 * dot12) * invDenom;
+	float v = (dot00 * dot12 - dot01 * dot02) * invDenom;
+	return lm_v2(u, v);
+}
+
+static inline int lm_leftOf(lm_vec2 a, lm_vec2 b, lm_vec2 c)
+{
+	float x = lm_cross2(lm_sub2(b, a), lm_sub2(c, b));
+	return x < 0 ? -1 : x > 0;
+}
+
+static lm_bool lm_lineIntersection(lm_vec2 x0, lm_vec2 x1, lm_vec2 y0, lm_vec2 y1, lm_vec2* res)
+{
+	lm_vec2 dx = lm_sub2(x1, x0);
+	lm_vec2 dy = lm_sub2(y1, y0);
+	lm_vec2 d = lm_sub2(x0, y0);
+	float dyx = lm_cross2(dy, dx);
+	if (dyx == 0.0f)
+		return LM_FALSE;
+	dyx = lm_cross2(d, dx) / dyx;
+	if (dyx <= 0 || dyx >= 1)
+		return LM_FALSE;
+	res->x = y0.x + dyx * dy.x;
+	res->y = y0.y + dyx * dy.y;
+	return LM_TRUE;
+}
+
+// this modifies the poly array! the poly array must be big enough to hold the result!
+// res must be big enough to hold the result!
+static int lm_convexClip(lm_vec2 *poly, int nPoly, const lm_vec2 *clip, int nClip, lm_vec2 *res)
+{
+	int nRes = nPoly;
+	int dir = lm_leftOf(clip[0], clip[1], clip[2]);
+	for (int i = 0, j = nClip - 1; i < nClip && nRes; j = i++)
+	{
+		if (i != 0)
+			for (nPoly = 0; nPoly < nRes; nPoly++)
+				poly[nPoly] = res[nPoly];
+		nRes = 0;
+		lm_vec2 v0 = poly[nPoly - 1];
+		int side0 = lm_leftOf(clip[j], clip[i], v0);
+		if (side0 != -dir)
+			res[nRes++] = v0;
+		for (int k = 0; k < nPoly; k++)
+		{
+			lm_vec2 v1 = poly[k], x;
+			int side1 = lm_leftOf(clip[j], clip[i], v1);
+			if (side0 + side1 == 0 && side0 && lm_lineIntersection(clip[j], clip[i], v0, v1, &x))
+				res[nRes++] = x;
+			if (k == nPoly - 1)
+				break;
+			if (side1 != -dir)
+				res[nRes++] = v1;
+			v0 = v1;
+			side0 = side1;
+		}
+	}
+
+	return nRes;
+}
+
+struct lm_context
+{
+	struct
+	{
+		const float *modelMatrix;
+		float normalMatrix[9];
+
+		const unsigned char *positions;
+		lm_type positionsType;
+		int positionsStride;
+		const unsigned char *normals;
+		lm_type normalsType;
+		int normalsStride;
+		const unsigned char *uvs;
+		lm_type uvsType;
+		int uvsStride;
+		const unsigned char *indices;
+		lm_type indicesType;
+		unsigned int count;
+	} mesh;
+
+	struct
+	{
+		int pass;
+		int passCount;
+
+		struct
+		{
+			unsigned int baseIndex;
+			lm_vec3 p[3];
+			lm_vec3 n[3];
+			lm_vec2 uv[3];
+		} triangle;
+
+		struct
+		{
+			int minx, miny;
+			int maxx, maxy;
+			int x, y;
+		} rasterizer;
+
+		struct
+		{
+			lm_vec3 position;
+			lm_vec3 direction;
+			lm_vec3 up;
+		} sample;
+
+		struct
+		{
+			int side;
+		} hemisphere;
+	} meshPosition;
+
+	struct
+	{
+		int width;
+		int height;
+		int channels;
+		float *data;
+
+#ifdef LM_DEBUG_INTERPOLATION
+		unsigned char *debug;
+#endif
+	} lightmap;
+
+	struct
+	{
+		unsigned int size;
+		float zNear, zFar;
+		float cameraToSurfaceDistanceModifier;
+		struct { float r, g, b; } clearColor;
+
+		unsigned int fbHemiCountX;
+		unsigned int fbHemiCountY;
+		unsigned int fbHemiIndex;
+		lm_ivec2 *fbHemiToLightmapLocation;
+		GLuint fbTexture[2];
+		GLuint fb[2];
+		GLuint fbDepth;
+		GLuint vao;
+		struct
+		{
+			GLuint programID;
+			GLuint hemispheresTextureID;
+			GLuint weightsTextureID;
+			GLuint weightsTexture;
+		} firstPass;
+		struct
+		{
+			GLuint programID;
+			GLuint hemispheresTextureID;
+		} downsamplePass;
+		struct
+		{
+			GLuint texture;
+			lm_ivec2 writePosition;
+			lm_ivec2 *toLightmapLocation;
+		} storage;
+	} hemisphere;
+
+	float interpolationThreshold;
+};
+
+// pass order of one 4x4 interpolation patch for two interpolation steps (and the next neighbors right of/below it)
+// 0 4 1 4 0
+// 5 6 5 6 5
+// 2 4 3 4 2
+// 5 6 5 6 5
+// 0 4 1 4 0
+
+static unsigned int lm_passStepSize(lm_context *ctx)
+{
+	unsigned int shift = ctx->meshPosition.passCount / 3 - (ctx->meshPosition.pass - 1) / 3;
+	unsigned int step = (1 << shift);
+	assert(step > 0);
+	return step;
+}
+
+static unsigned int lm_passOffsetX(lm_context *ctx)
+{
+	if (!ctx->meshPosition.pass)
+		return 0;
+	int passType = (ctx->meshPosition.pass - 1) % 3;
+	unsigned int halfStep = lm_passStepSize(ctx) >> 1;
+	return passType != 1 ? halfStep : 0;
+}
+
+static unsigned int lm_passOffsetY(lm_context *ctx)
+{
+	if (!ctx->meshPosition.pass)
+		return 0;
+	int passType = (ctx->meshPosition.pass - 1) % 3;
+	unsigned int halfStep = lm_passStepSize(ctx) >> 1;
+	return passType != 0 ? halfStep : 0;
+}
+
+static lm_bool lm_hasConservativeTriangleRasterizerFinished(lm_context *ctx)
+{
+	return ctx->meshPosition.rasterizer.y >= ctx->meshPosition.rasterizer.maxy;
+}
+
+static void lm_moveToNextPotentialConservativeTriangleRasterizerPosition(lm_context *ctx)
+{
+	unsigned int step = lm_passStepSize(ctx);
+	ctx->meshPosition.rasterizer.x += step;
+	while (ctx->meshPosition.rasterizer.x >= ctx->meshPosition.rasterizer.maxx)
+	{
+		ctx->meshPosition.rasterizer.x = ctx->meshPosition.rasterizer.minx + lm_passOffsetX(ctx);
+		ctx->meshPosition.rasterizer.y += step;
+		if (lm_hasConservativeTriangleRasterizerFinished(ctx))
+			break;
+	}
+}
+
+static float *lm_getLightmapPixel(lm_context *ctx, int x, int y)
+{
+	assert(x >= 0 && x < ctx->lightmap.width && y >= 0 && y < ctx->lightmap.height);
+	return ctx->lightmap.data + (y * ctx->lightmap.width + x) * ctx->lightmap.channels;
+}
+
+static void lm_setLightmapPixel(lm_context *ctx, int x, int y, float *in)
+{
+	assert(x >= 0 && x < ctx->lightmap.width && y >= 0 && y < ctx->lightmap.height);
+	float *p = ctx->lightmap.data + (y * ctx->lightmap.width + x) * ctx->lightmap.channels;
+	for (int j = 0; j < ctx->lightmap.channels; j++)
+		*p++ = *in++;
+}
+
+#define lm_baseAngle 0.1f
+static const float lm_baseAngles[3][3] = {
+	{ lm_baseAngle, lm_baseAngle + 1.0f / 3.0f, lm_baseAngle + 2.0f / 3.0f },
+	{ lm_baseAngle + 1.0f / 3.0f, lm_baseAngle + 2.0f / 3.0f, lm_baseAngle },
+	{ lm_baseAngle + 2.0f / 3.0f, lm_baseAngle, lm_baseAngle + 1.0f / 3.0f }
+};
+
+static lm_bool lm_trySamplingConservativeTriangleRasterizerPosition(lm_context *ctx)
+{
+	if (lm_hasConservativeTriangleRasterizerFinished(ctx))
+		return LM_FALSE;
+
+	// check if lightmap pixel was already set
+	float *pixelValue = lm_getLightmapPixel(ctx, ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y);
+	for (int j = 0; j < ctx->lightmap.channels; j++)
+		if (pixelValue[j] != 0.0f)
+			return LM_FALSE;
+
+	// try calculating centroid by clipping the pixel against the triangle
+	lm_vec2 pixel[16];
+	pixel[0] = lm_v2i(ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y);
+	pixel[1] = lm_v2i(ctx->meshPosition.rasterizer.x + 1, ctx->meshPosition.rasterizer.y);
+	pixel[2] = lm_v2i(ctx->meshPosition.rasterizer.x + 1, ctx->meshPosition.rasterizer.y + 1);
+	pixel[3] = lm_v2i(ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y + 1);
+
+	lm_vec2 res[16];
+	int nRes = lm_convexClip(pixel, 4, ctx->meshPosition.triangle.uv, 3, res);
+	if (nRes == 0)
+		return LM_FALSE; // nothing left
+
+	// calculate centroid position and area
+	lm_vec2 centroid = res[0];
+	float area = res[nRes - 1].x * res[0].y - res[nRes - 1].y * res[0].x;
+	for (int i = 1; i < nRes; i++)
+	{
+		centroid = lm_add2(centroid, res[i]);
+		area += res[i - 1].x * res[i].y - res[i - 1].y * res[i].x;
+	}
+	centroid = lm_div2(centroid, (float)nRes);
+	area = lm_absf(area / 2.0f);
+
+	if (area <= 0.0f)
+		return LM_FALSE; // no area left
+
+	// calculate barycentric coords
+	lm_vec2 uv = lm_toBarycentric(
+		ctx->meshPosition.triangle.uv[0],
+		ctx->meshPosition.triangle.uv[1],
+		ctx->meshPosition.triangle.uv[2],
+		centroid);
+
+	if (!lm_finite2(uv))
+		return LM_FALSE; // degenerate
+
+	// try to interpolate color from neighbors:
+	if (ctx->meshPosition.pass > 0)
+	{
+		float *neighbors[4];
+		int neighborCount = 0;
+		int neighborsExpected = 0;
+		int d = (int)lm_passStepSize(ctx) / 2;
+		int dirs = ((ctx->meshPosition.pass - 1) % 3) + 1;
+		if (dirs & 1) // check x-neighbors with distance d
+		{
+			neighborsExpected += 2;
+			if (ctx->meshPosition.rasterizer.x - d >= ctx->meshPosition.rasterizer.minx &&
+				ctx->meshPosition.rasterizer.x + d <= ctx->meshPosition.rasterizer.maxx)
+			{
+				neighbors[neighborCount++] = lm_getLightmapPixel(ctx, ctx->meshPosition.rasterizer.x - d, ctx->meshPosition.rasterizer.y);
+				neighbors[neighborCount++] = lm_getLightmapPixel(ctx, ctx->meshPosition.rasterizer.x + d, ctx->meshPosition.rasterizer.y);
+			}
+		}
+		if (dirs & 2) // check y-neighbors with distance d
+		{
+			neighborsExpected += 2;
+			if (ctx->meshPosition.rasterizer.y - d >= ctx->meshPosition.rasterizer.miny &&
+				ctx->meshPosition.rasterizer.y + d <= ctx->meshPosition.rasterizer.maxy)
+			{
+				neighbors[neighborCount++] = lm_getLightmapPixel(ctx, ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y - d);
+				neighbors[neighborCount++] = lm_getLightmapPixel(ctx, ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y + d);
+			}
+		}
+		if (neighborCount == neighborsExpected) // are all interpolation neighbors available?
+		{
+			// calculate average neighbor pixel value
+			float avg[4] = { 0 };
+			for (int i = 0; i < neighborCount; i++)
+				for (int j = 0; j < ctx->lightmap.channels; j++)
+					avg[j] += neighbors[i][j];
+			float ni = 1.0f / neighborCount;
+			for (int j = 0; j < ctx->lightmap.channels; j++)
+				avg[j] *= ni;
+
+			// check if error from average pixel to neighbors is above the interpolation threshold
+			lm_bool interpolate = LM_TRUE;
+			for (int i = 0; i < neighborCount; i++)
+			{
+				lm_bool zero = LM_TRUE;
+				for (int j = 0; j < ctx->lightmap.channels; j++)
+				{
+					if (neighbors[i][j] != 0.0f)
+						zero = LM_FALSE;
+					if (fabs(neighbors[i][j] - avg[j]) > ctx->interpolationThreshold)
+						interpolate = LM_FALSE;
+				}
+				if (zero)
+					interpolate = LM_FALSE;
+				if (!interpolate)
+					break;
+			}
+
+			// set interpolated value and return if interpolation is acceptable
+			if (interpolate)
+			{
+				lm_setLightmapPixel(ctx, ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y, avg);
+#ifdef LM_DEBUG_INTERPOLATION
+				// set interpolated pixel to green in debug output
+				ctx->lightmap.debug[(ctx->meshPosition.rasterizer.y * ctx->lightmap.width + ctx->meshPosition.rasterizer.x) * 3 + 1] = 255;
+#endif
+				return LM_FALSE;
+			}
+		}
+	}
+
+	// could not interpolate. must render a hemisphere.
+	// calculate 3D sample position and orientation
+	lm_vec3 p0 = ctx->meshPosition.triangle.p[0];
+	lm_vec3 p1 = ctx->meshPosition.triangle.p[1];
+	lm_vec3 p2 = ctx->meshPosition.triangle.p[2];
+	lm_vec3 v1 = lm_sub3(p1, p0);
+	lm_vec3 v2 = lm_sub3(p2, p0);
+	ctx->meshPosition.sample.position = lm_add3(p0, lm_add3(lm_scale3(v2, uv.x), lm_scale3(v1, uv.y)));
+
+	lm_vec3 n0 = ctx->meshPosition.triangle.n[0];
+	lm_vec3 n1 = ctx->meshPosition.triangle.n[1];
+	lm_vec3 n2 = ctx->meshPosition.triangle.n[2];
+	lm_vec3 nv1 = lm_sub3(n1, n0);
+	lm_vec3 nv2 = lm_sub3(n2, n0);
+	ctx->meshPosition.sample.direction = lm_normalize3(lm_add3(n0, lm_add3(lm_scale3(nv2, uv.x), lm_scale3(nv1, uv.y))));
+	ctx->meshPosition.sample.direction = lm_normalize3(ctx->meshPosition.sample.direction);
+	float cameraToSurfaceDistance = (1.0f + ctx->hemisphere.cameraToSurfaceDistanceModifier) * ctx->hemisphere.zNear * sqrtf(2.0f);
+	ctx->meshPosition.sample.position = lm_add3(ctx->meshPosition.sample.position, lm_scale3(ctx->meshPosition.sample.direction, cameraToSurfaceDistance));
+
+	if (!lm_finite3(ctx->meshPosition.sample.position) ||
+		!lm_finite3(ctx->meshPosition.sample.direction) ||
+		lm_length3sq(ctx->meshPosition.sample.direction) < 0.5f) // don't allow 0.0f. should always be ~1.0f
+		return LM_FALSE;
+
+	lm_vec3 up = lm_v3(0.0f, 1.0f, 0.0f);
+	if (lm_absf(lm_dot3(up, ctx->meshPosition.sample.direction)) > 0.8f)
+		up = lm_v3(0.0f, 0.0f, 1.0f);
+
+#if 0
+	// triangle-consistent up vector
+	ctx->meshPosition.sample.up = lm_normalize3(lm_cross3(up, ctx->meshPosition.sample.direction));
+	return LM_TRUE;
+#else
+	// "randomized" rotation with pattern
+	lm_vec3 side = lm_normalize3(lm_cross3(up, ctx->meshPosition.sample.direction));
+	up = lm_normalize3(lm_cross3(side, ctx->meshPosition.sample.direction));
+	int rx = ctx->meshPosition.rasterizer.x % 3;
+	int ry = ctx->meshPosition.rasterizer.y % 3;
+	static const float lm_pi = 3.14159265358979f;
+	float phi = 2.0f * lm_pi * lm_baseAngles[ry][rx] + 0.1f * ((float)rand() / (float)RAND_MAX);
+	ctx->meshPosition.sample.up = lm_normalize3(lm_add3(lm_scale3(side, cosf(phi)), lm_scale3(up, sinf(phi))));
+	return LM_TRUE;
+#endif
+}
+
+// returns true if a sampling position was found and
+// false if we finished rasterizing the current triangle
+static lm_bool lm_findFirstConservativeTriangleRasterizerPosition(lm_context *ctx)
+{
+	while (!lm_trySamplingConservativeTriangleRasterizerPosition(ctx))
+	{
+		lm_moveToNextPotentialConservativeTriangleRasterizerPosition(ctx);
+		if (lm_hasConservativeTriangleRasterizerFinished(ctx))
+			return LM_FALSE;
+	}
+	return LM_TRUE;
+}
+
+static lm_bool lm_findNextConservativeTriangleRasterizerPosition(lm_context *ctx)
+{
+	lm_moveToNextPotentialConservativeTriangleRasterizerPosition(ctx);
+	return lm_findFirstConservativeTriangleRasterizerPosition(ctx);
+}
+
+static void lm_integrateHemisphereBatch(lm_context *ctx)
+{
+	if (!ctx->hemisphere.fbHemiIndex)
+		return; // nothing to do
+
+	glDisable(GL_DEPTH_TEST);
+	glBindVertexArray(ctx->hemisphere.vao);
+
+	int fbRead = 0;
+	int fbWrite = 1;
+
+	// weighted downsampling pass
+	int outHemiSize = ctx->hemisphere.size / 2;
+	glBindFramebuffer(GL_FRAMEBUFFER, ctx->hemisphere.fb[fbWrite]);
+	glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, ctx->hemisphere.fbTexture[fbWrite], 0);
+	glViewport(0, 0, outHemiSize * ctx->hemisphere.fbHemiCountX, outHemiSize * ctx->hemisphere.fbHemiCountY);
+	glUseProgram(ctx->hemisphere.firstPass.programID);
+	glUniform1i(ctx->hemisphere.firstPass.hemispheresTextureID, 0);
+	glActiveTexture(GL_TEXTURE0);
+	glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.fbTexture[fbRead]);
+	glUniform1i(ctx->hemisphere.firstPass.weightsTextureID, 1);
+	glActiveTexture(GL_TEXTURE1);
+	glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.firstPass.weightsTexture);
+	glActiveTexture(GL_TEXTURE0);
+	glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+	//glBindTexture(GL_TEXTURE_2D, 0);
+
+#if 0
+	// debug output
+	int w = outHemiSize * ctx->hemisphere.fbHemiCountX, h = outHemiSize * ctx->hemisphere.fbHemiCountY;
+	glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
+	glBindFramebuffer(GL_READ_FRAMEBUFFER, ctx->hemisphere.fb[fbWrite]);
+	glReadBuffer(GL_COLOR_ATTACHMENT0);
+	float *image = new float[3 * w * h];
+	glReadPixels(0, 0, w, h, GL_RGB, GL_FLOAT, image);
+	lmImageSaveTGAf("firstpass.png", image, w, h, 3);
+	delete[] image;
+#endif
+
+	// downsampling passes
+	glUseProgram(ctx->hemisphere.downsamplePass.programID);
+	glUniform1i(ctx->hemisphere.downsamplePass.hemispheresTextureID, 0);
+	while (outHemiSize > 1)
+	{
+		LM_SWAP(int, fbRead, fbWrite);
+		outHemiSize /= 2;
+		glBindFramebuffer(GL_FRAMEBUFFER, ctx->hemisphere.fb[fbWrite]);
+		glViewport(0, 0, outHemiSize * ctx->hemisphere.fbHemiCountX, outHemiSize * ctx->hemisphere.fbHemiCountY);
+		glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.fbTexture[fbRead]);
+		glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+		//glBindTexture(GL_TEXTURE_2D, 0);
+	}
+
+	// copy results to storage texture
+	glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.storage.texture);
+	glCopyTexSubImage2D(GL_TEXTURE_2D, 0,
+		ctx->hemisphere.storage.writePosition.x, ctx->hemisphere.storage.writePosition.y,
+		0, 0, ctx->hemisphere.fbHemiCountX, ctx->hemisphere.fbHemiCountY);
+	glBindTexture(GL_TEXTURE_2D, 0);
+	glBindFramebuffer(GL_FRAMEBUFFER, 0);
+	glBindVertexArray(0);
+	glEnable(GL_DEPTH_TEST);
+
+	// copy position mapping to storage
+	for (unsigned int y = 0; y < ctx->hemisphere.fbHemiCountY; y++)
+	{
+		int sy = ctx->hemisphere.storage.writePosition.y + y;
+		for (unsigned int x = 0; x < ctx->hemisphere.fbHemiCountX; x++)
+		{
+			int sx = ctx->hemisphere.storage.writePosition.x + x;
+			unsigned int hemiIndex = y * ctx->hemisphere.fbHemiCountX + x;
+			if (hemiIndex >= ctx->hemisphere.fbHemiIndex)
+				ctx->hemisphere.storage.toLightmapLocation[sy * ctx->lightmap.width + sx] = lm_i2(-1, -1);
+			else
+				ctx->hemisphere.storage.toLightmapLocation[sy * ctx->lightmap.width + sx] = ctx->hemisphere.fbHemiToLightmapLocation[hemiIndex];
+		}
+	}
+
+	// advance storage texture write position
+	ctx->hemisphere.storage.writePosition.x += ctx->hemisphere.fbHemiCountX;
+	if (ctx->hemisphere.storage.writePosition.x + (int)ctx->hemisphere.fbHemiCountX > ctx->lightmap.width)
+	{
+		ctx->hemisphere.storage.writePosition.x = 0;
+		ctx->hemisphere.storage.writePosition.y += ctx->hemisphere.fbHemiCountY;
+		assert(ctx->hemisphere.storage.writePosition.y + (int)ctx->hemisphere.fbHemiCountY < ctx->lightmap.height);
+	}
+
+	ctx->hemisphere.fbHemiIndex = 0;
+}
+
+static void lm_writeResultsToLightmap(lm_context *ctx)
+{
+	// do the GPU->CPU transfer of downsampled hemispheres
+	float *hemi = (float*)LM_CALLOC(ctx->lightmap.width * ctx->lightmap.height, 4 * sizeof(float));
+	glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.storage.texture);
+	glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA, GL_FLOAT, hemi);
+
+	// write results to lightmap texture
+	for (int y = 0; y < ctx->hemisphere.storage.writePosition.y + (int)ctx->hemisphere.fbHemiCountY; y++)
+	{
+		for (int x = 0; x < ctx->lightmap.width; x++)
+		{
+			lm_ivec2 lmUV = ctx->hemisphere.storage.toLightmapLocation[y * ctx->lightmap.width + x];
+			if (lmUV.x >= 0)
+			{
+				float *c = hemi + (y * ctx->lightmap.width + x) * 4;
+				float validity = c[3];
+				float *lm = ctx->lightmap.data + (lmUV.y * ctx->lightmap.width + lmUV.x) * ctx->lightmap.channels;
+				if (!lm[0] && validity > 0.9)
+				{
+					float scale = 1.0f / validity;
+					switch (ctx->lightmap.channels)
+					{
+					case 1:
+						lm[0] = lm_maxf((c[0] + c[1] + c[2]) * scale / 3.0f, FLT_MIN);
+						break;
+					case 2:
+						lm[0] = lm_maxf((c[0] + c[1] + c[2]) * scale / 3.0f, FLT_MIN);
+						lm[1] = 1.0f; // do we want to support this format?
+						break;
+					case 3:
+						lm[0] = lm_maxf(c[0] * scale, FLT_MIN);
+						lm[1] = lm_maxf(c[1] * scale, FLT_MIN);
+						lm[2] = lm_maxf(c[2] * scale, FLT_MIN);
+						break;
+					case 4:
+						lm[0] = lm_maxf(c[0] * scale, FLT_MIN);
+						lm[1] = lm_maxf(c[1] * scale, FLT_MIN);
+						lm[2] = lm_maxf(c[2] * scale, FLT_MIN);
+						lm[3] = 1.0f;
+						break;
+					default:
+						assert(LM_FALSE);
+						break;
+					}
+
+#ifdef LM_DEBUG_INTERPOLATION
+					// set sampled pixel to red in debug output
+					ctx->lightmap.debug[(lmUV.y * ctx->lightmap.width + lmUV.x) * 3 + 0] = 255;
+#endif
+				}
+			}
+			ctx->hemisphere.storage.toLightmapLocation[y * ctx->lightmap.width + x].x = -1; // reset
+		}
+	}
+
+	LM_FREE(hemi);
+	ctx->hemisphere.storage.writePosition = lm_i2(0, 0);
+}
+
+static void lm_setView(
+	int* viewport, int x, int y, int w, int h,
+	float* view,   lm_vec3 pos, lm_vec3 dir, lm_vec3 up,
+	float* proj,   float l, float r, float b, float t, float n, float f)
+{
+	// viewport
+	viewport[0] = x; viewport[1] = y; viewport[2] = w; viewport[3] = h;
+
+	// view matrix: lookAt(pos, pos + dir, up)
+	lm_vec3 side = lm_cross3(dir, up);
+	//up = cross(side, dir);
+	dir = lm_negate3(dir); pos = lm_negate3(pos);
+	view[ 0] = side.x;             view[ 1] = up.x;             view[ 2] = dir.x;             view[ 3] = 0.0f;
+	view[ 4] = side.y;             view[ 5] = up.y;             view[ 6] = dir.y;             view[ 7] = 0.0f;
+	view[ 8] = side.z;             view[ 9] = up.z;             view[10] = dir.z;             view[11] = 0.0f;
+	view[12] = lm_dot3(side, pos); view[13] = lm_dot3(up, pos); view[14] = lm_dot3(dir, pos); view[15] = 1.0f;
+
+	// projection matrix: frustum(l, r, b, t, n, f)
+	float ilr = 1.0f / (r - l), ibt = 1.0f / (t - b), ninf = -1.0f / (f - n), n2 = 2.0f * n;
+	proj[ 0] = n2 * ilr;      proj[ 1] = 0.0f;          proj[ 2] = 0.0f;           proj[ 3] = 0.0f;
+	proj[ 4] = 0.0f;          proj[ 5] = n2 * ibt;      proj[ 6] = 0.0f;           proj[ 7] = 0.0f;
+	proj[ 8] = (r + l) * ilr; proj[ 9] = (t + b) * ibt; proj[10] = (f + n) * ninf; proj[11] = -1.0f;
+	proj[12] = 0.0f;          proj[13] = 0.0f;          proj[14] = f * n2 * ninf;  proj[15] = 0.0f;
+}
+
+// returns true if a hemisphere side was prepared for rendering and
+// false if we finished the current hemisphere
+static lm_bool lm_beginSampleHemisphere(lm_context *ctx, int* viewport, float* view, float* proj)
+{
+	if (ctx->meshPosition.hemisphere.side >= 5)
+		return LM_FALSE;
+
+	if (ctx->meshPosition.hemisphere.side == 0)
+	{
+		// prepare hemisphere
+		glBindFramebuffer(GL_FRAMEBUFFER, ctx->hemisphere.fb[0]);
+		if (ctx->hemisphere.fbHemiIndex == 0)
+		{
+			// prepare hemisphere batch
+			glClearColor( // clear to valid background pixels!
+				ctx->hemisphere.clearColor.r,
+				ctx->hemisphere.clearColor.g,
+				ctx->hemisphere.clearColor.b, 1.0f);
+			glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+		}
+		ctx->hemisphere.fbHemiToLightmapLocation[ctx->hemisphere.fbHemiIndex] =
+			lm_i2(ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y);
+	}
+
+	// find the target position in the batch
+	int x = (ctx->hemisphere.fbHemiIndex % ctx->hemisphere.fbHemiCountX) * ctx->hemisphere.size * 3;
+	int y = (ctx->hemisphere.fbHemiIndex / ctx->hemisphere.fbHemiCountX) * ctx->hemisphere.size;
+
+	int size = ctx->hemisphere.size;
+	float zNear = ctx->hemisphere.zNear;
+	float zFar = ctx->hemisphere.zFar;
+
+	lm_vec3 pos = ctx->meshPosition.sample.position;
+	lm_vec3 dir = ctx->meshPosition.sample.direction;
+	lm_vec3 up = ctx->meshPosition.sample.up;
+	lm_vec3 right = lm_cross3(dir, up);
+
+	// find the view parameters of the hemisphere side that we will render next
+	// hemisphere layout in the framebuffer:
+	//       +-------+---+---+-------+
+	//       |       |   |   |   D   |
+	//       |   C   | R | L +-------+
+	//       |       |   |   |   U   |
+	//       +-------+---+---+-------+
+	switch (ctx->meshPosition.hemisphere.side)
+	{
+	case 0: // center
+		lm_setView(viewport, x, y, size, size,
+				   view,     pos, dir, up,
+				   proj,     -zNear, zNear, -zNear, zNear, zNear, zFar);
+		break;
+	case 1: // right
+		lm_setView(viewport, size + x, y, size / 2, size,
+				   view,     pos, right, up,
+				   proj,     -zNear, 0.0f, -zNear, zNear, zNear, zFar);
+		break;
+	case 2: // left
+		lm_setView(viewport, size + x + size / 2, y, size / 2, size,
+				   view,     pos, lm_negate3(right), up,
+				   proj,     0.0f, zNear, -zNear, zNear, zNear, zFar);
+		break;
+	case 3: // down
+		lm_setView(viewport, 2 * size + x, y + size / 2, size, size / 2,
+				   view,     pos, lm_negate3(up), dir,
+				   proj,     -zNear, zNear, 0.0f, zNear, zNear, zFar);
+		break;
+	case 4: // up
+		lm_setView(viewport, 2 * size + x, y, size, size / 2,
+				   view,     pos, up, lm_negate3(dir),
+				   proj,     -zNear, zNear, -zNear, 0.0f, zNear, zFar);
+		break;
+	default:
+		assert(LM_FALSE);
+		break;
+	}
+
+	return LM_TRUE;
+}
+
+static void lm_endSampleHemisphere(lm_context *ctx)
+{
+	if (++ctx->meshPosition.hemisphere.side == 5)
+	{
+		// finish hemisphere
+		glBindFramebuffer(GL_FRAMEBUFFER, 0);
+		if (++ctx->hemisphere.fbHemiIndex == ctx->hemisphere.fbHemiCountX * ctx->hemisphere.fbHemiCountY)
+		{
+			// downsample new hemisphere batch and store the results
+			lm_integrateHemisphereBatch(ctx);
+		}
+	}
+}
+
+static void lm_inverseTranspose(const float *m44, float *n33)
+{
+	if (!m44)
+	{
+		n33[0] = 1.0f; n33[1] = 0.0f; n33[2] = 0.0f;
+		n33[3] = 0.0f; n33[4] = 1.0f; n33[5] = 0.0f;
+		n33[6] = 0.0f; n33[7] = 0.0f; n33[8] = 1.0f;
+		return;
+	}
+
+	float determinant = m44[ 0] * (m44[ 5] * m44[10] - m44[ 9] * m44[ 6])
+					  - m44[ 1] * (m44[ 4] * m44[10] - m44[ 6] * m44[ 8])
+					  + m44[ 2] * (m44[ 4] * m44[ 9] - m44[ 5] * m44[ 8]);
+
+	assert(fabs(determinant) > FLT_EPSILON);
+	float rcpDeterminant = 1.0f / determinant;
+
+	n33[0] =  (m44[ 5] * m44[10] - m44[ 9] * m44[ 6]) * rcpDeterminant;
+	n33[3] = -(m44[ 1] * m44[10] - m44[ 2] * m44[ 9]) * rcpDeterminant;
+	n33[6] =  (m44[ 1] * m44[ 6] - m44[ 2] * m44[ 5]) * rcpDeterminant;
+	n33[1] = -(m44[ 4] * m44[10] - m44[ 6] * m44[ 8]) * rcpDeterminant;
+	n33[4] =  (m44[ 0] * m44[10] - m44[ 2] * m44[ 8]) * rcpDeterminant;
+	n33[7] = -(m44[ 0] * m44[ 6] - m44[ 4] * m44[ 2]) * rcpDeterminant;
+	n33[2] =  (m44[ 4] * m44[ 9] - m44[ 8] * m44[ 5]) * rcpDeterminant;
+	n33[5] = -(m44[ 0] * m44[ 9] - m44[ 8] * m44[ 1]) * rcpDeterminant;
+	n33[8] =  (m44[ 0] * m44[ 5] - m44[ 4] * m44[ 1]) * rcpDeterminant;
+}
+
+static lm_vec3 lm_transformNormal(const float *m, lm_vec3 n)
+{
+	lm_vec3 r;
+	r.x = m[0] * n.x + m[3] * n.y + m[6] * n.z;
+	r.y = m[1] * n.x + m[4] * n.y + m[7] * n.z;
+	r.z = m[2] * n.x + m[5] * n.y + m[8] * n.z;
+	return r;
+}
+
+static lm_vec3 lm_transformPosition(const float *m, lm_vec3 v)
+{
+	if (!m)
+		return v;
+	lm_vec3 r;
+	r.x =     m[0] * v.x + m[4] * v.y + m[ 8] * v.z + m[12];
+	r.y =     m[1] * v.x + m[5] * v.y + m[ 9] * v.z + m[13];
+	r.z =     m[2] * v.x + m[6] * v.y + m[10] * v.z + m[14];
+	float d = m[3] * v.x + m[7] * v.y + m[11] * v.z + m[15];
+	assert(lm_absf(d - 1.0f) < 0.00001f); // could divide by d, but this shouldn't be a projection transform!
+	return r;
+}
+
+static void lm_setMeshPosition(lm_context *ctx, unsigned int indicesTriangleBaseIndex)
+{
+	// fetch triangle at the specified indicesTriangleBaseIndex
+	ctx->meshPosition.triangle.baseIndex = indicesTriangleBaseIndex;
+
+	// load and transform triangle to process next
+	lm_vec2 uvMin = lm_v2(FLT_MAX, FLT_MAX), uvMax = lm_v2(-FLT_MAX, -FLT_MAX);
+	lm_vec2 uvScale = lm_v2i(ctx->lightmap.width, ctx->lightmap.height);
+	unsigned int vIndices[3];
+	for (int i = 0; i < 3; i++)
+	{
+		// decode index
+		unsigned int vIndex;
+		switch (ctx->mesh.indicesType)
+		{
+		case LM_NONE:
+			vIndex = ctx->meshPosition.triangle.baseIndex + i;
+			break;
+		case LM_UNSIGNED_BYTE:
+			vIndex = ((const unsigned char*)ctx->mesh.indices + ctx->meshPosition.triangle.baseIndex)[i];
+			break;
+		case LM_UNSIGNED_SHORT:
+			vIndex = ((const unsigned short*)ctx->mesh.indices + ctx->meshPosition.triangle.baseIndex)[i];
+			break;
+		case LM_UNSIGNED_INT:
+			vIndex = ((const unsigned int*)ctx->mesh.indices + ctx->meshPosition.triangle.baseIndex)[i];
+			break;
+		default:
+			assert(LM_FALSE);
+			break;
+		}
+		vIndices[i] = vIndex;
+
+		// decode and pre-transform vertex position
+		const void *pPtr = ctx->mesh.positions + vIndex * ctx->mesh.positionsStride;
+		lm_vec3 p;
+		switch (ctx->mesh.positionsType)
+		{
+		// TODO: signed formats
+		case LM_UNSIGNED_BYTE: {
+			const unsigned char *uc = (const unsigned char*)pPtr;
+			p = lm_v3(uc[0], uc[1], uc[2]);
+		} break;
+		case LM_UNSIGNED_SHORT: {
+			const unsigned short *us = (const unsigned short*)pPtr;
+			p = lm_v3(us[0], us[1], us[2]);
+		} break;
+		case LM_UNSIGNED_INT: {
+			const unsigned int *ui = (const unsigned int*)pPtr;
+			p = lm_v3((float)ui[0], (float)ui[1], (float)ui[2]);
+		} break;
+		case LM_FLOAT: {
+			p = *(const lm_vec3*)pPtr;
+		} break;
+		default: {
+			assert(LM_FALSE);
+		} break;
+		}
+		ctx->meshPosition.triangle.p[i] = lm_transformPosition(ctx->mesh.modelMatrix, p);
+
+		// decode and scale (to lightmap resolution) vertex lightmap texture coords
+		const void *uvPtr = ctx->mesh.uvs + vIndex * ctx->mesh.uvsStride;
+		lm_vec2 uv;
+		switch (ctx->mesh.uvsType)
+		{
+		case LM_UNSIGNED_BYTE: {
+			const unsigned char *uc = (const unsigned char*)uvPtr;
+			uv = lm_v2(uc[0] / (float)UCHAR_MAX, uc[1] / (float)UCHAR_MAX);
+		} break;
+		case LM_UNSIGNED_SHORT: {
+			const unsigned short *us = (const unsigned short*)uvPtr;
+			uv = lm_v2(us[0] / (float)USHRT_MAX, us[1] / (float)USHRT_MAX);
+		} break;
+		case LM_UNSIGNED_INT: {
+			const unsigned int *ui = (const unsigned int*)uvPtr;
+			uv = lm_v2(ui[0] / (float)UINT_MAX, ui[1] / (float)UINT_MAX);
+		} break;
+		case LM_FLOAT: {
+			uv = *(const lm_vec2*)uvPtr;
+		} break;
+		default: {
+			assert(LM_FALSE);
+		} break;
+		}
+
+		ctx->meshPosition.triangle.uv[i] = lm_mul2(lm_pmod2(uv, 1.0f), uvScale); // maybe clamp to 0.0-1.0 instead of pmod?
+
+		// update bounds on lightmap
+		uvMin = lm_min2(uvMin, ctx->meshPosition.triangle.uv[i]);
+		uvMax = lm_max2(uvMax, ctx->meshPosition.triangle.uv[i]);
+	}
+
+	lm_vec3 flatNormal = lm_cross3(
+		lm_sub3(ctx->meshPosition.triangle.p[1], ctx->meshPosition.triangle.p[0]),
+		lm_sub3(ctx->meshPosition.triangle.p[2], ctx->meshPosition.triangle.p[0]));
+
+	for (int i = 0; i < 3; i++)
+	{
+		// decode and pre-transform vertex normal
+		const void *nPtr = ctx->mesh.normals + vIndices[i] * ctx->mesh.normalsStride;
+		lm_vec3 n;
+		switch (ctx->mesh.normalsType)
+		{
+		// TODO: signed formats
+		case LM_FLOAT: {
+			n = *(const lm_vec3*)nPtr;
+		} break;
+		case LM_NONE: {
+			n = flatNormal;
+		} break;
+		default: {
+			assert(LM_FALSE);
+		} break;
+		}
+		ctx->meshPosition.triangle.n[i] = lm_normalize3(lm_transformNormal(ctx->mesh.normalMatrix, n));
+	}
+
+	// calculate area of interest (on lightmap) for conservative rasterization
+	lm_vec2 bbMin = lm_floor2(uvMin);
+	lm_vec2 bbMax = lm_ceil2 (uvMax);
+	ctx->meshPosition.rasterizer.minx = lm_maxi((int)bbMin.x - 1, 0);
+	ctx->meshPosition.rasterizer.miny = lm_maxi((int)bbMin.y - 1, 0);
+	ctx->meshPosition.rasterizer.maxx = lm_mini((int)bbMax.x + 1, ctx->lightmap.width - 1);
+	ctx->meshPosition.rasterizer.maxy = lm_mini((int)bbMax.y + 1, ctx->lightmap.height - 1);
+	assert(ctx->meshPosition.rasterizer.minx <= ctx->meshPosition.rasterizer.maxx &&
+		   ctx->meshPosition.rasterizer.miny <= ctx->meshPosition.rasterizer.maxy);
+	ctx->meshPosition.rasterizer.x = ctx->meshPosition.rasterizer.minx + lm_passOffsetX(ctx);
+	ctx->meshPosition.rasterizer.y = ctx->meshPosition.rasterizer.miny + lm_passOffsetY(ctx);
+
+	// try moving to first valid sample position
+	if (ctx->meshPosition.rasterizer.x <= ctx->meshPosition.rasterizer.maxx &&
+		ctx->meshPosition.rasterizer.y <= ctx->meshPosition.rasterizer.maxy &&
+		lm_findFirstConservativeTriangleRasterizerPosition(ctx))
+		ctx->meshPosition.hemisphere.side = 0; // we can start sampling the hemisphere
+	else
+		ctx->meshPosition.hemisphere.side = 5; // no samples on this triangle! put hemisphere sampler into finished state
+}
+
+static GLuint lm_LoadShader(GLenum type, const char *source)
+{
+	GLuint shader = glCreateShader(type);
+	if (shader == 0)
+	{
+		fprintf(stderr, "Could not create shader!\n");
+		return 0;
+	}
+	glShaderSource(shader, 1, &source, NULL);
+	glCompileShader(shader);
+	GLint compiled;
+	glGetShaderiv(shader, GL_COMPILE_STATUS, &compiled);
+	if (!compiled)
+	{
+		fprintf(stderr, "Could not compile shader!\n");
+		GLint infoLen = 0;
+		glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &infoLen);
+		if (infoLen)
+		{
+			char* infoLog = (char*)malloc(infoLen);
+			glGetShaderInfoLog(shader, infoLen, NULL, infoLog);
+			fprintf(stderr, "%s\n", infoLog);
+			free(infoLog);
+		}
+		glDeleteShader(shader);
+		return 0;
+	}
+	return shader;
+}
+
+static GLuint lm_LoadProgram(const char *vp, const char *fp)
+{
+	GLuint program = glCreateProgram();
+	if (program == 0)
+	{
+		fprintf(stderr, "Could not create program!\n");
+		return 0;
+	}
+	GLuint vertexShader = lm_LoadShader(GL_VERTEX_SHADER, vp);
+	GLuint fragmentShader = lm_LoadShader(GL_FRAGMENT_SHADER, fp);
+	glAttachShader(program, vertexShader);
+	glAttachShader(program, fragmentShader);
+	glLinkProgram(program);
+	glDeleteShader(vertexShader);
+	glDeleteShader(fragmentShader);
+	GLint linked;
+	glGetProgramiv(program, GL_LINK_STATUS, &linked);
+	if (!linked)
+	{
+		fprintf(stderr, "Could not link program!\n");
+		GLint infoLen = 0;
+		glGetProgramiv(program, GL_INFO_LOG_LENGTH, &infoLen);
+		if (infoLen)
+		{
+			char* infoLog = (char*)malloc(sizeof(char) * infoLen);
+			glGetProgramInfoLog(program, infoLen, NULL, infoLog);
+			fprintf(stderr, "%s\n", infoLog);
+			free(infoLog);
+		}
+		glDeleteProgram(program);
+		return 0;
+	}
+	return program;
+}
+
+static float lm_defaultWeights(float cos_theta, void *userdata)
+{
+	return 1.0f;
+}
+
+lm_context *lmCreate(int hemisphereSize, float zNear, float zFar,
+	float clearR, float clearG, float clearB,
+	int interpolationPasses, float interpolationThreshold,
+	float cameraToSurfaceDistanceModifier)
+{
+	assert(hemisphereSize == 512 || hemisphereSize == 256 || hemisphereSize == 128 ||
+		   hemisphereSize ==  64 || hemisphereSize ==  32 || hemisphereSize ==  16);
+	assert(zNear < zFar && zNear > 0.0f);
+	assert(cameraToSurfaceDistanceModifier >= -1.0f);
+	assert(interpolationPasses >= 0 && interpolationPasses <= 8);
+	assert(interpolationThreshold >= 0.0f);
+
+	lm_context *ctx = (lm_context*)LM_CALLOC(1, sizeof(lm_context));
+
+	ctx->meshPosition.passCount = 1 + 3 * interpolationPasses;
+	ctx->interpolationThreshold = interpolationThreshold;
+	ctx->hemisphere.size = hemisphereSize;
+	ctx->hemisphere.zNear = zNear;
+	ctx->hemisphere.zFar = zFar;
+	ctx->hemisphere.cameraToSurfaceDistanceModifier = cameraToSurfaceDistanceModifier;
+	ctx->hemisphere.clearColor.r = clearR;
+	ctx->hemisphere.clearColor.g = clearG;
+	ctx->hemisphere.clearColor.b = clearB;
+
+	// calculate hemisphere batch size
+	ctx->hemisphere.fbHemiCountX = 1536 / (3 * ctx->hemisphere.size);
+	ctx->hemisphere.fbHemiCountY = 512 / ctx->hemisphere.size;
+
+	// hemisphere batch framebuffers
+	unsigned int w[] = {
+		ctx->hemisphere.fbHemiCountX * ctx->hemisphere.size * 3,
+		ctx->hemisphere.fbHemiCountX * ctx->hemisphere.size / 2 };
+	unsigned int h[] = {
+		ctx->hemisphere.fbHemiCountY * ctx->hemisphere.size,
+		ctx->hemisphere.fbHemiCountY * ctx->hemisphere.size / 2 };
+
+	glGenTextures(2, ctx->hemisphere.fbTexture);
+	glGenFramebuffers(2, ctx->hemisphere.fb);
+	glGenRenderbuffers(1, &ctx->hemisphere.fbDepth);
+
+	glBindRenderbuffer(GL_RENDERBUFFER, ctx->hemisphere.fbDepth);
+	glRenderbufferStorage(GL_RENDERBUFFER, GL_DEPTH_COMPONENT24, w[0], h[0]);
+	glBindFramebuffer(GL_FRAMEBUFFER, ctx->hemisphere.fb[0]);
+	glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_RENDERBUFFER, ctx->hemisphere.fbDepth);
+	for (int i = 0; i < 2; i++)
+	{
+		glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.fbTexture[i]);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+		glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F, w[i], h[i], 0, GL_RGBA, GL_FLOAT, 0);
+
+		glBindFramebuffer(GL_FRAMEBUFFER, ctx->hemisphere.fb[i]);
+		glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, ctx->hemisphere.fbTexture[i], 0);
+		GLenum status = glCheckFramebufferStatus(GL_FRAMEBUFFER);
+		if (status != GL_FRAMEBUFFER_COMPLETE)
+		{
+			fprintf(stderr, "Could not create framebuffer!\n");
+			glDeleteRenderbuffers(1, &ctx->hemisphere.fbDepth);
+			glDeleteFramebuffers(2, ctx->hemisphere.fb);
+			glDeleteTextures(2, ctx->hemisphere.fbTexture);
+			LM_FREE(ctx);
+			return NULL;
+		}
+	}
+	glBindFramebuffer(GL_FRAMEBUFFER, 0);
+
+	// dummy vao for fullscreen quad rendering
+	glGenVertexArrays(1, &ctx->hemisphere.vao);
+
+	// hemisphere shader (weighted downsampling of the 3x1 hemisphere layout to a 0.5x0.5 square)
+	{
+		const char *vs =
+			"#version 150 core\n"
+			"const vec2 ps[4] = vec2[](vec2(1, -1), vec2(1, 1), vec2(-1, -1), vec2(-1, 1));\n"
+			"void main()\n"
+			"{\n"
+				"gl_Position = vec4(ps[gl_VertexID], 0, 1);\n"
+			"}\n";
+		const char *fs =
+			"#version 150 core\n"
+			"uniform sampler2D hemispheres;\n"
+			"uniform sampler2D weights;\n"
+
+			"layout(pixel_center_integer) in vec4 gl_FragCoord;\n" // whole integer values represent pixel centers, GL_ARB_fragment_coord_conventions
+
+			"out vec4 outColor;\n"
+
+			"vec4 weightedSample(ivec2 h_uv, ivec2 w_uv, ivec2 quadrant)\n"
+			"{\n"
+				"vec4 sample = texelFetch(hemispheres, h_uv + quadrant, 0);\n"
+				"vec2 weight = texelFetch(weights, w_uv + quadrant, 0).rg;\n"
+				"return vec4(sample.rgb * weight.r, sample.a * weight.g);\n"
+			"}\n"
+
+			"vec4 threeWeightedSamples(ivec2 h_uv, ivec2 w_uv, ivec2 offset)\n"
+			"{\n" // horizontal triple sum
+				"vec4 sum = weightedSample(h_uv, w_uv, offset);\n"
+				"sum += weightedSample(h_uv, w_uv, offset + ivec2(2, 0));\n"
+				"sum += weightedSample(h_uv, w_uv, offset + ivec2(4, 0));\n"
+				"return sum;\n"
+			"}\n"
+
+			"void main()\n"
+			"{\n" // this is a weighted sum downsampling pass (alpha component contains the weighted valid sample count)
+				"vec2 in_uv = gl_FragCoord.xy * vec2(6.0, 2.0) + vec2(0.5);\n"
+				"ivec2 h_uv = ivec2(in_uv);\n"
+				"ivec2 w_uv = ivec2(mod(in_uv, vec2(textureSize(weights, 0))));\n" // there's no integer modulo :(
+				"vec4 lb = threeWeightedSamples(h_uv, w_uv, ivec2(0, 0));\n"
+				"vec4 rb = threeWeightedSamples(h_uv, w_uv, ivec2(1, 0));\n"
+				"vec4 lt = threeWeightedSamples(h_uv, w_uv, ivec2(0, 1));\n"
+				"vec4 rt = threeWeightedSamples(h_uv, w_uv, ivec2(1, 1));\n"
+				"outColor = lb + rb + lt + rt;\n"
+			"}\n";
+		ctx->hemisphere.firstPass.programID = lm_LoadProgram(vs, fs);
+		if (!ctx->hemisphere.firstPass.programID)
+		{
+			fprintf(stderr, "Error loading the hemisphere first pass shader program... leaving!\n");
+			glDeleteVertexArrays(1, &ctx->hemisphere.vao);
+			glDeleteRenderbuffers(1, &ctx->hemisphere.fbDepth);
+			glDeleteFramebuffers(2, ctx->hemisphere.fb);
+			glDeleteTextures(2, ctx->hemisphere.fbTexture);
+			LM_FREE(ctx);
+			return NULL;
+		}
+		ctx->hemisphere.firstPass.hemispheresTextureID = glGetUniformLocation(ctx->hemisphere.firstPass.programID, "hemispheres");
+		ctx->hemisphere.firstPass.weightsTextureID = glGetUniformLocation(ctx->hemisphere.firstPass.programID, "weights");
+	}
+
+	// downsample shader
+	{
+		const char *vs =
+			"#version 150 core\n"
+			"const vec2 ps[4] = vec2[](vec2(1, -1), vec2(1, 1), vec2(-1, -1), vec2(-1, 1));\n"
+			"void main()\n"
+			"{\n"
+				"gl_Position = vec4(ps[gl_VertexID], 0, 1);\n"
+			"}\n";
+		const char *fs =
+			"#version 150 core\n"
+			"uniform sampler2D hemispheres;\n"
+
+			"layout(pixel_center_integer) in vec4 gl_FragCoord;\n" // whole integer values represent pixel centers, GL_ARB_fragment_coord_conventions
+
+			"out vec4 outColor;\n"
+
+			"void main()\n"
+			"{\n" // this is a sum downsampling pass (alpha component contains the weighted valid sample count)
+				"ivec2 h_uv = ivec2(gl_FragCoord.xy) * 2;\n"
+				"vec4 lb = texelFetch(hemispheres, h_uv + ivec2(0, 0), 0);\n"
+				"vec4 rb = texelFetch(hemispheres, h_uv + ivec2(1, 0), 0);\n"
+				"vec4 lt = texelFetch(hemispheres, h_uv + ivec2(0, 1), 0);\n"
+				"vec4 rt = texelFetch(hemispheres, h_uv + ivec2(1, 1), 0);\n"
+				"outColor = lb + rb + lt + rt;\n"
+			"}\n";
+		ctx->hemisphere.downsamplePass.programID = lm_LoadProgram(vs, fs);
+		if (!ctx->hemisphere.downsamplePass.programID)
+		{
+			fprintf(stderr, "Error loading the hemisphere downsample pass shader program... leaving!\n");
+			glDeleteProgram(ctx->hemisphere.firstPass.programID);
+			glDeleteVertexArrays(1, &ctx->hemisphere.vao);
+			glDeleteRenderbuffers(1, &ctx->hemisphere.fbDepth);
+			glDeleteFramebuffers(2, ctx->hemisphere.fb);
+			glDeleteTextures(2, ctx->hemisphere.fbTexture);
+			LM_FREE(ctx);
+			return NULL;
+		}
+		ctx->hemisphere.downsamplePass.hemispheresTextureID = glGetUniformLocation(ctx->hemisphere.downsamplePass.programID, "hemispheres");
+	}
+
+	// hemisphere weights texture
+	glGenTextures(1, &ctx->hemisphere.firstPass.weightsTexture);
+	lmSetHemisphereWeights(ctx, lm_defaultWeights, 0);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
+
+	// allocate batchPosition-to-lightmapPosition map
+	ctx->hemisphere.fbHemiToLightmapLocation = (lm_ivec2*)LM_CALLOC(ctx->hemisphere.fbHemiCountX * ctx->hemisphere.fbHemiCountY, sizeof(lm_ivec2));
+
+	return ctx;
+}
+
+void lmDestroy(lm_context *ctx)
+{
+	// reset state
+	glUseProgram(0);
+	glBindTexture(GL_TEXTURE_2D, 0);
+	glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
+	glBindVertexArray(0);
+	glBindFramebuffer(GL_FRAMEBUFFER, 0);
+	glBindFramebuffer(GL_READ_FRAMEBUFFER, 0);
+	glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
+
+	// delete gl objects
+	glDeleteTextures(1, &ctx->hemisphere.firstPass.weightsTexture);
+	glDeleteTextures(1, &ctx->hemisphere.storage.texture);
+	glDeleteProgram(ctx->hemisphere.downsamplePass.programID);
+	glDeleteProgram(ctx->hemisphere.firstPass.programID);
+	glDeleteVertexArrays(1, &ctx->hemisphere.vao);
+	glDeleteRenderbuffers(1, &ctx->hemisphere.fbDepth);
+	glDeleteFramebuffers(2, ctx->hemisphere.fb);
+	glDeleteTextures(2, ctx->hemisphere.fbTexture);
+	glDeleteTextures(1, &ctx->hemisphere.storage.texture);
+
+	// free memory
+	LM_FREE(ctx->hemisphere.storage.toLightmapLocation);
+	LM_FREE(ctx->hemisphere.fbHemiToLightmapLocation);
+#ifdef LM_DEBUG_INTERPOLATION
+	LM_FREE(ctx->lightmap.debug);
+#endif
+	LM_FREE(ctx);
+}
+
+void lmSetHemisphereWeights(lm_context *ctx, lm_weight_func f, void *userdata)
+{
+	// hemisphere weights texture. bakes in material dependent attenuation behaviour.
+	float *weights = (float*)LM_CALLOC(2 * 3 * ctx->hemisphere.size * ctx->hemisphere.size, sizeof(float));
+	float center = (ctx->hemisphere.size - 1) * 0.5f;
+	double sum = 0.0;
+	for (unsigned int y = 0; y < ctx->hemisphere.size; y++)
+	{
+		float dy = 2.0f * (y - center) / (float)ctx->hemisphere.size;
+		for (unsigned int x = 0; x < ctx->hemisphere.size; x++)
+		{
+			float dx = 2.0f * (x - center) / (float)ctx->hemisphere.size;
+			lm_vec3 v = lm_normalize3(lm_v3(dx, dy, 1.0f));
+
+			float solidAngle = v.z * v.z * v.z;
+
+			float *w0 = weights + 2 * (y * (3 * ctx->hemisphere.size) + x);
+			float *w1 = w0 + 2 * ctx->hemisphere.size;
+			float *w2 = w1 + 2 * ctx->hemisphere.size;
+
+			// center weights
+			w0[0] = solidAngle * f(v.z, userdata);
+			w0[1] = solidAngle;
+
+			// left/right side weights
+			w1[0] = solidAngle * f(lm_absf(v.x), userdata);
+			w1[1] = solidAngle;
+
+			// up/down side weights
+			w2[0] = solidAngle * f(lm_absf(v.y), userdata);
+			w2[1] = solidAngle;
+
+			sum += 3.0 * (double)solidAngle;
+		}
+	}
+
+	// normalize weights
+	float weightScale = (float)(1.0 / sum);
+	for (unsigned int i = 0; i < 2 * 3 * ctx->hemisphere.size * ctx->hemisphere.size; i++)
+		weights[i] *= weightScale;
+
+	// upload weight texture
+	glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.firstPass.weightsTexture);
+	glTexImage2D(GL_TEXTURE_2D, 0, GL_RG32F, 3 * ctx->hemisphere.size, ctx->hemisphere.size, 0, GL_RG, GL_FLOAT, weights);
+	LM_FREE(weights);
+}
+
+void lmSetTargetLightmap(lm_context *ctx, float *outLightmap, int w, int h, int c)
+{
+	ctx->lightmap.data = outLightmap;
+	ctx->lightmap.width = w;
+	ctx->lightmap.height = h;
+	ctx->lightmap.channels = c;
+
+	// allocate storage texture
+	if (!ctx->hemisphere.storage.texture)
+		glGenTextures(1, &ctx->hemisphere.storage.texture);
+	glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.storage.texture);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+	glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F, w, h, 0, GL_RGBA, GL_FLOAT, 0);
+
+	// allocate storage position to lightmap position map
+	if (ctx->hemisphere.storage.toLightmapLocation)
+		LM_FREE(ctx->hemisphere.storage.toLightmapLocation);
+	ctx->hemisphere.storage.toLightmapLocation = (lm_ivec2*)LM_CALLOC(w * h, sizeof(lm_ivec2));
+	// invalidate all positions
+	for (int i = 0; i < w * h; i++)
+		ctx->hemisphere.storage.toLightmapLocation[i].x = -1;
+
+#ifdef LM_DEBUG_INTERPOLATION
+	if (ctx->lightmap.debug)
+		LM_FREE(ctx->lightmap.debug);
+	ctx->lightmap.debug = (unsigned char*)LM_CALLOC(ctx->lightmap.width * ctx->lightmap.height, 3);
+#endif
+}
+
+void lmSetGeometry(lm_context *ctx,
+	const float *transformationMatrix,
+	lm_type positionsType, const void *positionsXYZ, int positionsStride,
+	lm_type normalsType, const void *normalsXYZ, int normalsStride,
+	lm_type lightmapCoordsType, const void *lightmapCoordsUV, int lightmapCoordsStride,
+	int count, lm_type indicesType, const void *indices)
+{
+	ctx->mesh.modelMatrix = transformationMatrix;
+	ctx->mesh.positions = (const unsigned char*)positionsXYZ;
+	ctx->mesh.positionsType = positionsType;
+	ctx->mesh.positionsStride = positionsStride == 0 ? sizeof(lm_vec3) : positionsStride;
+	ctx->mesh.normals = (const unsigned char*)normalsXYZ;
+	ctx->mesh.normalsType = normalsType;
+	ctx->mesh.normalsStride = normalsStride == 0 ? sizeof(lm_vec3) : normalsStride;
+	ctx->mesh.uvs = (const unsigned char*)lightmapCoordsUV;
+	ctx->mesh.uvsType = lightmapCoordsType;
+	ctx->mesh.uvsStride = lightmapCoordsStride == 0 ? sizeof(lm_vec2) : lightmapCoordsStride;
+	ctx->mesh.indicesType = indicesType;
+	ctx->mesh.indices = (const unsigned char*)indices;
+	ctx->mesh.count = count;
+
+	lm_inverseTranspose(transformationMatrix, ctx->mesh.normalMatrix);
+
+	ctx->meshPosition.pass = 0;
+	lm_setMeshPosition(ctx, 0);
+}
+
+lm_bool lmBegin(lm_context *ctx, int* outViewport4, float* outView4x4, float* outProjection4x4)
+{
+	assert(ctx->meshPosition.triangle.baseIndex < ctx->mesh.count);
+	while (!lm_beginSampleHemisphere(ctx, outViewport4, outView4x4, outProjection4x4))
+	{ // as long as there are no hemisphere sides to render...
+		// try moving to the next rasterizer position
+		if (lm_findNextConservativeTriangleRasterizerPosition(ctx))
+		{ // if we successfully moved to the next sample position on the current triangle...
+			ctx->meshPosition.hemisphere.side = 0; // start sampling a hemisphere there
+		}
+		else
+		{ // if there are no valid sample positions on the current triangle...
+			if (ctx->meshPosition.triangle.baseIndex + 3 < ctx->mesh.count)
+			{ // ...and there are triangles left: move to the next triangle and continue sampling.
+				lm_setMeshPosition(ctx, ctx->meshPosition.triangle.baseIndex + 3);
+			}
+			else
+			{ // ...and there are no triangles left: finish
+				lm_integrateHemisphereBatch(ctx); // integrate and store last batch
+				lm_writeResultsToLightmap(ctx); // read storage data from gpu memory and write it to the lightmap
+
+				if (++ctx->meshPosition.pass == ctx->meshPosition.passCount)
+				{
+					ctx->meshPosition.pass = 0;
+					ctx->meshPosition.triangle.baseIndex = ctx->mesh.count; // set end condition (in case someone accidentally calls lmBegin again)
+
+#ifdef LM_DEBUG_INTERPOLATION
+					lmImageSaveTGAub("debug_interpolation.tga", ctx->lightmap.debug, ctx->lightmap.width, ctx->lightmap.height, 3);
+
+					// lightmap texel statistics
+					int rendered = 0, interpolated = 0, wasted = 0;
+					for (int y = 0; y < ctx->lightmap.height; y++)
+					{
+						for (int x = 0; x < ctx->lightmap.width; x++)
+						{
+							if (ctx->lightmap.debug[(y * ctx->lightmap.width + x) * 3 + 0])
+								rendered++;
+							else if (ctx->lightmap.debug[(y * ctx->lightmap.width + x) * 3 + 1])
+								interpolated++;
+							else
+								wasted++;
+						}
+					}
+					int used = rendered + interpolated;
+					int total = used + wasted;
+					printf("\n#######################################################################\n");
+					printf("%10d %6.2f%% rendered hemicubes integrated to lightmap texels.\n", rendered, 100.0f * (float)rendered / (float)total);
+					printf("%10d %6.2f%% interpolated lightmap texels.\n", interpolated, 100.0f * (float)interpolated / (float)total);
+					printf("%10d %6.2f%% wasted lightmap texels.\n", wasted, 100.0f * (float)wasted / (float)total);
+					printf("\n%17.2f%% of used texels were rendered.\n", 100.0f * (float)rendered / (float)used);
+					printf("#######################################################################\n");
+#endif
+
+					return LM_FALSE;
+				}
+
+				lm_setMeshPosition(ctx, 0); // start over with the next pass
+			}
+		}
+	}
+	return LM_TRUE;
+}
+
+float lmProgress(lm_context *ctx)
+{
+	float passProgress = (float)ctx->meshPosition.triangle.baseIndex / (float)ctx->mesh.count;
+	return ((float)ctx->meshPosition.pass + passProgress) / (float)ctx->meshPosition.passCount;
+}
+
+void lmEnd(lm_context *ctx)
+{
+	lm_endSampleHemisphere(ctx);
+}
+
+// these are not performance tuned since their impact on the whole lightmapping duration is insignificant
+float lmImageMin(const float *image, int w, int h, int c, int m)
+{
+	assert(c > 0 && m);
+	float minValue = FLT_MAX;
+	for (int i = 0; i < w * h; i++)
+		for (int j = 0; j < c; j++)
+			if (m & (1 << j))
+				minValue = lm_minf(minValue, image[i * c + j]);
+	return minValue;
+}
+
+float lmImageMax(const float *image, int w, int h, int c, int m)
+{
+	assert(c > 0 && m);
+	float maxValue = 0.0f;
+	for (int i = 0; i < w * h; i++)
+		for (int j = 0; j < c; j++)
+			if (m & (1 << j))
+				maxValue = lm_maxf(maxValue, image[i * c + j]);
+	return maxValue;
+}
+
+void lmImageAdd(float *image, int w, int h, int c, float value, int m)
+{
+	assert(c > 0 && m);
+	for (int i = 0; i < w * h; i++)
+		for (int j = 0; j < c; j++)
+			if (m & (1 << j))
+				image[i * c + j] += value;
+}
+
+void lmImageScale(float *image, int w, int h, int c, float factor, int m)
+{
+	assert(c > 0 && m);
+	for (int i = 0; i < w * h; i++)
+		for (int j = 0; j < c; j++)
+			if (m & (1 << j))
+				image[i * c + j] *= factor;
+}
+
+void lmImagePower(float *image, int w, int h, int c, float exponent, int m)
+{
+	assert(c > 0 && m);
+	for (int i = 0; i < w * h; i++)
+		for (int j = 0; j < c; j++)
+			if (m & (1 << j))
+				image[i * c + j] = powf(image[i * c + j], exponent);
+}
+
+void lmImageDilate(const float *image, float *outImage, int w, int h, int c)
+{
+	assert(c > 0 && c <= 4);
+	for (int y = 0; y < h; y++)
+	{
+		for (int x = 0; x < w; x++)
+		{
+			float color[4];
+			lm_bool valid = LM_FALSE;
+			for (int i = 0; i < c; i++)
+			{
+				color[i] = image[(y * w + x) * c + i];
+				valid |= color[i] > 0.0f;
+			}
+			if (!valid)
+			{
+				int n = 0;
+				const int dx[] = { -1, 0, 1,  0 };
+				const int dy[] = {  0, 1, 0, -1 };
+				for (int d = 0; d < 4; d++)
+				{
+					int cx = x + dx[d];
+					int cy = y + dy[d];
+					if (cx >= 0 && cx < w && cy >= 0 && cy < h)
+					{
+						float dcolor[4];
+						lm_bool dvalid = LM_FALSE;
+						for (int i = 0; i < c; i++)
+						{
+							dcolor[i] = image[(cy * w + cx) * c + i];
+							dvalid |= dcolor[i] > 0.0f;
+						}
+						if (dvalid)
+						{
+							for (int i = 0; i < c; i++)
+								color[i] += dcolor[i];
+							n++;
+						}
+					}
+				}
+				if (n)
+				{
+					float in = 1.0f / n;
+					for (int i = 0; i < c; i++)
+						color[i] *= in;
+				}
+			}
+			for (int i = 0; i < c; i++)
+				outImage[(y * w + x) * c + i] = color[i];
+		}
+	}
+}
+
+void lmImageSmooth(const float *image, float *outImage, int w, int h, int c)
+{
+	assert(c > 0 && c <= 4);
+	for (int y = 0; y < h; y++)
+	{
+		for (int x = 0; x < w; x++)
+		{
+			float color[4] = {0};
+			int n = 0;
+			for (int dy = -1; dy <= 1; dy++)
+			{
+				int cy = y + dy;
+				for (int dx = -1; dx <= 1; dx++)
+				{
+					int cx = x + dx;
+					if (cx >= 0 && cx < w && cy >= 0 && cy < h)
+					{
+						lm_bool valid = LM_FALSE;
+						for (int i = 0; i < c; i++)
+							valid |= image[(cy * w + cx) * c + i] > 0.0f;
+						if (valid)
+						{
+							for (int i = 0; i < c; i++)
+								color[i] += image[(cy * w + cx) * c + i];
+							n++;
+						}
+					}
+				}
+			}
+			for (int i = 0; i < c; i++)
+				outImage[(y * w + x) * c + i] = n ? color[i] / n : 0.0f;
+		}
+	}
+}
+
+void lmImageDownsample(const float *image, float *outImage, int w, int h, int c)
+{
+	assert(c > 0 && c <= 4);
+	for (int y = 0; y < h / 2; y++)
+	{
+		for (int x = 0; x < w / 2; x++)
+		{
+			int p0 = 2 * (y * w + x) * c;
+			int p1 = p0 + w * c;
+			int valid[2][2] = {0};
+			float sums[4] = {0};
+			for (int i = 0; i < c; i++)
+			{
+				valid[0][0] |= image[p0     + i] != 0.0f ? 1 : 0;
+				valid[0][1] |= image[p0 + c + i] != 0.0f ? 1 : 0;
+				valid[1][0] |= image[p1     + i] != 0.0f ? 1 : 0;
+				valid[1][1] |= image[p1 + c + i] != 0.0f ? 1 : 0;
+				sums[i] += image[p0 + i] + image[p0 + c + i] + image[p1 + i] + image[p1 + c + i];
+			}
+			int n = valid[0][0] + valid[0][1] + valid[1][0] + valid[1][1];
+			int p = (y * w / 2 + x) * c;
+			for (int i = 0; i < c; i++)
+				outImage[p + i] = n ? sums[i] / n : 0.0f;
+		}
+	}
+}
+
+void lmImageFtoUB(const float *image, unsigned char *outImage, int w, int h, int c, float max)
+{
+	assert(c > 0);
+	float scale = 255.0f / (max != 0.0f ? max : lmImageMax(image, w, h, c, LM_ALL_CHANNELS));
+	for (int i = 0; i < w * h * c; i++)
+		outImage[i] = (unsigned char)lm_minf(lm_maxf(image[i] * scale, 0.0f), 255.0f);
+}
+
+// TGA output helpers
+static void lm_swapRandBub(unsigned char *image, int w, int h, int c)
+{
+	assert(c >= 3);
+	for (int i = 0; i < w * h * c; i += c)
+		LM_SWAP(unsigned char, image[i], image[i + 2]);
+}
+
+lm_bool lmImageSaveTGAub(const char *filename, const unsigned char *image, int w, int h, int c)
+{
+	assert(c == 1 || c == 3 || c == 4);
+	lm_bool isGreyscale = c == 1;
+	lm_bool hasAlpha = c == 4;
+	unsigned char header[18] = {
+		0, 0, (unsigned char)(isGreyscale ? 3 : 2), 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		(unsigned char)(w & 0xff), (unsigned char)((w >> 8) & 0xff), (unsigned char)(h & 0xff), (unsigned char)((h >> 8) & 0xff),
+		(unsigned char)(8 * c), (unsigned char)(hasAlpha ? 8 : 0)
+	};
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+	FILE *file;
+	if (fopen_s(&file, filename, "wb") != 0) return LM_FALSE;
+#else
+	FILE *file = fopen(filename, "wb");
+	if (!file) return LM_FALSE;
+#endif
+	fwrite(header, 1, sizeof(header), file);
+
+	// we make sure to swap it back! trust me. :)
+	if (!isGreyscale)
+		lm_swapRandBub((unsigned char*)image, w, h, c);
+	fwrite(image, 1, w * h * c , file);
+	if (!isGreyscale)
+		lm_swapRandBub((unsigned char*)image, w, h, c);
+
+	fclose(file);
+	return LM_TRUE;
+}
+
+lm_bool lmImageSaveTGAf(const char *filename, const float *image, int w, int h, int c, float max)
+{
+	unsigned char *temp = (unsigned char*)LM_CALLOC(w * h * c, sizeof(unsigned char));
+	lmImageFtoUB(image, temp, w, h, c, max);
+	lm_bool success = lmImageSaveTGAub(filename, temp, w, h, c);
+	LM_FREE(temp);
+	return success;
+}
+
+#endif // LIGHTMAPPER_IMPLEMENTATION
diff --git a/engine/split/v4k.x.inl b/engine/split/v4k.x.inl
index 578ae18..bd140e9 100644
--- a/engine/split/v4k.x.inl
+++ b/engine/split/v4k.x.inl
@@ -53,6 +53,7 @@
 #define BQ_PLATFORM_IMPLEMENTATION            // websocket
 #define BQ_WEBSOCKET_IMPLEMENTATION           // websocket
 #define XML_C                                 // xml
+#define LIGHTMAPPER_IMPLEMENTATION            // lightmapper
 #ifdef __APPLE__
 #define MA_NO_RUNTIME_LINKING                 // miniaudio osx
 #define _GLFW_COCOA                           // glfw osx
@@ -217,4 +218,6 @@ static char *ui_filter = 0;
 {{FILE:3rd_lite_sys.h}}
 {{FILE:3rd_lite.h}}
 
+{{FILE:3rd_lightmapper.h}}
+
 #endif // V4K_3RD
diff --git a/engine/split/v4k_gui.c b/engine/split/v4k_gui.c
index c170636..80d252b 100644
--- a/engine/split/v4k_gui.c
+++ b/engine/split/v4k_gui.c
@@ -118,17 +118,23 @@ void *gui_userdata() {
     return last_skin->userdata;
 }
 
-vec2 gui_getskinsize(const char *skin) {
+vec2 gui_getskinsize(const char *skin, const char *fallback) {
     vec2 size={0};
-    if (last_skin->getskinsize) last_skin->getskinsize(last_skin->userdata, skin, &size);
+    if (last_skin->getskinsize) last_skin->getskinsize(last_skin->userdata, skin, fallback, &size);
     return size;
 }
 
-bool gui_ismouseinrect(const char *skin, vec4 rect) {
-    if (last_skin->ismouseinrect) return last_skin->ismouseinrect(last_skin->userdata, skin, rect);
+bool gui_ismouseinrect(const char *skin, const char *fallback, vec4 rect) {
+    if (last_skin->ismouseinrect) return last_skin->ismouseinrect(last_skin->userdata, skin, fallback, rect);
     return false; 
 }
 
+vec4 gui_getscissorrect(const char *skin, const char *fallback, vec4 rect) {
+    vec4 scissor = rect;
+    if (last_skin->getscissorrect) last_skin->getscissorrect(last_skin->userdata, skin, fallback, rect, &scissor);
+    return scissor;
+}
+
 static
 gui_state_t *gui_getstate(int id) {
     if (!ctl_states) map_init(ctl_states, less_int, hash_int);
@@ -138,8 +144,8 @@ gui_state_t *gui_getstate(int id) {
 void gui_panel_id(int id, vec4 rect, const char *skin) {
     (void)id;
     vec4 scissor={0, 0, window_width(), window_height()};
-    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, skin, rect);
-    if (last_skin->getscissorrect) last_skin->getscissorrect(last_skin->userdata, skin, rect, &scissor);
+    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, skin, NULL, rect);
+    scissor = gui_getscissorrect(skin, NULL, rect);
 
     if (!array_count(scissor_rects))
         glEnable(GL_SCISSOR_TEST);
@@ -164,7 +170,7 @@ bool gui_button_id(int id, vec4 r, const char *skin) {
 
     skin=skin?skin:"button";
     char *btn = va("%s%s", skin, entry->held?"_press":entry->hover?"_hover":"");
-    if (gui_ismouseinrect(btn, r)) {
+    if (gui_ismouseinrect(btn, skin, r)) {
         if (input_up(MOUSE_L) && entry->held) {
             was_clicked=1;
         }
@@ -174,24 +180,27 @@ bool gui_button_id(int id, vec4 r, const char *skin) {
             entry->hover = true;
         }
     }
-    else if (input_up(MOUSE_L) && entry->held) {
-        entry->held = false;
-        any_widget_used = false;
-    }
     else {
         entry->hover = false;
     }
 
-    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, btn, r);
+    if (input_up(MOUSE_L) && entry->held) {
+        entry->held = false;
+        any_widget_used = false;
+    }
+
+    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, btn, skin, r);
 
     return was_clicked;
 }
 
 bool gui_button_label_id(int id, const char *text, vec4 rect, const char *skin) {
+    gui_state_t *entry = gui_getstate(id);
     bool state = gui_button_id(id, rect, skin);
     vec2 buttonsize={0};
     skin=skin?skin:"button";
-    if (last_skin->getskinsize) last_skin->getskinsize(last_skin->userdata, skin, &buttonsize);
+    char *btn = va("%s%s", skin, entry->held?"_press":entry->hover?"_hover":"");
+    buttonsize = gui_getskinsize(btn, skin);
 
     vec2 textsize = font_rect(text);
     vec2 pos;
@@ -224,7 +233,8 @@ bool gui_slider_id(int id, vec4 rect, const char *skin, float min, float max, fl
 
     skin = skin?skin:"slider";
     char *cursorskin = va("%s_cursor%s", skin, entry->held?"_press":entry->hover?"_hover":"");
-    if (gui_ismouseinrect(skin, rect) && !any_widget_used) {
+    char *fbcursor = va("%s_cursor", skin);
+    if (gui_ismouseinrect(skin, NULL, rect) && !any_widget_used) {
         any_widget_used = entry->held = input_held(MOUSE_L);
         entry->hover = true;
     }
@@ -237,13 +247,12 @@ bool gui_slider_id(int id, vec4 rect, const char *skin, float min, float max, fl
     }
 
     float old_value = *value;
-    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, skin, rect);
+    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, skin, NULL, rect);
 
     vec2 slidersize={0}, cursorsize={0};
-    vec4 usablerect=rect;
-    if (last_skin->getscissorrect) last_skin->getscissorrect(last_skin->userdata, skin, rect, &usablerect);
-    if (last_skin->getskinsize) last_skin->getskinsize(last_skin->userdata, skin, &slidersize);
-    if (last_skin->getskinsize) last_skin->getskinsize(last_skin->userdata, cursorskin, &cursorsize);
+    vec4 usablerect = gui_getscissorrect(skin, NULL, rect);
+    slidersize = gui_getskinsize(skin, NULL);
+    cursorsize = gui_getskinsize(cursorskin, fbcursor);
     if (entry->held) {
         *value = posx2slider(usablerect, min, max, input(MOUSE_X), step);
     }
@@ -254,7 +263,7 @@ bool gui_slider_id(int id, vec4 rect, const char *skin, float min, float max, fl
     cursorrect.y += cursorpos.y;
     cursorrect.z = cursorsize.x;
     cursorrect.w = cursorsize.y;
-    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, cursorskin, cursorrect);
+    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, cursorskin, fbcursor, cursorrect);
 
     return entry->held && (old_value!=*value);
 }
@@ -263,7 +272,7 @@ bool gui_slider_label_id(int id, const char *text, vec4 rect, const char *skin,
     bool state = gui_slider_id(id, rect, skin, min, max, step, value);
     vec2 slidersize={0};
     skin=skin?skin:"slider";
-    if (last_skin->getskinsize) last_skin->getskinsize(last_skin->userdata, skin, &slidersize);
+    slidersize = gui_getskinsize(skin, NULL);
 
     vec2 textsize = font_rect(text);
     vec2 pos;
@@ -276,7 +285,7 @@ bool gui_slider_label_id(int id, const char *text, vec4 rect, const char *skin,
 
 void gui_rect_id(int id, vec4 r, const char *skin) {
     (void)id;
-    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, skin, r);
+    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, skin, NULL, r);
 }
 
 void gui_label_id(int id, const char *text, vec4 rect) {
@@ -300,7 +309,7 @@ atlas_slice_frame_t *skinned_getsliceframe(atlas_t *a, const char *name) {
     for (int i = 0; i < array_count(a->slices); i++) 
         if (!strcmp(quark_string(&a->db, a->slices[i].name), name))
             return &a->slice_frames[a->slices[i].frames[0]];
-    PRINTF("slice name: '%s' is missing in atlas!\n", name);
+    // PRINTF("slice name: '%s' is missing in atlas!\n", name);
     return NULL;
 }
 
@@ -311,9 +320,10 @@ void skinned_draw_missing_rect(vec4 r) {
 }
 
 static
-bool skinned_ismouseinrect(void *userdata, const char *skin, vec4 r) {
+bool skinned_ismouseinrect(void *userdata, const char *skin, const char *fallback, vec4 r) {
     skinned_t *a = C_CAST(skinned_t*, userdata);
     atlas_slice_frame_t *f = skinned_getsliceframe(&a->atlas, skin);
+    if (!f && fallback) f = skinned_getsliceframe(&a->atlas, fallback);
     if (!f) return false;
 
     vec4 outer = f->bounds;
@@ -396,18 +406,20 @@ void skinned_draw_sprite(float scale, atlas_t *a, atlas_slice_frame_t *f, vec4 r
 }
 
 static
-void skinned_draw_rect(void* userdata, const char *skin, vec4 r) {
+void skinned_draw_rect(void* userdata, const char *skin, const char *fallback, vec4 r) {
     skinned_t *a = C_CAST(skinned_t*, userdata);
 
     atlas_slice_frame_t *f = skinned_getsliceframe(&a->atlas, skin);
+    if (!f && fallback) f = skinned_getsliceframe(&a->atlas, fallback);
     if (!f) skinned_draw_missing_rect(r);
     else skinned_draw_sprite(a->scale, &a->atlas, f, r);
 }
 
-void skinned_getskinsize(void *userdata, const char *skin, vec2 *size) {
+void skinned_getskinsize(void *userdata, const char *skin, const char *fallback, vec2 *size) {
     skinned_t *a = C_CAST(skinned_t*, userdata);
 
     atlas_slice_frame_t *f = skinned_getsliceframe(&a->atlas, skin);
+    if (!f && fallback) f = skinned_getsliceframe(&a->atlas, fallback);
     if (f) {
         size->x = (f->bounds.z-f->bounds.x)*a->scale;
         size->y = (f->bounds.w-f->bounds.y)*a->scale;
@@ -415,9 +427,10 @@ void skinned_getskinsize(void *userdata, const char *skin, vec2 *size) {
 }
 
 static
-void skinned_getscissorrect(void* userdata, const char *skin, vec4 rect, vec4 *dims) {
+void skinned_getscissorrect(void* userdata, const char *skin, const char *fallback, vec4 rect, vec4 *dims) {
     skinned_t *a = C_CAST(skinned_t*, userdata);
     atlas_slice_frame_t *f = skinned_getsliceframe(&a->atlas, skin);
+    if (!f && fallback) f = skinned_getsliceframe(&a->atlas, fallback);
     if (!f) return;
 
     *dims = rect;
diff --git a/engine/split/v4k_gui.h b/engine/split/v4k_gui.h
index 4d6b40c..28377bd 100644
--- a/engine/split/v4k_gui.h
+++ b/engine/split/v4k_gui.h
@@ -2,18 +2,19 @@
 // game ui
 
 typedef struct guiskin_t {
-    void (*drawrect)(void* userdata, const char *skin, vec4 rect);
-    void (*getskinsize)(void* userdata, const char *skin, vec2 *size);
-    void (*getscissorrect)(void* userdata, const char *skin, vec4 rect, vec4 *dims);
-    bool (*ismouseinrect)(void* userdata, const char *skin, vec4 rect);
+    void (*drawrect)(void* userdata, const char *skin, const char *fallback, vec4 rect);
+    void (*getskinsize)(void* userdata, const char *skin, const char *fallback, vec2 *size);
+    void (*getscissorrect)(void* userdata, const char *skin, const char *fallback, vec4 rect, vec4 *dims);
+    bool (*ismouseinrect)(void* userdata, const char *skin, const char *fallback, vec4 rect);
     void (*free)(void* userdata);
     void *userdata;
 } guiskin_t;
 
 API void    gui_pushskin(guiskin_t skin);
 API void*       gui_userdata();
-API vec2        gui_getskinsize(const char *skin);
-API bool        gui_ismouseinrect(const char *skin, vec4 rect);
+API vec2        gui_getskinsize(const char *skin, const char *fallback);
+API bool        gui_ismouseinrect(const char *skin, const char *fallback, vec4 rect);
+API vec4        gui_getscissorrect(const char *skin, const char *fallback, vec4 rect);
 // --
 API void        gui_panel_id(int id, vec4 rect, const char *skin);
 API void            gui_rect_id(int id, vec4 rect, const char *skin);
diff --git a/engine/split/v4k_render.h b/engine/split/v4k_render.h
index 4f62d92..e784c97 100644
--- a/engine/split/v4k_render.h
+++ b/engine/split/v4k_render.h
@@ -577,6 +577,13 @@ typedef struct anims_t {
 
 API anims_t animations(const char *pathfile, int flags);
 
+// -----------------------------------------------------------------------------
+// lightmapping utils
+
+typedef struct lightmap_t {
+    struct lm_context *ctx; // private
+} lightmap_t;
+
 // -----------------------------------------------------------------------------
 // skyboxes
 
diff --git a/engine/v4k b/engine/v4k
index 7852c92..55b771d 100644
--- a/engine/v4k
+++ b/engine/v4k
@@ -13981,6 +13981,7 @@ int gladLoadGL( GLADloadfunc load) {
 #define BQ_PLATFORM_IMPLEMENTATION            // websocket
 #define BQ_WEBSOCKET_IMPLEMENTATION           // websocket
 #define XML_C                                 // xml
+#define LIGHTMAPPER_IMPLEMENTATION            // lightmapper
 #ifdef __APPLE__
 #define MA_NO_RUNTIME_LINKING                 // miniaudio osx
 #define _GLFW_COCOA                           // glfw osx
@@ -329043,4 +329044,1768 @@ void lt_tick(struct lua_State *L) {
 }
 #line 0
 
+#line 1 "3rd_lightmapper.h"
+/***********************************************************
+* A single header file OpenGL lightmapping library         *
+* https://github.com/ands/lightmapper                      *
+* no warranty implied | use at your own risk               *
+* author: Andreas Mantler (ands) | last change: 10.05.2018 *
+*                                                          *
+* License:                                                 *
+* This software is in the public domain.                   *
+* Where that dedication is not recognized,                 *
+* you are granted a perpetual, irrevocable license to copy *
+* and modify this file however you want.                   *
+***********************************************************/
+
+#ifndef LIGHTMAPPER_H
+#define LIGHTMAPPER_H
+
+#ifdef __cplusplus
+#define LM_DEFAULT_VALUE(value) = value
+#else
+#define LM_DEFAULT_VALUE(value)
+#endif
+
+#ifndef LM_CALLOC
+#define LM_CALLOC(count, size) calloc(count, size)
+#endif
+
+#ifndef LM_FREE
+#define LM_FREE(ptr) free(ptr)
+#endif
+
+typedef int lm_bool;
+#define LM_FALSE 0
+#define LM_TRUE  1
+
+typedef int lm_type;
+#define LM_NONE           0
+#define LM_UNSIGNED_BYTE  GL_UNSIGNED_BYTE
+#define LM_UNSIGNED_SHORT GL_UNSIGNED_SHORT
+#define LM_UNSIGNED_INT   GL_UNSIGNED_INT
+#define LM_FLOAT          GL_FLOAT
+
+typedef struct lm_context lm_context;
+
+// creates a lightmapper instance. it can be used to render multiple lightmaps.
+lm_context *lmCreate(
+	int hemisphereSize,                                                                                // hemisphereSize: resolution of the hemisphere renderings. must be a power of two! typical: 64.
+	float zNear, float zFar,                                                                           // zNear/zFar: hemisphere min/max draw distances.
+	float clearR, float clearG, float clearB,                                                          // clear color / background color / sky color.
+	int interpolationPasses, float interpolationThreshold,                                             // passes: hierarchical selective interpolation passes (0-8; initial step size = 2^passes).
+	                                                                                                   // threshold: error value below which lightmap pixels are interpolated instead of rendered.
+	                                                                                                   // use output image from LM_DEBUG_INTERPOLATION to determine a good value.
+	                                                                                                   // values around and below 0.01 are probably ok.
+	                                                                                                   // the lower the value, the more hemispheres are rendered -> slower, but possibly better quality.
+	float cameraToSurfaceDistanceModifier LM_DEFAULT_VALUE(0.0f));                                     // modifier for the height of the rendered hemispheres above the surface
+	                                                                                                   // -1.0f => stick to surface, 0.0f => minimum height for interpolated surface normals,
+	                                                                                                   // > 0.0f => improves gradients on surfaces with interpolated normals due to the flat surface horizon,
+	                                                                                                   // but may introduce other artifacts.
+
+// optional: set material characteristics by specifying cos(theta)-dependent weights for incoming light.
+typedef float (*lm_weight_func)(float cos_theta, void *userdata);
+void lmSetHemisphereWeights(lm_context *ctx, lm_weight_func f, void *userdata);                        // precalculates weights for incoming light depending on its angle. (default: all weights are 1.0f)
+
+// specify an output lightmap image buffer with w * h * c * sizeof(float) bytes of memory.
+void lmSetTargetLightmap(lm_context *ctx, float *outLightmap, int w, int h, int c);                    // output HDR lightmap (linear 32bit float channels; c: 1->Greyscale, 2->Greyscale+Alpha, 3->RGB, 4->RGBA).
+
+// set the geometry to map to the currently set target lightmap (set the target lightmap before calling this!).
+void lmSetGeometry(lm_context *ctx,
+	const float *transformationMatrix,                                                                 // 4x4 object-to-world transform for the geometry or NULL (no transformation).
+	lm_type positionsType, const void *positionsXYZ, int positionsStride,                              // triangle mesh in object space.
+	lm_type normalsType, const void *normalsXYZ, int normalsStride,                                    // optional normals for the mesh in object space (Use LM_NONE type in case you only need flat surfaces).
+	lm_type lightmapCoordsType, const void *lightmapCoordsUV, int lightmapCoordsStride,                // lightmap atlas texture coordinates for the mesh [0..1]x[0..1] (integer types are normalized to 0..1 range).
+	int count, lm_type indicesType LM_DEFAULT_VALUE(LM_NONE), const void *indices LM_DEFAULT_VALUE(0));// if mesh indices are used, count = number of indices else count = number of vertices.
+
+
+// as long as lmBegin returns true, the scene has to be rendered with the
+// returned camera and view parameters to the currently bound framebuffer.
+// if lmBegin returns true, it must be followed by lmEnd after rendering!
+lm_bool lmBegin(lm_context *ctx,
+	int* outViewport4,                                                                                 // output of the current viewport: { x, y, w, h }. use these to call glViewport()!
+	float* outView4x4,                                                                                 // output of the current camera view matrix.
+	float* outProjection4x4);                                                                          // output of the current camera projection matrix.
+
+float lmProgress(lm_context *ctx);                                                                     // should only be called between lmBegin/lmEnd!
+                                                                                                       // provides the light mapping progress as a value increasing from 0.0 to 1.0.
+
+void lmEnd(lm_context *ctx);
+
+// destroys the lightmapper instance. should be called to free resources.
+void lmDestroy(lm_context *ctx);
+
+
+// image based post processing (c is the number of color channels in the image, m a channel mask for the operation)
+#define LM_ALL_CHANNELS 0x0f
+float lmImageMin(const float *image, int w, int h, int c, int m LM_DEFAULT_VALUE(LM_ALL_CHANNELS));                    // find the minimum value (across the specified channels)
+float lmImageMax(const float *image, int w, int h, int c, int m LM_DEFAULT_VALUE(LM_ALL_CHANNELS));                    // find the maximum value (across the specified channels)
+void lmImageAdd(float *image, int w, int h, int c, float value, int m LM_DEFAULT_VALUE(LM_ALL_CHANNELS));              // in-place add to the specified channels
+void lmImageScale(float *image, int w, int h, int c, float factor, int m LM_DEFAULT_VALUE(LM_ALL_CHANNELS));           // in-place scaling of the specified channels
+void lmImagePower(float *image, int w, int h, int c, float exponent, int m LM_DEFAULT_VALUE(LM_ALL_CHANNELS));         // in-place powf(v, exponent) of the specified channels (for gamma)
+void lmImageDilate(const float *image, float *outImage, int w, int h, int c);                                          // widen the populated non-zero areas by 1 pixel.
+void lmImageSmooth(const float *image, float *outImage, int w, int h, int c);                                          // simple box filter on only the non-zero values.
+void lmImageDownsample(const float *image, float *outImage, int w, int h, int c);                                      // downsamples [0..w]x[0..h] to [0..w/2]x[0..h/2] by avereging only the non-zero values
+void lmImageFtoUB(const float *image, unsigned char *outImage, int w, int h, int c, float max LM_DEFAULT_VALUE(0.0f)); // casts a floating point image to an 8bit/channel image
+
+// TGA file output helpers
+lm_bool lmImageSaveTGAub(const char *filename, const unsigned char *image, int w, int h, int c);
+lm_bool lmImageSaveTGAf(const char *filename, const float *image, int w, int h, int c, float max LM_DEFAULT_VALUE(0.0f));
+
+#endif
+////////////////////// END OF HEADER //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+#ifdef LIGHTMAPPER_IMPLEMENTATION
+#undef LIGHTMAPPER_IMPLEMENTATION
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <float.h>
+#include <assert.h>
+#include <limits.h>
+
+#define LM_SWAP(type, a, b) { type tmp = (a); (a) = (b); (b) = tmp; }
+
+#if defined(_MSC_VER) && !defined(__cplusplus)
+#define inline __inline
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER <= 1700)
+static inline lm_bool lm_finite(float a) { return _finite(a); }
+#else
+static inline lm_bool lm_finite(float a) { return isfinite(a); }
+#endif
+
+static inline int      lm_mini      (int     a, int     b) { return a < b ? a : b; }
+static inline int      lm_maxi      (int     a, int     b) { return a > b ? a : b; }
+static inline int      lm_absi      (int     a           ) { return a < 0 ? -a : a; }
+static inline float    lm_minf      (float   a, float   b) { return a < b ? a : b; }
+static inline float    lm_maxf      (float   a, float   b) { return a > b ? a : b; }
+static inline float    lm_absf      (float   a           ) { return a < 0.0f ? -a : a; }
+static inline float    lm_pmodf     (float   a, float   b) { return (a < 0.0f ? 1.0f : 0.0f) + (float)fmod(a, b); } // positive mod
+
+typedef struct lm_ivec2 { int x, y; } lm_ivec2;
+static inline lm_ivec2 lm_i2        (int     x, int     y) { lm_ivec2 v = { x, y }; return v; }
+
+typedef struct lm_vec2 { float x, y; } lm_vec2;
+static inline lm_vec2  lm_v2i       (int     x, int     y) { lm_vec2 v = { (float)x, (float)y }; return v; }
+static inline lm_vec2  lm_v2        (float   x, float   y) { lm_vec2 v = { x, y }; return v; }
+static inline lm_vec2  lm_negate2   (lm_vec2 a           ) { return lm_v2(-a.x, -a.y); }
+static inline lm_vec2  lm_add2      (lm_vec2 a, lm_vec2 b) { return lm_v2(a.x + b.x, a.y + b.y); }
+static inline lm_vec2  lm_sub2      (lm_vec2 a, lm_vec2 b) { return lm_v2(a.x - b.x, a.y - b.y); }
+static inline lm_vec2  lm_mul2      (lm_vec2 a, lm_vec2 b) { return lm_v2(a.x * b.x, a.y * b.y); }
+static inline lm_vec2  lm_scale2    (lm_vec2 a, float   b) { return lm_v2(a.x * b, a.y * b); }
+static inline lm_vec2  lm_div2      (lm_vec2 a, float   b) { return lm_scale2(a, 1.0f / b); }
+static inline lm_vec2  lm_pmod2     (lm_vec2 a, float   b) { return lm_v2(lm_pmodf(a.x, b), lm_pmodf(a.y, b)); }
+static inline lm_vec2  lm_min2      (lm_vec2 a, lm_vec2 b) { return lm_v2(lm_minf(a.x, b.x), lm_minf(a.y, b.y)); }
+static inline lm_vec2  lm_max2      (lm_vec2 a, lm_vec2 b) { return lm_v2(lm_maxf(a.x, b.x), lm_maxf(a.y, b.y)); }
+static inline lm_vec2  lm_abs2      (lm_vec2 a           ) { return lm_v2(lm_absf(a.x), lm_absf(a.y)); }
+static inline lm_vec2  lm_floor2    (lm_vec2 a           ) { return lm_v2(floorf(a.x), floorf(a.y)); }
+static inline lm_vec2  lm_ceil2     (lm_vec2 a           ) { return lm_v2(ceilf (a.x), ceilf (a.y)); }
+static inline float    lm_dot2      (lm_vec2 a, lm_vec2 b) { return a.x * b.x + a.y * b.y; }
+static inline float    lm_cross2    (lm_vec2 a, lm_vec2 b) { return a.x * b.y - a.y * b.x; } // pseudo cross product
+static inline float    lm_length2sq (lm_vec2 a           ) { return a.x * a.x + a.y * a.y; }
+static inline float    lm_length2   (lm_vec2 a           ) { return sqrtf(lm_length2sq(a)); }
+static inline lm_vec2  lm_normalize2(lm_vec2 a           ) { return lm_div2(a, lm_length2(a)); }
+static inline lm_bool  lm_finite2   (lm_vec2 a           ) { return lm_finite(a.x) && lm_finite(a.y); }
+
+typedef struct lm_vec3 { float x, y, z; } lm_vec3;
+static inline lm_vec3  lm_v3        (float   x, float   y, float   z) { lm_vec3 v = { x, y, z }; return v; }
+static inline lm_vec3  lm_negate3   (lm_vec3 a           ) { return lm_v3(-a.x, -a.y, -a.z); }
+static inline lm_vec3  lm_add3      (lm_vec3 a, lm_vec3 b) { return lm_v3(a.x + b.x, a.y + b.y, a.z + b.z); }
+static inline lm_vec3  lm_sub3      (lm_vec3 a, lm_vec3 b) { return lm_v3(a.x - b.x, a.y - b.y, a.z - b.z); }
+static inline lm_vec3  lm_mul3      (lm_vec3 a, lm_vec3 b) { return lm_v3(a.x * b.x, a.y * b.y, a.z * b.z); }
+static inline lm_vec3  lm_scale3    (lm_vec3 a, float   b) { return lm_v3(a.x * b, a.y * b, a.z * b); }
+static inline lm_vec3  lm_div3      (lm_vec3 a, float   b) { return lm_scale3(a, 1.0f / b); }
+static inline lm_vec3  lm_pmod3     (lm_vec3 a, float   b) { return lm_v3(lm_pmodf(a.x, b), lm_pmodf(a.y, b), lm_pmodf(a.z, b)); }
+static inline lm_vec3  lm_min3      (lm_vec3 a, lm_vec3 b) { return lm_v3(lm_minf(a.x, b.x), lm_minf(a.y, b.y), lm_minf(a.z, b.z)); }
+static inline lm_vec3  lm_max3      (lm_vec3 a, lm_vec3 b) { return lm_v3(lm_maxf(a.x, b.x), lm_maxf(a.y, b.y), lm_maxf(a.z, b.z)); }
+static inline lm_vec3  lm_abs3      (lm_vec3 a           ) { return lm_v3(lm_absf(a.x), lm_absf(a.y), lm_absf(a.z)); }
+static inline lm_vec3  lm_floor3    (lm_vec3 a           ) { return lm_v3(floorf(a.x), floorf(a.y), floorf(a.z)); }
+static inline lm_vec3  lm_ceil3     (lm_vec3 a           ) { return lm_v3(ceilf (a.x), ceilf (a.y), ceilf (a.z)); }
+static inline float    lm_dot3      (lm_vec3 a, lm_vec3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; }
+static inline lm_vec3  lm_cross3    (lm_vec3 a, lm_vec3 b) { return lm_v3(a.y * b.z - b.y * a.z, a.z * b.x - b.z * a.x, a.x * b.y - b.x * a.y); }
+static inline float    lm_length3sq (lm_vec3 a           ) { return a.x * a.x + a.y * a.y + a.z * a.z; }
+static inline float    lm_length3   (lm_vec3 a           ) { return sqrtf(lm_length3sq(a)); }
+static inline lm_vec3  lm_normalize3(lm_vec3 a           ) { return lm_div3(a, lm_length3(a)); }
+static inline lm_bool  lm_finite3   (lm_vec3 a           ) { return lm_finite(a.x) && lm_finite(a.y) && lm_finite(a.z); }
+
+static lm_vec2 lm_toBarycentric(lm_vec2 p1, lm_vec2 p2, lm_vec2 p3, lm_vec2 p)
+{
+	// http://www.blackpawn.com/texts/pointinpoly/
+	// Compute vectors
+	lm_vec2 v0 = lm_sub2(p3, p1);
+	lm_vec2 v1 = lm_sub2(p2, p1);
+	lm_vec2 v2 = lm_sub2(p, p1);
+	// Compute dot products
+	float dot00 = lm_dot2(v0, v0);
+	float dot01 = lm_dot2(v0, v1);
+	float dot02 = lm_dot2(v0, v2);
+	float dot11 = lm_dot2(v1, v1);
+	float dot12 = lm_dot2(v1, v2);
+	// Compute barycentric coordinates
+	float invDenom = 1.0f / (dot00 * dot11 - dot01 * dot01);
+	float u = (dot11 * dot02 - dot01 * dot12) * invDenom;
+	float v = (dot00 * dot12 - dot01 * dot02) * invDenom;
+	return lm_v2(u, v);
+}
+
+static inline int lm_leftOf(lm_vec2 a, lm_vec2 b, lm_vec2 c)
+{
+	float x = lm_cross2(lm_sub2(b, a), lm_sub2(c, b));
+	return x < 0 ? -1 : x > 0;
+}
+
+static lm_bool lm_lineIntersection(lm_vec2 x0, lm_vec2 x1, lm_vec2 y0, lm_vec2 y1, lm_vec2* res)
+{
+	lm_vec2 dx = lm_sub2(x1, x0);
+	lm_vec2 dy = lm_sub2(y1, y0);
+	lm_vec2 d = lm_sub2(x0, y0);
+	float dyx = lm_cross2(dy, dx);
+	if (dyx == 0.0f)
+		return LM_FALSE;
+	dyx = lm_cross2(d, dx) / dyx;
+	if (dyx <= 0 || dyx >= 1)
+		return LM_FALSE;
+	res->x = y0.x + dyx * dy.x;
+	res->y = y0.y + dyx * dy.y;
+	return LM_TRUE;
+}
+
+// this modifies the poly array! the poly array must be big enough to hold the result!
+// res must be big enough to hold the result!
+static int lm_convexClip(lm_vec2 *poly, int nPoly, const lm_vec2 *clip, int nClip, lm_vec2 *res)
+{
+	int nRes = nPoly;
+	int dir = lm_leftOf(clip[0], clip[1], clip[2]);
+	for (int i = 0, j = nClip - 1; i < nClip && nRes; j = i++)
+	{
+		if (i != 0)
+			for (nPoly = 0; nPoly < nRes; nPoly++)
+				poly[nPoly] = res[nPoly];
+		nRes = 0;
+		lm_vec2 v0 = poly[nPoly - 1];
+		int side0 = lm_leftOf(clip[j], clip[i], v0);
+		if (side0 != -dir)
+			res[nRes++] = v0;
+		for (int k = 0; k < nPoly; k++)
+		{
+			lm_vec2 v1 = poly[k], x;
+			int side1 = lm_leftOf(clip[j], clip[i], v1);
+			if (side0 + side1 == 0 && side0 && lm_lineIntersection(clip[j], clip[i], v0, v1, &x))
+				res[nRes++] = x;
+			if (k == nPoly - 1)
+				break;
+			if (side1 != -dir)
+				res[nRes++] = v1;
+			v0 = v1;
+			side0 = side1;
+		}
+	}
+
+	return nRes;
+}
+
+struct lm_context
+{
+	struct
+	{
+		const float *modelMatrix;
+		float normalMatrix[9];
+
+		const unsigned char *positions;
+		lm_type positionsType;
+		int positionsStride;
+		const unsigned char *normals;
+		lm_type normalsType;
+		int normalsStride;
+		const unsigned char *uvs;
+		lm_type uvsType;
+		int uvsStride;
+		const unsigned char *indices;
+		lm_type indicesType;
+		unsigned int count;
+	} mesh;
+
+	struct
+	{
+		int pass;
+		int passCount;
+
+		struct
+		{
+			unsigned int baseIndex;
+			lm_vec3 p[3];
+			lm_vec3 n[3];
+			lm_vec2 uv[3];
+		} triangle;
+
+		struct
+		{
+			int minx, miny;
+			int maxx, maxy;
+			int x, y;
+		} rasterizer;
+
+		struct
+		{
+			lm_vec3 position;
+			lm_vec3 direction;
+			lm_vec3 up;
+		} sample;
+
+		struct
+		{
+			int side;
+		} hemisphere;
+	} meshPosition;
+
+	struct
+	{
+		int width;
+		int height;
+		int channels;
+		float *data;
+
+#ifdef LM_DEBUG_INTERPOLATION
+		unsigned char *debug;
+#endif
+	} lightmap;
+
+	struct
+	{
+		unsigned int size;
+		float zNear, zFar;
+		float cameraToSurfaceDistanceModifier;
+		struct { float r, g, b; } clearColor;
+
+		unsigned int fbHemiCountX;
+		unsigned int fbHemiCountY;
+		unsigned int fbHemiIndex;
+		lm_ivec2 *fbHemiToLightmapLocation;
+		GLuint fbTexture[2];
+		GLuint fb[2];
+		GLuint fbDepth;
+		GLuint vao;
+		struct
+		{
+			GLuint programID;
+			GLuint hemispheresTextureID;
+			GLuint weightsTextureID;
+			GLuint weightsTexture;
+		} firstPass;
+		struct
+		{
+			GLuint programID;
+			GLuint hemispheresTextureID;
+		} downsamplePass;
+		struct
+		{
+			GLuint texture;
+			lm_ivec2 writePosition;
+			lm_ivec2 *toLightmapLocation;
+		} storage;
+	} hemisphere;
+
+	float interpolationThreshold;
+};
+
+// pass order of one 4x4 interpolation patch for two interpolation steps (and the next neighbors right of/below it)
+// 0 4 1 4 0
+// 5 6 5 6 5
+// 2 4 3 4 2
+// 5 6 5 6 5
+// 0 4 1 4 0
+
+static unsigned int lm_passStepSize(lm_context *ctx)
+{
+	unsigned int shift = ctx->meshPosition.passCount / 3 - (ctx->meshPosition.pass - 1) / 3;
+	unsigned int step = (1 << shift);
+	assert(step > 0);
+	return step;
+}
+
+static unsigned int lm_passOffsetX(lm_context *ctx)
+{
+	if (!ctx->meshPosition.pass)
+		return 0;
+	int passType = (ctx->meshPosition.pass - 1) % 3;
+	unsigned int halfStep = lm_passStepSize(ctx) >> 1;
+	return passType != 1 ? halfStep : 0;
+}
+
+static unsigned int lm_passOffsetY(lm_context *ctx)
+{
+	if (!ctx->meshPosition.pass)
+		return 0;
+	int passType = (ctx->meshPosition.pass - 1) % 3;
+	unsigned int halfStep = lm_passStepSize(ctx) >> 1;
+	return passType != 0 ? halfStep : 0;
+}
+
+static lm_bool lm_hasConservativeTriangleRasterizerFinished(lm_context *ctx)
+{
+	return ctx->meshPosition.rasterizer.y >= ctx->meshPosition.rasterizer.maxy;
+}
+
+static void lm_moveToNextPotentialConservativeTriangleRasterizerPosition(lm_context *ctx)
+{
+	unsigned int step = lm_passStepSize(ctx);
+	ctx->meshPosition.rasterizer.x += step;
+	while (ctx->meshPosition.rasterizer.x >= ctx->meshPosition.rasterizer.maxx)
+	{
+		ctx->meshPosition.rasterizer.x = ctx->meshPosition.rasterizer.minx + lm_passOffsetX(ctx);
+		ctx->meshPosition.rasterizer.y += step;
+		if (lm_hasConservativeTriangleRasterizerFinished(ctx))
+			break;
+	}
+}
+
+static float *lm_getLightmapPixel(lm_context *ctx, int x, int y)
+{
+	assert(x >= 0 && x < ctx->lightmap.width && y >= 0 && y < ctx->lightmap.height);
+	return ctx->lightmap.data + (y * ctx->lightmap.width + x) * ctx->lightmap.channels;
+}
+
+static void lm_setLightmapPixel(lm_context *ctx, int x, int y, float *in)
+{
+	assert(x >= 0 && x < ctx->lightmap.width && y >= 0 && y < ctx->lightmap.height);
+	float *p = ctx->lightmap.data + (y * ctx->lightmap.width + x) * ctx->lightmap.channels;
+	for (int j = 0; j < ctx->lightmap.channels; j++)
+		*p++ = *in++;
+}
+
+#define lm_baseAngle 0.1f
+static const float lm_baseAngles[3][3] = {
+	{ lm_baseAngle, lm_baseAngle + 1.0f / 3.0f, lm_baseAngle + 2.0f / 3.0f },
+	{ lm_baseAngle + 1.0f / 3.0f, lm_baseAngle + 2.0f / 3.0f, lm_baseAngle },
+	{ lm_baseAngle + 2.0f / 3.0f, lm_baseAngle, lm_baseAngle + 1.0f / 3.0f }
+};
+
+static lm_bool lm_trySamplingConservativeTriangleRasterizerPosition(lm_context *ctx)
+{
+	if (lm_hasConservativeTriangleRasterizerFinished(ctx))
+		return LM_FALSE;
+
+	// check if lightmap pixel was already set
+	float *pixelValue = lm_getLightmapPixel(ctx, ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y);
+	for (int j = 0; j < ctx->lightmap.channels; j++)
+		if (pixelValue[j] != 0.0f)
+			return LM_FALSE;
+
+	// try calculating centroid by clipping the pixel against the triangle
+	lm_vec2 pixel[16];
+	pixel[0] = lm_v2i(ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y);
+	pixel[1] = lm_v2i(ctx->meshPosition.rasterizer.x + 1, ctx->meshPosition.rasterizer.y);
+	pixel[2] = lm_v2i(ctx->meshPosition.rasterizer.x + 1, ctx->meshPosition.rasterizer.y + 1);
+	pixel[3] = lm_v2i(ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y + 1);
+
+	lm_vec2 res[16];
+	int nRes = lm_convexClip(pixel, 4, ctx->meshPosition.triangle.uv, 3, res);
+	if (nRes == 0)
+		return LM_FALSE; // nothing left
+
+	// calculate centroid position and area
+	lm_vec2 centroid = res[0];
+	float area = res[nRes - 1].x * res[0].y - res[nRes - 1].y * res[0].x;
+	for (int i = 1; i < nRes; i++)
+	{
+		centroid = lm_add2(centroid, res[i]);
+		area += res[i - 1].x * res[i].y - res[i - 1].y * res[i].x;
+	}
+	centroid = lm_div2(centroid, (float)nRes);
+	area = lm_absf(area / 2.0f);
+
+	if (area <= 0.0f)
+		return LM_FALSE; // no area left
+
+	// calculate barycentric coords
+	lm_vec2 uv = lm_toBarycentric(
+		ctx->meshPosition.triangle.uv[0],
+		ctx->meshPosition.triangle.uv[1],
+		ctx->meshPosition.triangle.uv[2],
+		centroid);
+
+	if (!lm_finite2(uv))
+		return LM_FALSE; // degenerate
+
+	// try to interpolate color from neighbors:
+	if (ctx->meshPosition.pass > 0)
+	{
+		float *neighbors[4];
+		int neighborCount = 0;
+		int neighborsExpected = 0;
+		int d = (int)lm_passStepSize(ctx) / 2;
+		int dirs = ((ctx->meshPosition.pass - 1) % 3) + 1;
+		if (dirs & 1) // check x-neighbors with distance d
+		{
+			neighborsExpected += 2;
+			if (ctx->meshPosition.rasterizer.x - d >= ctx->meshPosition.rasterizer.minx &&
+				ctx->meshPosition.rasterizer.x + d <= ctx->meshPosition.rasterizer.maxx)
+			{
+				neighbors[neighborCount++] = lm_getLightmapPixel(ctx, ctx->meshPosition.rasterizer.x - d, ctx->meshPosition.rasterizer.y);
+				neighbors[neighborCount++] = lm_getLightmapPixel(ctx, ctx->meshPosition.rasterizer.x + d, ctx->meshPosition.rasterizer.y);
+			}
+		}
+		if (dirs & 2) // check y-neighbors with distance d
+		{
+			neighborsExpected += 2;
+			if (ctx->meshPosition.rasterizer.y - d >= ctx->meshPosition.rasterizer.miny &&
+				ctx->meshPosition.rasterizer.y + d <= ctx->meshPosition.rasterizer.maxy)
+			{
+				neighbors[neighborCount++] = lm_getLightmapPixel(ctx, ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y - d);
+				neighbors[neighborCount++] = lm_getLightmapPixel(ctx, ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y + d);
+			}
+		}
+		if (neighborCount == neighborsExpected) // are all interpolation neighbors available?
+		{
+			// calculate average neighbor pixel value
+			float avg[4] = { 0 };
+			for (int i = 0; i < neighborCount; i++)
+				for (int j = 0; j < ctx->lightmap.channels; j++)
+					avg[j] += neighbors[i][j];
+			float ni = 1.0f / neighborCount;
+			for (int j = 0; j < ctx->lightmap.channels; j++)
+				avg[j] *= ni;
+
+			// check if error from average pixel to neighbors is above the interpolation threshold
+			lm_bool interpolate = LM_TRUE;
+			for (int i = 0; i < neighborCount; i++)
+			{
+				lm_bool zero = LM_TRUE;
+				for (int j = 0; j < ctx->lightmap.channels; j++)
+				{
+					if (neighbors[i][j] != 0.0f)
+						zero = LM_FALSE;
+					if (fabs(neighbors[i][j] - avg[j]) > ctx->interpolationThreshold)
+						interpolate = LM_FALSE;
+				}
+				if (zero)
+					interpolate = LM_FALSE;
+				if (!interpolate)
+					break;
+			}
+
+			// set interpolated value and return if interpolation is acceptable
+			if (interpolate)
+			{
+				lm_setLightmapPixel(ctx, ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y, avg);
+#ifdef LM_DEBUG_INTERPOLATION
+				// set interpolated pixel to green in debug output
+				ctx->lightmap.debug[(ctx->meshPosition.rasterizer.y * ctx->lightmap.width + ctx->meshPosition.rasterizer.x) * 3 + 1] = 255;
+#endif
+				return LM_FALSE;
+			}
+		}
+	}
+
+	// could not interpolate. must render a hemisphere.
+	// calculate 3D sample position and orientation
+	lm_vec3 p0 = ctx->meshPosition.triangle.p[0];
+	lm_vec3 p1 = ctx->meshPosition.triangle.p[1];
+	lm_vec3 p2 = ctx->meshPosition.triangle.p[2];
+	lm_vec3 v1 = lm_sub3(p1, p0);
+	lm_vec3 v2 = lm_sub3(p2, p0);
+	ctx->meshPosition.sample.position = lm_add3(p0, lm_add3(lm_scale3(v2, uv.x), lm_scale3(v1, uv.y)));
+
+	lm_vec3 n0 = ctx->meshPosition.triangle.n[0];
+	lm_vec3 n1 = ctx->meshPosition.triangle.n[1];
+	lm_vec3 n2 = ctx->meshPosition.triangle.n[2];
+	lm_vec3 nv1 = lm_sub3(n1, n0);
+	lm_vec3 nv2 = lm_sub3(n2, n0);
+	ctx->meshPosition.sample.direction = lm_normalize3(lm_add3(n0, lm_add3(lm_scale3(nv2, uv.x), lm_scale3(nv1, uv.y))));
+	ctx->meshPosition.sample.direction = lm_normalize3(ctx->meshPosition.sample.direction);
+	float cameraToSurfaceDistance = (1.0f + ctx->hemisphere.cameraToSurfaceDistanceModifier) * ctx->hemisphere.zNear * sqrtf(2.0f);
+	ctx->meshPosition.sample.position = lm_add3(ctx->meshPosition.sample.position, lm_scale3(ctx->meshPosition.sample.direction, cameraToSurfaceDistance));
+
+	if (!lm_finite3(ctx->meshPosition.sample.position) ||
+		!lm_finite3(ctx->meshPosition.sample.direction) ||
+		lm_length3sq(ctx->meshPosition.sample.direction) < 0.5f) // don't allow 0.0f. should always be ~1.0f
+		return LM_FALSE;
+
+	lm_vec3 up = lm_v3(0.0f, 1.0f, 0.0f);
+	if (lm_absf(lm_dot3(up, ctx->meshPosition.sample.direction)) > 0.8f)
+		up = lm_v3(0.0f, 0.0f, 1.0f);
+
+#if 0
+	// triangle-consistent up vector
+	ctx->meshPosition.sample.up = lm_normalize3(lm_cross3(up, ctx->meshPosition.sample.direction));
+	return LM_TRUE;
+#else
+	// "randomized" rotation with pattern
+	lm_vec3 side = lm_normalize3(lm_cross3(up, ctx->meshPosition.sample.direction));
+	up = lm_normalize3(lm_cross3(side, ctx->meshPosition.sample.direction));
+	int rx = ctx->meshPosition.rasterizer.x % 3;
+	int ry = ctx->meshPosition.rasterizer.y % 3;
+	static const float lm_pi = 3.14159265358979f;
+	float phi = 2.0f * lm_pi * lm_baseAngles[ry][rx] + 0.1f * ((float)rand() / (float)RAND_MAX);
+	ctx->meshPosition.sample.up = lm_normalize3(lm_add3(lm_scale3(side, cosf(phi)), lm_scale3(up, sinf(phi))));
+	return LM_TRUE;
+#endif
+}
+
+// returns true if a sampling position was found and
+// false if we finished rasterizing the current triangle
+static lm_bool lm_findFirstConservativeTriangleRasterizerPosition(lm_context *ctx)
+{
+	while (!lm_trySamplingConservativeTriangleRasterizerPosition(ctx))
+	{
+		lm_moveToNextPotentialConservativeTriangleRasterizerPosition(ctx);
+		if (lm_hasConservativeTriangleRasterizerFinished(ctx))
+			return LM_FALSE;
+	}
+	return LM_TRUE;
+}
+
+static lm_bool lm_findNextConservativeTriangleRasterizerPosition(lm_context *ctx)
+{
+	lm_moveToNextPotentialConservativeTriangleRasterizerPosition(ctx);
+	return lm_findFirstConservativeTriangleRasterizerPosition(ctx);
+}
+
+static void lm_integrateHemisphereBatch(lm_context *ctx)
+{
+	if (!ctx->hemisphere.fbHemiIndex)
+		return; // nothing to do
+
+	glDisable(GL_DEPTH_TEST);
+	glBindVertexArray(ctx->hemisphere.vao);
+
+	int fbRead = 0;
+	int fbWrite = 1;
+
+	// weighted downsampling pass
+	int outHemiSize = ctx->hemisphere.size / 2;
+	glBindFramebuffer(GL_FRAMEBUFFER, ctx->hemisphere.fb[fbWrite]);
+	glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, ctx->hemisphere.fbTexture[fbWrite], 0);
+	glViewport(0, 0, outHemiSize * ctx->hemisphere.fbHemiCountX, outHemiSize * ctx->hemisphere.fbHemiCountY);
+	glUseProgram(ctx->hemisphere.firstPass.programID);
+	glUniform1i(ctx->hemisphere.firstPass.hemispheresTextureID, 0);
+	glActiveTexture(GL_TEXTURE0);
+	glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.fbTexture[fbRead]);
+	glUniform1i(ctx->hemisphere.firstPass.weightsTextureID, 1);
+	glActiveTexture(GL_TEXTURE1);
+	glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.firstPass.weightsTexture);
+	glActiveTexture(GL_TEXTURE0);
+	glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+	//glBindTexture(GL_TEXTURE_2D, 0);
+
+#if 0
+	// debug output
+	int w = outHemiSize * ctx->hemisphere.fbHemiCountX, h = outHemiSize * ctx->hemisphere.fbHemiCountY;
+	glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
+	glBindFramebuffer(GL_READ_FRAMEBUFFER, ctx->hemisphere.fb[fbWrite]);
+	glReadBuffer(GL_COLOR_ATTACHMENT0);
+	float *image = new float[3 * w * h];
+	glReadPixels(0, 0, w, h, GL_RGB, GL_FLOAT, image);
+	lmImageSaveTGAf("firstpass.png", image, w, h, 3);
+	delete[] image;
+#endif
+
+	// downsampling passes
+	glUseProgram(ctx->hemisphere.downsamplePass.programID);
+	glUniform1i(ctx->hemisphere.downsamplePass.hemispheresTextureID, 0);
+	while (outHemiSize > 1)
+	{
+		LM_SWAP(int, fbRead, fbWrite);
+		outHemiSize /= 2;
+		glBindFramebuffer(GL_FRAMEBUFFER, ctx->hemisphere.fb[fbWrite]);
+		glViewport(0, 0, outHemiSize * ctx->hemisphere.fbHemiCountX, outHemiSize * ctx->hemisphere.fbHemiCountY);
+		glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.fbTexture[fbRead]);
+		glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+		//glBindTexture(GL_TEXTURE_2D, 0);
+	}
+
+	// copy results to storage texture
+	glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.storage.texture);
+	glCopyTexSubImage2D(GL_TEXTURE_2D, 0,
+		ctx->hemisphere.storage.writePosition.x, ctx->hemisphere.storage.writePosition.y,
+		0, 0, ctx->hemisphere.fbHemiCountX, ctx->hemisphere.fbHemiCountY);
+	glBindTexture(GL_TEXTURE_2D, 0);
+	glBindFramebuffer(GL_FRAMEBUFFER, 0);
+	glBindVertexArray(0);
+	glEnable(GL_DEPTH_TEST);
+
+	// copy position mapping to storage
+	for (unsigned int y = 0; y < ctx->hemisphere.fbHemiCountY; y++)
+	{
+		int sy = ctx->hemisphere.storage.writePosition.y + y;
+		for (unsigned int x = 0; x < ctx->hemisphere.fbHemiCountX; x++)
+		{
+			int sx = ctx->hemisphere.storage.writePosition.x + x;
+			unsigned int hemiIndex = y * ctx->hemisphere.fbHemiCountX + x;
+			if (hemiIndex >= ctx->hemisphere.fbHemiIndex)
+				ctx->hemisphere.storage.toLightmapLocation[sy * ctx->lightmap.width + sx] = lm_i2(-1, -1);
+			else
+				ctx->hemisphere.storage.toLightmapLocation[sy * ctx->lightmap.width + sx] = ctx->hemisphere.fbHemiToLightmapLocation[hemiIndex];
+		}
+	}
+
+	// advance storage texture write position
+	ctx->hemisphere.storage.writePosition.x += ctx->hemisphere.fbHemiCountX;
+	if (ctx->hemisphere.storage.writePosition.x + (int)ctx->hemisphere.fbHemiCountX > ctx->lightmap.width)
+	{
+		ctx->hemisphere.storage.writePosition.x = 0;
+		ctx->hemisphere.storage.writePosition.y += ctx->hemisphere.fbHemiCountY;
+		assert(ctx->hemisphere.storage.writePosition.y + (int)ctx->hemisphere.fbHemiCountY < ctx->lightmap.height);
+	}
+
+	ctx->hemisphere.fbHemiIndex = 0;
+}
+
+static void lm_writeResultsToLightmap(lm_context *ctx)
+{
+	// do the GPU->CPU transfer of downsampled hemispheres
+	float *hemi = (float*)LM_CALLOC(ctx->lightmap.width * ctx->lightmap.height, 4 * sizeof(float));
+	glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.storage.texture);
+	glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA, GL_FLOAT, hemi);
+
+	// write results to lightmap texture
+	for (int y = 0; y < ctx->hemisphere.storage.writePosition.y + (int)ctx->hemisphere.fbHemiCountY; y++)
+	{
+		for (int x = 0; x < ctx->lightmap.width; x++)
+		{
+			lm_ivec2 lmUV = ctx->hemisphere.storage.toLightmapLocation[y * ctx->lightmap.width + x];
+			if (lmUV.x >= 0)
+			{
+				float *c = hemi + (y * ctx->lightmap.width + x) * 4;
+				float validity = c[3];
+				float *lm = ctx->lightmap.data + (lmUV.y * ctx->lightmap.width + lmUV.x) * ctx->lightmap.channels;
+				if (!lm[0] && validity > 0.9)
+				{
+					float scale = 1.0f / validity;
+					switch (ctx->lightmap.channels)
+					{
+					case 1:
+						lm[0] = lm_maxf((c[0] + c[1] + c[2]) * scale / 3.0f, FLT_MIN);
+						break;
+					case 2:
+						lm[0] = lm_maxf((c[0] + c[1] + c[2]) * scale / 3.0f, FLT_MIN);
+						lm[1] = 1.0f; // do we want to support this format?
+						break;
+					case 3:
+						lm[0] = lm_maxf(c[0] * scale, FLT_MIN);
+						lm[1] = lm_maxf(c[1] * scale, FLT_MIN);
+						lm[2] = lm_maxf(c[2] * scale, FLT_MIN);
+						break;
+					case 4:
+						lm[0] = lm_maxf(c[0] * scale, FLT_MIN);
+						lm[1] = lm_maxf(c[1] * scale, FLT_MIN);
+						lm[2] = lm_maxf(c[2] * scale, FLT_MIN);
+						lm[3] = 1.0f;
+						break;
+					default:
+						assert(LM_FALSE);
+						break;
+					}
+
+#ifdef LM_DEBUG_INTERPOLATION
+					// set sampled pixel to red in debug output
+					ctx->lightmap.debug[(lmUV.y * ctx->lightmap.width + lmUV.x) * 3 + 0] = 255;
+#endif
+				}
+			}
+			ctx->hemisphere.storage.toLightmapLocation[y * ctx->lightmap.width + x].x = -1; // reset
+		}
+	}
+
+	LM_FREE(hemi);
+	ctx->hemisphere.storage.writePosition = lm_i2(0, 0);
+}
+
+static void lm_setView(
+	int* viewport, int x, int y, int w, int h,
+	float* view,   lm_vec3 pos, lm_vec3 dir, lm_vec3 up,
+	float* proj,   float l, float r, float b, float t, float n, float f)
+{
+	// viewport
+	viewport[0] = x; viewport[1] = y; viewport[2] = w; viewport[3] = h;
+
+	// view matrix: lookAt(pos, pos + dir, up)
+	lm_vec3 side = lm_cross3(dir, up);
+	//up = cross(side, dir);
+	dir = lm_negate3(dir); pos = lm_negate3(pos);
+	view[ 0] = side.x;             view[ 1] = up.x;             view[ 2] = dir.x;             view[ 3] = 0.0f;
+	view[ 4] = side.y;             view[ 5] = up.y;             view[ 6] = dir.y;             view[ 7] = 0.0f;
+	view[ 8] = side.z;             view[ 9] = up.z;             view[10] = dir.z;             view[11] = 0.0f;
+	view[12] = lm_dot3(side, pos); view[13] = lm_dot3(up, pos); view[14] = lm_dot3(dir, pos); view[15] = 1.0f;
+
+	// projection matrix: frustum(l, r, b, t, n, f)
+	float ilr = 1.0f / (r - l), ibt = 1.0f / (t - b), ninf = -1.0f / (f - n), n2 = 2.0f * n;
+	proj[ 0] = n2 * ilr;      proj[ 1] = 0.0f;          proj[ 2] = 0.0f;           proj[ 3] = 0.0f;
+	proj[ 4] = 0.0f;          proj[ 5] = n2 * ibt;      proj[ 6] = 0.0f;           proj[ 7] = 0.0f;
+	proj[ 8] = (r + l) * ilr; proj[ 9] = (t + b) * ibt; proj[10] = (f + n) * ninf; proj[11] = -1.0f;
+	proj[12] = 0.0f;          proj[13] = 0.0f;          proj[14] = f * n2 * ninf;  proj[15] = 0.0f;
+}
+
+// returns true if a hemisphere side was prepared for rendering and
+// false if we finished the current hemisphere
+static lm_bool lm_beginSampleHemisphere(lm_context *ctx, int* viewport, float* view, float* proj)
+{
+	if (ctx->meshPosition.hemisphere.side >= 5)
+		return LM_FALSE;
+
+	if (ctx->meshPosition.hemisphere.side == 0)
+	{
+		// prepare hemisphere
+		glBindFramebuffer(GL_FRAMEBUFFER, ctx->hemisphere.fb[0]);
+		if (ctx->hemisphere.fbHemiIndex == 0)
+		{
+			// prepare hemisphere batch
+			glClearColor( // clear to valid background pixels!
+				ctx->hemisphere.clearColor.r,
+				ctx->hemisphere.clearColor.g,
+				ctx->hemisphere.clearColor.b, 1.0f);
+			glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+		}
+		ctx->hemisphere.fbHemiToLightmapLocation[ctx->hemisphere.fbHemiIndex] =
+			lm_i2(ctx->meshPosition.rasterizer.x, ctx->meshPosition.rasterizer.y);
+	}
+
+	// find the target position in the batch
+	int x = (ctx->hemisphere.fbHemiIndex % ctx->hemisphere.fbHemiCountX) * ctx->hemisphere.size * 3;
+	int y = (ctx->hemisphere.fbHemiIndex / ctx->hemisphere.fbHemiCountX) * ctx->hemisphere.size;
+
+	int size = ctx->hemisphere.size;
+	float zNear = ctx->hemisphere.zNear;
+	float zFar = ctx->hemisphere.zFar;
+
+	lm_vec3 pos = ctx->meshPosition.sample.position;
+	lm_vec3 dir = ctx->meshPosition.sample.direction;
+	lm_vec3 up = ctx->meshPosition.sample.up;
+	lm_vec3 right = lm_cross3(dir, up);
+
+	// find the view parameters of the hemisphere side that we will render next
+	// hemisphere layout in the framebuffer:
+	//       +-------+---+---+-------+
+	//       |       |   |   |   D   |
+	//       |   C   | R | L +-------+
+	//       |       |   |   |   U   |
+	//       +-------+---+---+-------+
+	switch (ctx->meshPosition.hemisphere.side)
+	{
+	case 0: // center
+		lm_setView(viewport, x, y, size, size,
+				   view,     pos, dir, up,
+				   proj,     -zNear, zNear, -zNear, zNear, zNear, zFar);
+		break;
+	case 1: // right
+		lm_setView(viewport, size + x, y, size / 2, size,
+				   view,     pos, right, up,
+				   proj,     -zNear, 0.0f, -zNear, zNear, zNear, zFar);
+		break;
+	case 2: // left
+		lm_setView(viewport, size + x + size / 2, y, size / 2, size,
+				   view,     pos, lm_negate3(right), up,
+				   proj,     0.0f, zNear, -zNear, zNear, zNear, zFar);
+		break;
+	case 3: // down
+		lm_setView(viewport, 2 * size + x, y + size / 2, size, size / 2,
+				   view,     pos, lm_negate3(up), dir,
+				   proj,     -zNear, zNear, 0.0f, zNear, zNear, zFar);
+		break;
+	case 4: // up
+		lm_setView(viewport, 2 * size + x, y, size, size / 2,
+				   view,     pos, up, lm_negate3(dir),
+				   proj,     -zNear, zNear, -zNear, 0.0f, zNear, zFar);
+		break;
+	default:
+		assert(LM_FALSE);
+		break;
+	}
+
+	return LM_TRUE;
+}
+
+static void lm_endSampleHemisphere(lm_context *ctx)
+{
+	if (++ctx->meshPosition.hemisphere.side == 5)
+	{
+		// finish hemisphere
+		glBindFramebuffer(GL_FRAMEBUFFER, 0);
+		if (++ctx->hemisphere.fbHemiIndex == ctx->hemisphere.fbHemiCountX * ctx->hemisphere.fbHemiCountY)
+		{
+			// downsample new hemisphere batch and store the results
+			lm_integrateHemisphereBatch(ctx);
+		}
+	}
+}
+
+static void lm_inverseTranspose(const float *m44, float *n33)
+{
+	if (!m44)
+	{
+		n33[0] = 1.0f; n33[1] = 0.0f; n33[2] = 0.0f;
+		n33[3] = 0.0f; n33[4] = 1.0f; n33[5] = 0.0f;
+		n33[6] = 0.0f; n33[7] = 0.0f; n33[8] = 1.0f;
+		return;
+	}
+
+	float determinant = m44[ 0] * (m44[ 5] * m44[10] - m44[ 9] * m44[ 6])
+					  - m44[ 1] * (m44[ 4] * m44[10] - m44[ 6] * m44[ 8])
+					  + m44[ 2] * (m44[ 4] * m44[ 9] - m44[ 5] * m44[ 8]);
+
+	assert(fabs(determinant) > FLT_EPSILON);
+	float rcpDeterminant = 1.0f / determinant;
+
+	n33[0] =  (m44[ 5] * m44[10] - m44[ 9] * m44[ 6]) * rcpDeterminant;
+	n33[3] = -(m44[ 1] * m44[10] - m44[ 2] * m44[ 9]) * rcpDeterminant;
+	n33[6] =  (m44[ 1] * m44[ 6] - m44[ 2] * m44[ 5]) * rcpDeterminant;
+	n33[1] = -(m44[ 4] * m44[10] - m44[ 6] * m44[ 8]) * rcpDeterminant;
+	n33[4] =  (m44[ 0] * m44[10] - m44[ 2] * m44[ 8]) * rcpDeterminant;
+	n33[7] = -(m44[ 0] * m44[ 6] - m44[ 4] * m44[ 2]) * rcpDeterminant;
+	n33[2] =  (m44[ 4] * m44[ 9] - m44[ 8] * m44[ 5]) * rcpDeterminant;
+	n33[5] = -(m44[ 0] * m44[ 9] - m44[ 8] * m44[ 1]) * rcpDeterminant;
+	n33[8] =  (m44[ 0] * m44[ 5] - m44[ 4] * m44[ 1]) * rcpDeterminant;
+}
+
+static lm_vec3 lm_transformNormal(const float *m, lm_vec3 n)
+{
+	lm_vec3 r;
+	r.x = m[0] * n.x + m[3] * n.y + m[6] * n.z;
+	r.y = m[1] * n.x + m[4] * n.y + m[7] * n.z;
+	r.z = m[2] * n.x + m[5] * n.y + m[8] * n.z;
+	return r;
+}
+
+static lm_vec3 lm_transformPosition(const float *m, lm_vec3 v)
+{
+	if (!m)
+		return v;
+	lm_vec3 r;
+	r.x =     m[0] * v.x + m[4] * v.y + m[ 8] * v.z + m[12];
+	r.y =     m[1] * v.x + m[5] * v.y + m[ 9] * v.z + m[13];
+	r.z =     m[2] * v.x + m[6] * v.y + m[10] * v.z + m[14];
+	float d = m[3] * v.x + m[7] * v.y + m[11] * v.z + m[15];
+	assert(lm_absf(d - 1.0f) < 0.00001f); // could divide by d, but this shouldn't be a projection transform!
+	return r;
+}
+
+static void lm_setMeshPosition(lm_context *ctx, unsigned int indicesTriangleBaseIndex)
+{
+	// fetch triangle at the specified indicesTriangleBaseIndex
+	ctx->meshPosition.triangle.baseIndex = indicesTriangleBaseIndex;
+
+	// load and transform triangle to process next
+	lm_vec2 uvMin = lm_v2(FLT_MAX, FLT_MAX), uvMax = lm_v2(-FLT_MAX, -FLT_MAX);
+	lm_vec2 uvScale = lm_v2i(ctx->lightmap.width, ctx->lightmap.height);
+	unsigned int vIndices[3];
+	for (int i = 0; i < 3; i++)
+	{
+		// decode index
+		unsigned int vIndex;
+		switch (ctx->mesh.indicesType)
+		{
+		case LM_NONE:
+			vIndex = ctx->meshPosition.triangle.baseIndex + i;
+			break;
+		case LM_UNSIGNED_BYTE:
+			vIndex = ((const unsigned char*)ctx->mesh.indices + ctx->meshPosition.triangle.baseIndex)[i];
+			break;
+		case LM_UNSIGNED_SHORT:
+			vIndex = ((const unsigned short*)ctx->mesh.indices + ctx->meshPosition.triangle.baseIndex)[i];
+			break;
+		case LM_UNSIGNED_INT:
+			vIndex = ((const unsigned int*)ctx->mesh.indices + ctx->meshPosition.triangle.baseIndex)[i];
+			break;
+		default:
+			assert(LM_FALSE);
+			break;
+		}
+		vIndices[i] = vIndex;
+
+		// decode and pre-transform vertex position
+		const void *pPtr = ctx->mesh.positions + vIndex * ctx->mesh.positionsStride;
+		lm_vec3 p;
+		switch (ctx->mesh.positionsType)
+		{
+		// TODO: signed formats
+		case LM_UNSIGNED_BYTE: {
+			const unsigned char *uc = (const unsigned char*)pPtr;
+			p = lm_v3(uc[0], uc[1], uc[2]);
+		} break;
+		case LM_UNSIGNED_SHORT: {
+			const unsigned short *us = (const unsigned short*)pPtr;
+			p = lm_v3(us[0], us[1], us[2]);
+		} break;
+		case LM_UNSIGNED_INT: {
+			const unsigned int *ui = (const unsigned int*)pPtr;
+			p = lm_v3((float)ui[0], (float)ui[1], (float)ui[2]);
+		} break;
+		case LM_FLOAT: {
+			p = *(const lm_vec3*)pPtr;
+		} break;
+		default: {
+			assert(LM_FALSE);
+		} break;
+		}
+		ctx->meshPosition.triangle.p[i] = lm_transformPosition(ctx->mesh.modelMatrix, p);
+
+		// decode and scale (to lightmap resolution) vertex lightmap texture coords
+		const void *uvPtr = ctx->mesh.uvs + vIndex * ctx->mesh.uvsStride;
+		lm_vec2 uv;
+		switch (ctx->mesh.uvsType)
+		{
+		case LM_UNSIGNED_BYTE: {
+			const unsigned char *uc = (const unsigned char*)uvPtr;
+			uv = lm_v2(uc[0] / (float)UCHAR_MAX, uc[1] / (float)UCHAR_MAX);
+		} break;
+		case LM_UNSIGNED_SHORT: {
+			const unsigned short *us = (const unsigned short*)uvPtr;
+			uv = lm_v2(us[0] / (float)USHRT_MAX, us[1] / (float)USHRT_MAX);
+		} break;
+		case LM_UNSIGNED_INT: {
+			const unsigned int *ui = (const unsigned int*)uvPtr;
+			uv = lm_v2(ui[0] / (float)UINT_MAX, ui[1] / (float)UINT_MAX);
+		} break;
+		case LM_FLOAT: {
+			uv = *(const lm_vec2*)uvPtr;
+		} break;
+		default: {
+			assert(LM_FALSE);
+		} break;
+		}
+
+		ctx->meshPosition.triangle.uv[i] = lm_mul2(lm_pmod2(uv, 1.0f), uvScale); // maybe clamp to 0.0-1.0 instead of pmod?
+
+		// update bounds on lightmap
+		uvMin = lm_min2(uvMin, ctx->meshPosition.triangle.uv[i]);
+		uvMax = lm_max2(uvMax, ctx->meshPosition.triangle.uv[i]);
+	}
+
+	lm_vec3 flatNormal = lm_cross3(
+		lm_sub3(ctx->meshPosition.triangle.p[1], ctx->meshPosition.triangle.p[0]),
+		lm_sub3(ctx->meshPosition.triangle.p[2], ctx->meshPosition.triangle.p[0]));
+
+	for (int i = 0; i < 3; i++)
+	{
+		// decode and pre-transform vertex normal
+		const void *nPtr = ctx->mesh.normals + vIndices[i] * ctx->mesh.normalsStride;
+		lm_vec3 n;
+		switch (ctx->mesh.normalsType)
+		{
+		// TODO: signed formats
+		case LM_FLOAT: {
+			n = *(const lm_vec3*)nPtr;
+		} break;
+		case LM_NONE: {
+			n = flatNormal;
+		} break;
+		default: {
+			assert(LM_FALSE);
+		} break;
+		}
+		ctx->meshPosition.triangle.n[i] = lm_normalize3(lm_transformNormal(ctx->mesh.normalMatrix, n));
+	}
+
+	// calculate area of interest (on lightmap) for conservative rasterization
+	lm_vec2 bbMin = lm_floor2(uvMin);
+	lm_vec2 bbMax = lm_ceil2 (uvMax);
+	ctx->meshPosition.rasterizer.minx = lm_maxi((int)bbMin.x - 1, 0);
+	ctx->meshPosition.rasterizer.miny = lm_maxi((int)bbMin.y - 1, 0);
+	ctx->meshPosition.rasterizer.maxx = lm_mini((int)bbMax.x + 1, ctx->lightmap.width - 1);
+	ctx->meshPosition.rasterizer.maxy = lm_mini((int)bbMax.y + 1, ctx->lightmap.height - 1);
+	assert(ctx->meshPosition.rasterizer.minx <= ctx->meshPosition.rasterizer.maxx &&
+		   ctx->meshPosition.rasterizer.miny <= ctx->meshPosition.rasterizer.maxy);
+	ctx->meshPosition.rasterizer.x = ctx->meshPosition.rasterizer.minx + lm_passOffsetX(ctx);
+	ctx->meshPosition.rasterizer.y = ctx->meshPosition.rasterizer.miny + lm_passOffsetY(ctx);
+
+	// try moving to first valid sample position
+	if (ctx->meshPosition.rasterizer.x <= ctx->meshPosition.rasterizer.maxx &&
+		ctx->meshPosition.rasterizer.y <= ctx->meshPosition.rasterizer.maxy &&
+		lm_findFirstConservativeTriangleRasterizerPosition(ctx))
+		ctx->meshPosition.hemisphere.side = 0; // we can start sampling the hemisphere
+	else
+		ctx->meshPosition.hemisphere.side = 5; // no samples on this triangle! put hemisphere sampler into finished state
+}
+
+static GLuint lm_LoadShader(GLenum type, const char *source)
+{
+	GLuint shader = glCreateShader(type);
+	if (shader == 0)
+	{
+		fprintf(stderr, "Could not create shader!\n");
+		return 0;
+	}
+	glShaderSource(shader, 1, &source, NULL);
+	glCompileShader(shader);
+	GLint compiled;
+	glGetShaderiv(shader, GL_COMPILE_STATUS, &compiled);
+	if (!compiled)
+	{
+		fprintf(stderr, "Could not compile shader!\n");
+		GLint infoLen = 0;
+		glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &infoLen);
+		if (infoLen)
+		{
+			char* infoLog = (char*)malloc(infoLen);
+			glGetShaderInfoLog(shader, infoLen, NULL, infoLog);
+			fprintf(stderr, "%s\n", infoLog);
+			free(infoLog);
+		}
+		glDeleteShader(shader);
+		return 0;
+	}
+	return shader;
+}
+
+static GLuint lm_LoadProgram(const char *vp, const char *fp)
+{
+	GLuint program = glCreateProgram();
+	if (program == 0)
+	{
+		fprintf(stderr, "Could not create program!\n");
+		return 0;
+	}
+	GLuint vertexShader = lm_LoadShader(GL_VERTEX_SHADER, vp);
+	GLuint fragmentShader = lm_LoadShader(GL_FRAGMENT_SHADER, fp);
+	glAttachShader(program, vertexShader);
+	glAttachShader(program, fragmentShader);
+	glLinkProgram(program);
+	glDeleteShader(vertexShader);
+	glDeleteShader(fragmentShader);
+	GLint linked;
+	glGetProgramiv(program, GL_LINK_STATUS, &linked);
+	if (!linked)
+	{
+		fprintf(stderr, "Could not link program!\n");
+		GLint infoLen = 0;
+		glGetProgramiv(program, GL_INFO_LOG_LENGTH, &infoLen);
+		if (infoLen)
+		{
+			char* infoLog = (char*)malloc(sizeof(char) * infoLen);
+			glGetProgramInfoLog(program, infoLen, NULL, infoLog);
+			fprintf(stderr, "%s\n", infoLog);
+			free(infoLog);
+		}
+		glDeleteProgram(program);
+		return 0;
+	}
+	return program;
+}
+
+static float lm_defaultWeights(float cos_theta, void *userdata)
+{
+	return 1.0f;
+}
+
+lm_context *lmCreate(int hemisphereSize, float zNear, float zFar,
+	float clearR, float clearG, float clearB,
+	int interpolationPasses, float interpolationThreshold,
+	float cameraToSurfaceDistanceModifier)
+{
+	assert(hemisphereSize == 512 || hemisphereSize == 256 || hemisphereSize == 128 ||
+		   hemisphereSize ==  64 || hemisphereSize ==  32 || hemisphereSize ==  16);
+	assert(zNear < zFar && zNear > 0.0f);
+	assert(cameraToSurfaceDistanceModifier >= -1.0f);
+	assert(interpolationPasses >= 0 && interpolationPasses <= 8);
+	assert(interpolationThreshold >= 0.0f);
+
+	lm_context *ctx = (lm_context*)LM_CALLOC(1, sizeof(lm_context));
+
+	ctx->meshPosition.passCount = 1 + 3 * interpolationPasses;
+	ctx->interpolationThreshold = interpolationThreshold;
+	ctx->hemisphere.size = hemisphereSize;
+	ctx->hemisphere.zNear = zNear;
+	ctx->hemisphere.zFar = zFar;
+	ctx->hemisphere.cameraToSurfaceDistanceModifier = cameraToSurfaceDistanceModifier;
+	ctx->hemisphere.clearColor.r = clearR;
+	ctx->hemisphere.clearColor.g = clearG;
+	ctx->hemisphere.clearColor.b = clearB;
+
+	// calculate hemisphere batch size
+	ctx->hemisphere.fbHemiCountX = 1536 / (3 * ctx->hemisphere.size);
+	ctx->hemisphere.fbHemiCountY = 512 / ctx->hemisphere.size;
+
+	// hemisphere batch framebuffers
+	unsigned int w[] = {
+		ctx->hemisphere.fbHemiCountX * ctx->hemisphere.size * 3,
+		ctx->hemisphere.fbHemiCountX * ctx->hemisphere.size / 2 };
+	unsigned int h[] = {
+		ctx->hemisphere.fbHemiCountY * ctx->hemisphere.size,
+		ctx->hemisphere.fbHemiCountY * ctx->hemisphere.size / 2 };
+
+	glGenTextures(2, ctx->hemisphere.fbTexture);
+	glGenFramebuffers(2, ctx->hemisphere.fb);
+	glGenRenderbuffers(1, &ctx->hemisphere.fbDepth);
+
+	glBindRenderbuffer(GL_RENDERBUFFER, ctx->hemisphere.fbDepth);
+	glRenderbufferStorage(GL_RENDERBUFFER, GL_DEPTH_COMPONENT24, w[0], h[0]);
+	glBindFramebuffer(GL_FRAMEBUFFER, ctx->hemisphere.fb[0]);
+	glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_RENDERBUFFER, ctx->hemisphere.fbDepth);
+	for (int i = 0; i < 2; i++)
+	{
+		glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.fbTexture[i]);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+		glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F, w[i], h[i], 0, GL_RGBA, GL_FLOAT, 0);
+
+		glBindFramebuffer(GL_FRAMEBUFFER, ctx->hemisphere.fb[i]);
+		glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, ctx->hemisphere.fbTexture[i], 0);
+		GLenum status = glCheckFramebufferStatus(GL_FRAMEBUFFER);
+		if (status != GL_FRAMEBUFFER_COMPLETE)
+		{
+			fprintf(stderr, "Could not create framebuffer!\n");
+			glDeleteRenderbuffers(1, &ctx->hemisphere.fbDepth);
+			glDeleteFramebuffers(2, ctx->hemisphere.fb);
+			glDeleteTextures(2, ctx->hemisphere.fbTexture);
+			LM_FREE(ctx);
+			return NULL;
+		}
+	}
+	glBindFramebuffer(GL_FRAMEBUFFER, 0);
+
+	// dummy vao for fullscreen quad rendering
+	glGenVertexArrays(1, &ctx->hemisphere.vao);
+
+	// hemisphere shader (weighted downsampling of the 3x1 hemisphere layout to a 0.5x0.5 square)
+	{
+		const char *vs =
+			"#version 150 core\n"
+			"const vec2 ps[4] = vec2[](vec2(1, -1), vec2(1, 1), vec2(-1, -1), vec2(-1, 1));\n"
+			"void main()\n"
+			"{\n"
+				"gl_Position = vec4(ps[gl_VertexID], 0, 1);\n"
+			"}\n";
+		const char *fs =
+			"#version 150 core\n"
+			"uniform sampler2D hemispheres;\n"
+			"uniform sampler2D weights;\n"
+
+			"layout(pixel_center_integer) in vec4 gl_FragCoord;\n" // whole integer values represent pixel centers, GL_ARB_fragment_coord_conventions
+
+			"out vec4 outColor;\n"
+
+			"vec4 weightedSample(ivec2 h_uv, ivec2 w_uv, ivec2 quadrant)\n"
+			"{\n"
+				"vec4 sample = texelFetch(hemispheres, h_uv + quadrant, 0);\n"
+				"vec2 weight = texelFetch(weights, w_uv + quadrant, 0).rg;\n"
+				"return vec4(sample.rgb * weight.r, sample.a * weight.g);\n"
+			"}\n"
+
+			"vec4 threeWeightedSamples(ivec2 h_uv, ivec2 w_uv, ivec2 offset)\n"
+			"{\n" // horizontal triple sum
+				"vec4 sum = weightedSample(h_uv, w_uv, offset);\n"
+				"sum += weightedSample(h_uv, w_uv, offset + ivec2(2, 0));\n"
+				"sum += weightedSample(h_uv, w_uv, offset + ivec2(4, 0));\n"
+				"return sum;\n"
+			"}\n"
+
+			"void main()\n"
+			"{\n" // this is a weighted sum downsampling pass (alpha component contains the weighted valid sample count)
+				"vec2 in_uv = gl_FragCoord.xy * vec2(6.0, 2.0) + vec2(0.5);\n"
+				"ivec2 h_uv = ivec2(in_uv);\n"
+				"ivec2 w_uv = ivec2(mod(in_uv, vec2(textureSize(weights, 0))));\n" // there's no integer modulo :(
+				"vec4 lb = threeWeightedSamples(h_uv, w_uv, ivec2(0, 0));\n"
+				"vec4 rb = threeWeightedSamples(h_uv, w_uv, ivec2(1, 0));\n"
+				"vec4 lt = threeWeightedSamples(h_uv, w_uv, ivec2(0, 1));\n"
+				"vec4 rt = threeWeightedSamples(h_uv, w_uv, ivec2(1, 1));\n"
+				"outColor = lb + rb + lt + rt;\n"
+			"}\n";
+		ctx->hemisphere.firstPass.programID = lm_LoadProgram(vs, fs);
+		if (!ctx->hemisphere.firstPass.programID)
+		{
+			fprintf(stderr, "Error loading the hemisphere first pass shader program... leaving!\n");
+			glDeleteVertexArrays(1, &ctx->hemisphere.vao);
+			glDeleteRenderbuffers(1, &ctx->hemisphere.fbDepth);
+			glDeleteFramebuffers(2, ctx->hemisphere.fb);
+			glDeleteTextures(2, ctx->hemisphere.fbTexture);
+			LM_FREE(ctx);
+			return NULL;
+		}
+		ctx->hemisphere.firstPass.hemispheresTextureID = glGetUniformLocation(ctx->hemisphere.firstPass.programID, "hemispheres");
+		ctx->hemisphere.firstPass.weightsTextureID = glGetUniformLocation(ctx->hemisphere.firstPass.programID, "weights");
+	}
+
+	// downsample shader
+	{
+		const char *vs =
+			"#version 150 core\n"
+			"const vec2 ps[4] = vec2[](vec2(1, -1), vec2(1, 1), vec2(-1, -1), vec2(-1, 1));\n"
+			"void main()\n"
+			"{\n"
+				"gl_Position = vec4(ps[gl_VertexID], 0, 1);\n"
+			"}\n";
+		const char *fs =
+			"#version 150 core\n"
+			"uniform sampler2D hemispheres;\n"
+
+			"layout(pixel_center_integer) in vec4 gl_FragCoord;\n" // whole integer values represent pixel centers, GL_ARB_fragment_coord_conventions
+
+			"out vec4 outColor;\n"
+
+			"void main()\n"
+			"{\n" // this is a sum downsampling pass (alpha component contains the weighted valid sample count)
+				"ivec2 h_uv = ivec2(gl_FragCoord.xy) * 2;\n"
+				"vec4 lb = texelFetch(hemispheres, h_uv + ivec2(0, 0), 0);\n"
+				"vec4 rb = texelFetch(hemispheres, h_uv + ivec2(1, 0), 0);\n"
+				"vec4 lt = texelFetch(hemispheres, h_uv + ivec2(0, 1), 0);\n"
+				"vec4 rt = texelFetch(hemispheres, h_uv + ivec2(1, 1), 0);\n"
+				"outColor = lb + rb + lt + rt;\n"
+			"}\n";
+		ctx->hemisphere.downsamplePass.programID = lm_LoadProgram(vs, fs);
+		if (!ctx->hemisphere.downsamplePass.programID)
+		{
+			fprintf(stderr, "Error loading the hemisphere downsample pass shader program... leaving!\n");
+			glDeleteProgram(ctx->hemisphere.firstPass.programID);
+			glDeleteVertexArrays(1, &ctx->hemisphere.vao);
+			glDeleteRenderbuffers(1, &ctx->hemisphere.fbDepth);
+			glDeleteFramebuffers(2, ctx->hemisphere.fb);
+			glDeleteTextures(2, ctx->hemisphere.fbTexture);
+			LM_FREE(ctx);
+			return NULL;
+		}
+		ctx->hemisphere.downsamplePass.hemispheresTextureID = glGetUniformLocation(ctx->hemisphere.downsamplePass.programID, "hemispheres");
+	}
+
+	// hemisphere weights texture
+	glGenTextures(1, &ctx->hemisphere.firstPass.weightsTexture);
+	lmSetHemisphereWeights(ctx, lm_defaultWeights, 0);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
+
+	// allocate batchPosition-to-lightmapPosition map
+	ctx->hemisphere.fbHemiToLightmapLocation = (lm_ivec2*)LM_CALLOC(ctx->hemisphere.fbHemiCountX * ctx->hemisphere.fbHemiCountY, sizeof(lm_ivec2));
+
+	return ctx;
+}
+
+void lmDestroy(lm_context *ctx)
+{
+	// reset state
+	glUseProgram(0);
+	glBindTexture(GL_TEXTURE_2D, 0);
+	glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
+	glBindVertexArray(0);
+	glBindFramebuffer(GL_FRAMEBUFFER, 0);
+	glBindFramebuffer(GL_READ_FRAMEBUFFER, 0);
+	glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
+
+	// delete gl objects
+	glDeleteTextures(1, &ctx->hemisphere.firstPass.weightsTexture);
+	glDeleteTextures(1, &ctx->hemisphere.storage.texture);
+	glDeleteProgram(ctx->hemisphere.downsamplePass.programID);
+	glDeleteProgram(ctx->hemisphere.firstPass.programID);
+	glDeleteVertexArrays(1, &ctx->hemisphere.vao);
+	glDeleteRenderbuffers(1, &ctx->hemisphere.fbDepth);
+	glDeleteFramebuffers(2, ctx->hemisphere.fb);
+	glDeleteTextures(2, ctx->hemisphere.fbTexture);
+	glDeleteTextures(1, &ctx->hemisphere.storage.texture);
+
+	// free memory
+	LM_FREE(ctx->hemisphere.storage.toLightmapLocation);
+	LM_FREE(ctx->hemisphere.fbHemiToLightmapLocation);
+#ifdef LM_DEBUG_INTERPOLATION
+	LM_FREE(ctx->lightmap.debug);
+#endif
+	LM_FREE(ctx);
+}
+
+void lmSetHemisphereWeights(lm_context *ctx, lm_weight_func f, void *userdata)
+{
+	// hemisphere weights texture. bakes in material dependent attenuation behaviour.
+	float *weights = (float*)LM_CALLOC(2 * 3 * ctx->hemisphere.size * ctx->hemisphere.size, sizeof(float));
+	float center = (ctx->hemisphere.size - 1) * 0.5f;
+	double sum = 0.0;
+	for (unsigned int y = 0; y < ctx->hemisphere.size; y++)
+	{
+		float dy = 2.0f * (y - center) / (float)ctx->hemisphere.size;
+		for (unsigned int x = 0; x < ctx->hemisphere.size; x++)
+		{
+			float dx = 2.0f * (x - center) / (float)ctx->hemisphere.size;
+			lm_vec3 v = lm_normalize3(lm_v3(dx, dy, 1.0f));
+
+			float solidAngle = v.z * v.z * v.z;
+
+			float *w0 = weights + 2 * (y * (3 * ctx->hemisphere.size) + x);
+			float *w1 = w0 + 2 * ctx->hemisphere.size;
+			float *w2 = w1 + 2 * ctx->hemisphere.size;
+
+			// center weights
+			w0[0] = solidAngle * f(v.z, userdata);
+			w0[1] = solidAngle;
+
+			// left/right side weights
+			w1[0] = solidAngle * f(lm_absf(v.x), userdata);
+			w1[1] = solidAngle;
+
+			// up/down side weights
+			w2[0] = solidAngle * f(lm_absf(v.y), userdata);
+			w2[1] = solidAngle;
+
+			sum += 3.0 * (double)solidAngle;
+		}
+	}
+
+	// normalize weights
+	float weightScale = (float)(1.0 / sum);
+	for (unsigned int i = 0; i < 2 * 3 * ctx->hemisphere.size * ctx->hemisphere.size; i++)
+		weights[i] *= weightScale;
+
+	// upload weight texture
+	glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.firstPass.weightsTexture);
+	glTexImage2D(GL_TEXTURE_2D, 0, GL_RG32F, 3 * ctx->hemisphere.size, ctx->hemisphere.size, 0, GL_RG, GL_FLOAT, weights);
+	LM_FREE(weights);
+}
+
+void lmSetTargetLightmap(lm_context *ctx, float *outLightmap, int w, int h, int c)
+{
+	ctx->lightmap.data = outLightmap;
+	ctx->lightmap.width = w;
+	ctx->lightmap.height = h;
+	ctx->lightmap.channels = c;
+
+	// allocate storage texture
+	if (!ctx->hemisphere.storage.texture)
+		glGenTextures(1, &ctx->hemisphere.storage.texture);
+	glBindTexture(GL_TEXTURE_2D, ctx->hemisphere.storage.texture);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+	glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F, w, h, 0, GL_RGBA, GL_FLOAT, 0);
+
+	// allocate storage position to lightmap position map
+	if (ctx->hemisphere.storage.toLightmapLocation)
+		LM_FREE(ctx->hemisphere.storage.toLightmapLocation);
+	ctx->hemisphere.storage.toLightmapLocation = (lm_ivec2*)LM_CALLOC(w * h, sizeof(lm_ivec2));
+	// invalidate all positions
+	for (int i = 0; i < w * h; i++)
+		ctx->hemisphere.storage.toLightmapLocation[i].x = -1;
+
+#ifdef LM_DEBUG_INTERPOLATION
+	if (ctx->lightmap.debug)
+		LM_FREE(ctx->lightmap.debug);
+	ctx->lightmap.debug = (unsigned char*)LM_CALLOC(ctx->lightmap.width * ctx->lightmap.height, 3);
+#endif
+}
+
+void lmSetGeometry(lm_context *ctx,
+	const float *transformationMatrix,
+	lm_type positionsType, const void *positionsXYZ, int positionsStride,
+	lm_type normalsType, const void *normalsXYZ, int normalsStride,
+	lm_type lightmapCoordsType, const void *lightmapCoordsUV, int lightmapCoordsStride,
+	int count, lm_type indicesType, const void *indices)
+{
+	ctx->mesh.modelMatrix = transformationMatrix;
+	ctx->mesh.positions = (const unsigned char*)positionsXYZ;
+	ctx->mesh.positionsType = positionsType;
+	ctx->mesh.positionsStride = positionsStride == 0 ? sizeof(lm_vec3) : positionsStride;
+	ctx->mesh.normals = (const unsigned char*)normalsXYZ;
+	ctx->mesh.normalsType = normalsType;
+	ctx->mesh.normalsStride = normalsStride == 0 ? sizeof(lm_vec3) : normalsStride;
+	ctx->mesh.uvs = (const unsigned char*)lightmapCoordsUV;
+	ctx->mesh.uvsType = lightmapCoordsType;
+	ctx->mesh.uvsStride = lightmapCoordsStride == 0 ? sizeof(lm_vec2) : lightmapCoordsStride;
+	ctx->mesh.indicesType = indicesType;
+	ctx->mesh.indices = (const unsigned char*)indices;
+	ctx->mesh.count = count;
+
+	lm_inverseTranspose(transformationMatrix, ctx->mesh.normalMatrix);
+
+	ctx->meshPosition.pass = 0;
+	lm_setMeshPosition(ctx, 0);
+}
+
+lm_bool lmBegin(lm_context *ctx, int* outViewport4, float* outView4x4, float* outProjection4x4)
+{
+	assert(ctx->meshPosition.triangle.baseIndex < ctx->mesh.count);
+	while (!lm_beginSampleHemisphere(ctx, outViewport4, outView4x4, outProjection4x4))
+	{ // as long as there are no hemisphere sides to render...
+		// try moving to the next rasterizer position
+		if (lm_findNextConservativeTriangleRasterizerPosition(ctx))
+		{ // if we successfully moved to the next sample position on the current triangle...
+			ctx->meshPosition.hemisphere.side = 0; // start sampling a hemisphere there
+		}
+		else
+		{ // if there are no valid sample positions on the current triangle...
+			if (ctx->meshPosition.triangle.baseIndex + 3 < ctx->mesh.count)
+			{ // ...and there are triangles left: move to the next triangle and continue sampling.
+				lm_setMeshPosition(ctx, ctx->meshPosition.triangle.baseIndex + 3);
+			}
+			else
+			{ // ...and there are no triangles left: finish
+				lm_integrateHemisphereBatch(ctx); // integrate and store last batch
+				lm_writeResultsToLightmap(ctx); // read storage data from gpu memory and write it to the lightmap
+
+				if (++ctx->meshPosition.pass == ctx->meshPosition.passCount)
+				{
+					ctx->meshPosition.pass = 0;
+					ctx->meshPosition.triangle.baseIndex = ctx->mesh.count; // set end condition (in case someone accidentally calls lmBegin again)
+
+#ifdef LM_DEBUG_INTERPOLATION
+					lmImageSaveTGAub("debug_interpolation.tga", ctx->lightmap.debug, ctx->lightmap.width, ctx->lightmap.height, 3);
+
+					// lightmap texel statistics
+					int rendered = 0, interpolated = 0, wasted = 0;
+					for (int y = 0; y < ctx->lightmap.height; y++)
+					{
+						for (int x = 0; x < ctx->lightmap.width; x++)
+						{
+							if (ctx->lightmap.debug[(y * ctx->lightmap.width + x) * 3 + 0])
+								rendered++;
+							else if (ctx->lightmap.debug[(y * ctx->lightmap.width + x) * 3 + 1])
+								interpolated++;
+							else
+								wasted++;
+						}
+					}
+					int used = rendered + interpolated;
+					int total = used + wasted;
+					printf("\n#######################################################################\n");
+					printf("%10d %6.2f%% rendered hemicubes integrated to lightmap texels.\n", rendered, 100.0f * (float)rendered / (float)total);
+					printf("%10d %6.2f%% interpolated lightmap texels.\n", interpolated, 100.0f * (float)interpolated / (float)total);
+					printf("%10d %6.2f%% wasted lightmap texels.\n", wasted, 100.0f * (float)wasted / (float)total);
+					printf("\n%17.2f%% of used texels were rendered.\n", 100.0f * (float)rendered / (float)used);
+					printf("#######################################################################\n");
+#endif
+
+					return LM_FALSE;
+				}
+
+				lm_setMeshPosition(ctx, 0); // start over with the next pass
+			}
+		}
+	}
+	return LM_TRUE;
+}
+
+float lmProgress(lm_context *ctx)
+{
+	float passProgress = (float)ctx->meshPosition.triangle.baseIndex / (float)ctx->mesh.count;
+	return ((float)ctx->meshPosition.pass + passProgress) / (float)ctx->meshPosition.passCount;
+}
+
+void lmEnd(lm_context *ctx)
+{
+	lm_endSampleHemisphere(ctx);
+}
+
+// these are not performance tuned since their impact on the whole lightmapping duration is insignificant
+float lmImageMin(const float *image, int w, int h, int c, int m)
+{
+	assert(c > 0 && m);
+	float minValue = FLT_MAX;
+	for (int i = 0; i < w * h; i++)
+		for (int j = 0; j < c; j++)
+			if (m & (1 << j))
+				minValue = lm_minf(minValue, image[i * c + j]);
+	return minValue;
+}
+
+float lmImageMax(const float *image, int w, int h, int c, int m)
+{
+	assert(c > 0 && m);
+	float maxValue = 0.0f;
+	for (int i = 0; i < w * h; i++)
+		for (int j = 0; j < c; j++)
+			if (m & (1 << j))
+				maxValue = lm_maxf(maxValue, image[i * c + j]);
+	return maxValue;
+}
+
+void lmImageAdd(float *image, int w, int h, int c, float value, int m)
+{
+	assert(c > 0 && m);
+	for (int i = 0; i < w * h; i++)
+		for (int j = 0; j < c; j++)
+			if (m & (1 << j))
+				image[i * c + j] += value;
+}
+
+void lmImageScale(float *image, int w, int h, int c, float factor, int m)
+{
+	assert(c > 0 && m);
+	for (int i = 0; i < w * h; i++)
+		for (int j = 0; j < c; j++)
+			if (m & (1 << j))
+				image[i * c + j] *= factor;
+}
+
+void lmImagePower(float *image, int w, int h, int c, float exponent, int m)
+{
+	assert(c > 0 && m);
+	for (int i = 0; i < w * h; i++)
+		for (int j = 0; j < c; j++)
+			if (m & (1 << j))
+				image[i * c + j] = powf(image[i * c + j], exponent);
+}
+
+void lmImageDilate(const float *image, float *outImage, int w, int h, int c)
+{
+	assert(c > 0 && c <= 4);
+	for (int y = 0; y < h; y++)
+	{
+		for (int x = 0; x < w; x++)
+		{
+			float color[4];
+			lm_bool valid = LM_FALSE;
+			for (int i = 0; i < c; i++)
+			{
+				color[i] = image[(y * w + x) * c + i];
+				valid |= color[i] > 0.0f;
+			}
+			if (!valid)
+			{
+				int n = 0;
+				const int dx[] = { -1, 0, 1,  0 };
+				const int dy[] = {  0, 1, 0, -1 };
+				for (int d = 0; d < 4; d++)
+				{
+					int cx = x + dx[d];
+					int cy = y + dy[d];
+					if (cx >= 0 && cx < w && cy >= 0 && cy < h)
+					{
+						float dcolor[4];
+						lm_bool dvalid = LM_FALSE;
+						for (int i = 0; i < c; i++)
+						{
+							dcolor[i] = image[(cy * w + cx) * c + i];
+							dvalid |= dcolor[i] > 0.0f;
+						}
+						if (dvalid)
+						{
+							for (int i = 0; i < c; i++)
+								color[i] += dcolor[i];
+							n++;
+						}
+					}
+				}
+				if (n)
+				{
+					float in = 1.0f / n;
+					for (int i = 0; i < c; i++)
+						color[i] *= in;
+				}
+			}
+			for (int i = 0; i < c; i++)
+				outImage[(y * w + x) * c + i] = color[i];
+		}
+	}
+}
+
+void lmImageSmooth(const float *image, float *outImage, int w, int h, int c)
+{
+	assert(c > 0 && c <= 4);
+	for (int y = 0; y < h; y++)
+	{
+		for (int x = 0; x < w; x++)
+		{
+			float color[4] = {0};
+			int n = 0;
+			for (int dy = -1; dy <= 1; dy++)
+			{
+				int cy = y + dy;
+				for (int dx = -1; dx <= 1; dx++)
+				{
+					int cx = x + dx;
+					if (cx >= 0 && cx < w && cy >= 0 && cy < h)
+					{
+						lm_bool valid = LM_FALSE;
+						for (int i = 0; i < c; i++)
+							valid |= image[(cy * w + cx) * c + i] > 0.0f;
+						if (valid)
+						{
+							for (int i = 0; i < c; i++)
+								color[i] += image[(cy * w + cx) * c + i];
+							n++;
+						}
+					}
+				}
+			}
+			for (int i = 0; i < c; i++)
+				outImage[(y * w + x) * c + i] = n ? color[i] / n : 0.0f;
+		}
+	}
+}
+
+void lmImageDownsample(const float *image, float *outImage, int w, int h, int c)
+{
+	assert(c > 0 && c <= 4);
+	for (int y = 0; y < h / 2; y++)
+	{
+		for (int x = 0; x < w / 2; x++)
+		{
+			int p0 = 2 * (y * w + x) * c;
+			int p1 = p0 + w * c;
+			int valid[2][2] = {0};
+			float sums[4] = {0};
+			for (int i = 0; i < c; i++)
+			{
+				valid[0][0] |= image[p0     + i] != 0.0f ? 1 : 0;
+				valid[0][1] |= image[p0 + c + i] != 0.0f ? 1 : 0;
+				valid[1][0] |= image[p1     + i] != 0.0f ? 1 : 0;
+				valid[1][1] |= image[p1 + c + i] != 0.0f ? 1 : 0;
+				sums[i] += image[p0 + i] + image[p0 + c + i] + image[p1 + i] + image[p1 + c + i];
+			}
+			int n = valid[0][0] + valid[0][1] + valid[1][0] + valid[1][1];
+			int p = (y * w / 2 + x) * c;
+			for (int i = 0; i < c; i++)
+				outImage[p + i] = n ? sums[i] / n : 0.0f;
+		}
+	}
+}
+
+void lmImageFtoUB(const float *image, unsigned char *outImage, int w, int h, int c, float max)
+{
+	assert(c > 0);
+	float scale = 255.0f / (max != 0.0f ? max : lmImageMax(image, w, h, c, LM_ALL_CHANNELS));
+	for (int i = 0; i < w * h * c; i++)
+		outImage[i] = (unsigned char)lm_minf(lm_maxf(image[i] * scale, 0.0f), 255.0f);
+}
+
+// TGA output helpers
+static void lm_swapRandBub(unsigned char *image, int w, int h, int c)
+{
+	assert(c >= 3);
+	for (int i = 0; i < w * h * c; i += c)
+		LM_SWAP(unsigned char, image[i], image[i + 2]);
+}
+
+lm_bool lmImageSaveTGAub(const char *filename, const unsigned char *image, int w, int h, int c)
+{
+	assert(c == 1 || c == 3 || c == 4);
+	lm_bool isGreyscale = c == 1;
+	lm_bool hasAlpha = c == 4;
+	unsigned char header[18] = {
+		0, 0, (unsigned char)(isGreyscale ? 3 : 2), 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		(unsigned char)(w & 0xff), (unsigned char)((w >> 8) & 0xff), (unsigned char)(h & 0xff), (unsigned char)((h >> 8) & 0xff),
+		(unsigned char)(8 * c), (unsigned char)(hasAlpha ? 8 : 0)
+	};
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+	FILE *file;
+	if (fopen_s(&file, filename, "wb") != 0) return LM_FALSE;
+#else
+	FILE *file = fopen(filename, "wb");
+	if (!file) return LM_FALSE;
+#endif
+	fwrite(header, 1, sizeof(header), file);
+
+	// we make sure to swap it back! trust me. :)
+	if (!isGreyscale)
+		lm_swapRandBub((unsigned char*)image, w, h, c);
+	fwrite(image, 1, w * h * c , file);
+	if (!isGreyscale)
+		lm_swapRandBub((unsigned char*)image, w, h, c);
+
+	fclose(file);
+	return LM_TRUE;
+}
+
+lm_bool lmImageSaveTGAf(const char *filename, const float *image, int w, int h, int c, float max)
+{
+	unsigned char *temp = (unsigned char*)LM_CALLOC(w * h * c, sizeof(unsigned char));
+	lmImageFtoUB(image, temp, w, h, c, max);
+	lm_bool success = lmImageSaveTGAub(filename, temp, w, h, c);
+	LM_FREE(temp);
+	return success;
+}
+
+#endif // LIGHTMAPPER_IMPLEMENTATION
+#line 0
+
 #endif // V4K_3RD
diff --git a/engine/v4k.c b/engine/v4k.c
index dc08b79..eb56f8d 100644
--- a/engine/v4k.c
+++ b/engine/v4k.c
@@ -11128,17 +11128,23 @@ void *gui_userdata() {
     return last_skin->userdata;
 }
 
-vec2 gui_getskinsize(const char *skin) {
+vec2 gui_getskinsize(const char *skin, const char *fallback) {
     vec2 size={0};
-    if (last_skin->getskinsize) last_skin->getskinsize(last_skin->userdata, skin, &size);
+    if (last_skin->getskinsize) last_skin->getskinsize(last_skin->userdata, skin, fallback, &size);
     return size;
 }
 
-bool gui_ismouseinrect(const char *skin, vec4 rect) {
-    if (last_skin->ismouseinrect) return last_skin->ismouseinrect(last_skin->userdata, skin, rect);
+bool gui_ismouseinrect(const char *skin, const char *fallback, vec4 rect) {
+    if (last_skin->ismouseinrect) return last_skin->ismouseinrect(last_skin->userdata, skin, fallback, rect);
     return false; 
 }
 
+vec4 gui_getscissorrect(const char *skin, const char *fallback, vec4 rect) {
+    vec4 scissor = rect;
+    if (last_skin->getscissorrect) last_skin->getscissorrect(last_skin->userdata, skin, fallback, rect, &scissor);
+    return scissor;
+}
+
 static
 gui_state_t *gui_getstate(int id) {
     if (!ctl_states) map_init(ctl_states, less_int, hash_int);
@@ -11148,8 +11154,8 @@ gui_state_t *gui_getstate(int id) {
 void gui_panel_id(int id, vec4 rect, const char *skin) {
     (void)id;
     vec4 scissor={0, 0, window_width(), window_height()};
-    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, skin, rect);
-    if (last_skin->getscissorrect) last_skin->getscissorrect(last_skin->userdata, skin, rect, &scissor);
+    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, skin, NULL, rect);
+    scissor = gui_getscissorrect(skin, NULL, rect);
 
     if (!array_count(scissor_rects))
         glEnable(GL_SCISSOR_TEST);
@@ -11174,7 +11180,7 @@ bool gui_button_id(int id, vec4 r, const char *skin) {
 
     skin=skin?skin:"button";
     char *btn = va("%s%s", skin, entry->held?"_press":entry->hover?"_hover":"");
-    if (gui_ismouseinrect(btn, r)) {
+    if (gui_ismouseinrect(btn, skin, r)) {
         if (input_up(MOUSE_L) && entry->held) {
             was_clicked=1;
         }
@@ -11184,24 +11190,27 @@ bool gui_button_id(int id, vec4 r, const char *skin) {
             entry->hover = true;
         }
     }
-    else if (input_up(MOUSE_L) && entry->held) {
-        entry->held = false;
-        any_widget_used = false;
-    }
     else {
         entry->hover = false;
     }
 
-    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, btn, r);
+    if (input_up(MOUSE_L) && entry->held) {
+        entry->held = false;
+        any_widget_used = false;
+    }
+
+    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, btn, skin, r);
 
     return was_clicked;
 }
 
 bool gui_button_label_id(int id, const char *text, vec4 rect, const char *skin) {
+    gui_state_t *entry = gui_getstate(id);
     bool state = gui_button_id(id, rect, skin);
     vec2 buttonsize={0};
     skin=skin?skin:"button";
-    if (last_skin->getskinsize) last_skin->getskinsize(last_skin->userdata, skin, &buttonsize);
+    char *btn = va("%s%s", skin, entry->held?"_press":entry->hover?"_hover":"");
+    buttonsize = gui_getskinsize(btn, skin);
 
     vec2 textsize = font_rect(text);
     vec2 pos;
@@ -11234,7 +11243,8 @@ bool gui_slider_id(int id, vec4 rect, const char *skin, float min, float max, fl
 
     skin = skin?skin:"slider";
     char *cursorskin = va("%s_cursor%s", skin, entry->held?"_press":entry->hover?"_hover":"");
-    if (gui_ismouseinrect(skin, rect) && !any_widget_used) {
+    char *fbcursor = va("%s_cursor", skin);
+    if (gui_ismouseinrect(skin, NULL, rect) && !any_widget_used) {
         any_widget_used = entry->held = input_held(MOUSE_L);
         entry->hover = true;
     }
@@ -11247,13 +11257,12 @@ bool gui_slider_id(int id, vec4 rect, const char *skin, float min, float max, fl
     }
 
     float old_value = *value;
-    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, skin, rect);
+    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, skin, NULL, rect);
 
     vec2 slidersize={0}, cursorsize={0};
-    vec4 usablerect=rect;
-    if (last_skin->getscissorrect) last_skin->getscissorrect(last_skin->userdata, skin, rect, &usablerect);
-    if (last_skin->getskinsize) last_skin->getskinsize(last_skin->userdata, skin, &slidersize);
-    if (last_skin->getskinsize) last_skin->getskinsize(last_skin->userdata, cursorskin, &cursorsize);
+    vec4 usablerect = gui_getscissorrect(skin, NULL, rect);
+    slidersize = gui_getskinsize(skin, NULL);
+    cursorsize = gui_getskinsize(cursorskin, fbcursor);
     if (entry->held) {
         *value = posx2slider(usablerect, min, max, input(MOUSE_X), step);
     }
@@ -11264,7 +11273,7 @@ bool gui_slider_id(int id, vec4 rect, const char *skin, float min, float max, fl
     cursorrect.y += cursorpos.y;
     cursorrect.z = cursorsize.x;
     cursorrect.w = cursorsize.y;
-    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, cursorskin, cursorrect);
+    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, cursorskin, fbcursor, cursorrect);
 
     return entry->held && (old_value!=*value);
 }
@@ -11273,7 +11282,7 @@ bool gui_slider_label_id(int id, const char *text, vec4 rect, const char *skin,
     bool state = gui_slider_id(id, rect, skin, min, max, step, value);
     vec2 slidersize={0};
     skin=skin?skin:"slider";
-    if (last_skin->getskinsize) last_skin->getskinsize(last_skin->userdata, skin, &slidersize);
+    slidersize = gui_getskinsize(skin, NULL);
 
     vec2 textsize = font_rect(text);
     vec2 pos;
@@ -11286,7 +11295,7 @@ bool gui_slider_label_id(int id, const char *text, vec4 rect, const char *skin,
 
 void gui_rect_id(int id, vec4 r, const char *skin) {
     (void)id;
-    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, skin, r);
+    if (last_skin->drawrect) last_skin->drawrect(last_skin->userdata, skin, NULL, r);
 }
 
 void gui_label_id(int id, const char *text, vec4 rect) {
@@ -11310,7 +11319,7 @@ atlas_slice_frame_t *skinned_getsliceframe(atlas_t *a, const char *name) {
     for (int i = 0; i < array_count(a->slices); i++) 
         if (!strcmp(quark_string(&a->db, a->slices[i].name), name))
             return &a->slice_frames[a->slices[i].frames[0]];
-    PRINTF("slice name: '%s' is missing in atlas!\n", name);
+    // PRINTF("slice name: '%s' is missing in atlas!\n", name);
     return NULL;
 }
 
@@ -11321,9 +11330,10 @@ void skinned_draw_missing_rect(vec4 r) {
 }
 
 static
-bool skinned_ismouseinrect(void *userdata, const char *skin, vec4 r) {
+bool skinned_ismouseinrect(void *userdata, const char *skin, const char *fallback, vec4 r) {
     skinned_t *a = C_CAST(skinned_t*, userdata);
     atlas_slice_frame_t *f = skinned_getsliceframe(&a->atlas, skin);
+    if (!f && fallback) f = skinned_getsliceframe(&a->atlas, fallback);
     if (!f) return false;
 
     vec4 outer = f->bounds;
@@ -11406,18 +11416,20 @@ void skinned_draw_sprite(float scale, atlas_t *a, atlas_slice_frame_t *f, vec4 r
 }
 
 static
-void skinned_draw_rect(void* userdata, const char *skin, vec4 r) {
+void skinned_draw_rect(void* userdata, const char *skin, const char *fallback, vec4 r) {
     skinned_t *a = C_CAST(skinned_t*, userdata);
 
     atlas_slice_frame_t *f = skinned_getsliceframe(&a->atlas, skin);
+    if (!f && fallback) f = skinned_getsliceframe(&a->atlas, fallback);
     if (!f) skinned_draw_missing_rect(r);
     else skinned_draw_sprite(a->scale, &a->atlas, f, r);
 }
 
-void skinned_getskinsize(void *userdata, const char *skin, vec2 *size) {
+void skinned_getskinsize(void *userdata, const char *skin, const char *fallback, vec2 *size) {
     skinned_t *a = C_CAST(skinned_t*, userdata);
 
     atlas_slice_frame_t *f = skinned_getsliceframe(&a->atlas, skin);
+    if (!f && fallback) f = skinned_getsliceframe(&a->atlas, fallback);
     if (f) {
         size->x = (f->bounds.z-f->bounds.x)*a->scale;
         size->y = (f->bounds.w-f->bounds.y)*a->scale;
@@ -11425,9 +11437,10 @@ void skinned_getskinsize(void *userdata, const char *skin, vec2 *size) {
 }
 
 static
-void skinned_getscissorrect(void* userdata, const char *skin, vec4 rect, vec4 *dims) {
+void skinned_getscissorrect(void* userdata, const char *skin, const char *fallback, vec4 rect, vec4 *dims) {
     skinned_t *a = C_CAST(skinned_t*, userdata);
     atlas_slice_frame_t *f = skinned_getsliceframe(&a->atlas, skin);
+    if (!f && fallback) f = skinned_getsliceframe(&a->atlas, fallback);
     if (!f) return;
 
     *dims = rect;
diff --git a/engine/v4k.h b/engine/v4k.h
index a4e6d53..f5253d9 100644
--- a/engine/v4k.h
+++ b/engine/v4k.h
@@ -3638,6 +3638,13 @@ typedef struct anims_t {
 
 API anims_t animations(const char *pathfile, int flags);
 
+// -----------------------------------------------------------------------------
+// lightmapping utils
+
+typedef struct lightmap_t {
+    struct lm_context *ctx; // private
+} lightmap_t;
+
 // -----------------------------------------------------------------------------
 // skyboxes
 
@@ -4184,18 +4191,19 @@ API void     sprite_setanim(sprite_t *s, unsigned name);
 // game ui
 
 typedef struct guiskin_t {
-    void (*drawrect)(void* userdata, const char *skin, vec4 rect);
-    void (*getskinsize)(void* userdata, const char *skin, vec2 *size);
-    void (*getscissorrect)(void* userdata, const char *skin, vec4 rect, vec4 *dims);
-    bool (*ismouseinrect)(void* userdata, const char *skin, vec4 rect);
+    void (*drawrect)(void* userdata, const char *skin, const char *fallback, vec4 rect);
+    void (*getskinsize)(void* userdata, const char *skin, const char *fallback, vec2 *size);
+    void (*getscissorrect)(void* userdata, const char *skin, const char *fallback, vec4 rect, vec4 *dims);
+    bool (*ismouseinrect)(void* userdata, const char *skin, const char *fallback, vec4 rect);
     void (*free)(void* userdata);
     void *userdata;
 } guiskin_t;
 
 API void    gui_pushskin(guiskin_t skin);
 API void*       gui_userdata();
-API vec2        gui_getskinsize(const char *skin);
-API bool        gui_ismouseinrect(const char *skin, vec4 rect);
+API vec2        gui_getskinsize(const char *skin, const char *fallback);
+API bool        gui_ismouseinrect(const char *skin, const char *fallback, vec4 rect);
+API vec4        gui_getscissorrect(const char *skin, const char *fallback, vec4 rect);
 // --
 API void        gui_panel_id(int id, vec4 rect, const char *skin);
 API void            gui_rect_id(int id, vec4 rect, const char *skin);
diff --git a/tools/linswap.c b/tools/linswap.c
new file mode 100644
index 0000000..9f81cb6
--- /dev/null
+++ b/tools/linswap.c
@@ -0,0 +1,34 @@
+#include "v4k.c"
+
+int main(int argc, char **argv) {
+	if (argc != 2) {
+		fprintf(stderr, "usage: %s [file]\n", argv[0]);
+		fprintf(stderr, "file: %s\n", "source file to process");
+		return 1;
+	}
+
+	char *buf = file_read(argv[1]);
+	if (!buf) {
+		fprintf(stderr, "error: %s\n", "file does not exist!");
+		return 2;
+	}
+	file_delete(argv[1]);
+
+	// determine swap mode
+	char mode = strstri(buf, "//@#line 1 \"") == NULL;
+	char **lines = strsplit(buf, "\n");
+
+	for (int i = 0; i < array_count(lines); i++) {
+		char *line = STRDUP(lines[i]);
+		if (!mode) {
+			strrepl(&line, "//@#line 1 \"", "#line 1 \"");
+		} else {
+			strrepl(&line, "#line 1 \"", "//@#line 1 \"");
+		}
+
+		// printf("%s\n", line);
+		file_append(argv[1], line, strlen(line));
+	}
+
+	return 0;
+}
diff --git a/tools/linswap.exe b/tools/linswap.exe
new file mode 100644
index 0000000..3f2e5ab
Binary files /dev/null and b/tools/linswap.exe differ
diff --git a/v4k.sublime-project b/v4k.sublime-project
index ced448a..f697819 100644
--- a/v4k.sublime-project
+++ b/v4k.sublime-project
@@ -3,8 +3,8 @@
 	[
 		{
 			"path": ".",
-			"file_exclude_patterns": ["*.exe.manifest", "*.zip", "*.ilk", "*.exp", "_mirror/", "engine/v4k", "engine/joint/", "engine/split/"],
-			"index_exclude_patterns": ["engine/joint/v4k.h", "engine/split/**", "_mirror/**"]
+			"file_exclude_patterns": ["*.exe.manifest", "*.zip", "*.ilk", "*.exp", "_mirror/", "engine/v4k", "engine/joint/", "engine/split/v4k_*"],
+			"index_exclude_patterns": ["engine/joint/v4k.h", "engine/split/v4k_**", "_mirror/**"]
 		}
 	],
 	"settings": {