// gdraw_shared.inl - author: Sean Barrett - copyright 2010 RAD Game Tools
//
// This file implements some common code that can be shared across
// all the sample implementations of GDraw.

#ifdef IGGY_DISABLE_GDRAW_ASSERT
#define assert(x)
#else
#include <assert.h>
#endif

#ifndef GDRAW_MAYBE_UNUSED
#define GDRAW_MAYBE_UNUSED
#endif

///////////////////////////////////////////////////////////////
//
// GDrawHandleCache manages resource "handles" used by Iggy
// (i.e. these handles wrap the platform resource handles,
// and this file provides those wrappers and facilities for
// LRU tracking them). Moreover, for console platforms, we
// actually implement our own managed resource pools.
//
// This is the main state machine when GDRAW_MANAGE_MEM is defined:
// (which covers all console platforms)
//
//       +------+         +--------+             |
//       | Live |<------->| Locked |             |
//       +------+         +--------+             |
//       /     \                  ^              |
//      /       \                  \             |
//     v         v                  \            |
// +------+    +------+    +------+  |           |
// | Dead |--->| Free |<---| User |  |           |
// +------+    +------+    +------+  |           |
//     ^         ^    ^       ^      |           |
//      \       /       \     |      |           |
//       \     /          v   |      |           |
//     +--------+         +-------+ /            |
//     | Pinned |<--------| Alloc |/             |
//     +--------+         +-------+              |
//
// "Free" handles are not in use and available for allocation.
// "Alloc" handles have been assigned by GDraw, but do not yet
//   have a system resource backing them. Resources stay in
//   this state until we know that for sure that we're going
//   to be able to successfully complete creation, at which
//   point the resource transitions to one of the regular states.
// "Live" handles correspond to resources that may be used
//   for rendering. They are kept in LRU order. Old resources
//   may be evicted to make space.
// "Locked" handles cover resources that are going to be used
//   in the next draw command. Once a resource is marked locked,
//   it may not be evicted until it's back to "Live".
// "Dead" handles describe resources that have been freed on the
//   CPU side, but are still in use by the GPU. Their memory may
//   only be reclaimed once the GPU is done with them, at which
//   point they are moved to the "Free" list. Items on the "Dead"
//   list appear ordered by the last time they were used by the
//   GPU - "most stale" first.
// "Pinned" resources can be used in any draw call without getting
//   locked first. They can never be LRU-freed, but their memory
//   is still managed by GDraw. Currently this is only used for
//   the Iggy font cache.
// "User" (user-owned) resources are exactly that. They act much like
//   pinned resources, but their memory isn't managed by GDraw.
//   When a user-owned resource is freed, we really need to free
//   it immediately (instead of marking it as "dead"), which might
//   necessitate stalling the CPU until the GPU is finished using
//   that resource. Since we don't own the memory, delayed frees
//   are not an option.
//
// Without GDRAW_MANAGE_MEM, there's no "Dead" resources, and all
// frees are performed immediately.

typedef struct GDrawHandleCache GDrawHandleCache;
typedef struct GDrawHandle GDrawHandle;

typedef struct
{
   U64 value;
} GDrawFence;

typedef enum
{
   GDRAW_HANDLE_STATE_free = 0,
   GDRAW_HANDLE_STATE_live,
   GDRAW_HANDLE_STATE_locked,
   GDRAW_HANDLE_STATE_dead,
   GDRAW_HANDLE_STATE_pinned,
   GDRAW_HANDLE_STATE_user_owned,
   GDRAW_HANDLE_STATE_alloc,
   GDRAW_HANDLE_STATE__count,

   // not an actual state!
   GDRAW_HANDLE_STATE_sentinel = GDRAW_HANDLE_STATE__count,
} GDrawHandleState;

struct GDrawHandle
{
   GDrawNativeHandle   handle;  // platform handle to a resource (variable size)
   void              * owner;   // 4/8 // opaque handle used to allow freeing resources without calling back to owner

   GDrawHandleCache  * cache;   // 4/8 // which cache this handle came from

   GDrawHandle       * next,*prev;  // 8/16 // doubly-linked list

   #ifdef GDRAW_MANAGE_MEM
   void              * raw_ptr; // 4/8 // pointer to allocation - when you're managing memory manually
   #ifdef GDRAW_CORRUPTION_CHECK
   U32                 cached_raw_value[4];
   rrbool              has_check_value;
   #endif
   #endif

   GDrawFence          fence;   // 8 // (optional) platform fence for resource
   // 4
   U32                 bytes:28;       // estimated storage cost to allow setting a loose limit
   U32                 state:4;        // state the handle is in
};

// validate alignment to make sure structure will pack correctly
#ifdef __RAD64__
RR_COMPILER_ASSERT((sizeof(GDrawHandle) & 7) == 0);
#else
RR_COMPILER_ASSERT((sizeof(GDrawHandle) & 3) == 0);
#endif

struct GDrawHandleCache
{
   S32          bytes_free;
   S32          total_bytes;
   S32          max_handles;
   U32          is_vertex : 1;      // vertex buffers have different warning codes and generate discard callbacks
   U32          is_thrashing : 1;
   U32          did_defragment : 1;
   // 30 unused bits
   GDrawHandle  state[GDRAW_HANDLE_STATE__count];  // sentinel nodes for all of the state lists
   #ifdef GDRAW_MANAGE_MEM
   struct gfx_allocator *alloc;
   #endif
   #ifdef GDRAW_MANAGE_MEM_TWOPOOL
   struct gfx_allocator *alloc_other;
   #endif
   GDrawFence   prev_frame_start, prev_frame_end; // fence value at start/end of previous frame, for thrashing detection
   GDrawHandle  handle[1]; // the rest of the handles must be stored right after this in the containing structure
};

#ifdef GDRAW_CORRUPTION_CHECK
// values for corruption checking
#define GDRAW_CORRUPTIONCHECK_renderbegin         0x10
#define GDRAW_CORRUPTIONCHECK_renderend           0x20
#define GDRAW_CORRUPTIONCHECK_nomoregdraw         0x30
#define GDRAW_CORRUPTIONCHECK_maketexbegin        0x40
#define GDRAW_CORRUPTIONCHECK_maketexend          0x50

#define GDRAW_CORRUPTIONCHECK_wrappedcreateend       0x60
#define GDRAW_CORRUPTIONCHECK_wrappedcreatebegin     0x61
#define GDRAW_CORRUPTIONCHECK_wrappeddestroyend      0x70
#define GDRAW_CORRUPTIONCHECK_wrappeddestroybegin    0x71

#define GDRAW_CORRUPTIONCHECK_allochandle             0x80
#define GDRAW_CORRUPTIONCHECK_allochandle_begin       0x81
#define GDRAW_CORRUPTIONCHECK_allochandle_postreap    0x82
#define GDRAW_CORRUPTIONCHECK_allochandle_postfree1   0x83
#define GDRAW_CORRUPTIONCHECK_allochandle_postfree2   0x84
#define GDRAW_CORRUPTIONCHECK_allochandle_postfree3   0x85
#define GDRAW_CORRUPTIONCHECK_allochandle_postalloc1  0x86
#define GDRAW_CORRUPTIONCHECK_allochandle_postalloc2  0x87
#define GDRAW_CORRUPTIONCHECK_allochandle_postalloc3  0x88
#define GDRAW_CORRUPTIONCHECK_allochandle_defrag      0x89

#define GDRAW_CORRUPTIONCHECK_freetex           0x90

static U32 *debug_raw_address(GDrawHandle *t, int choice)
{
   static int offset_table[4] = { 0x555555, 0xaaaaaa, 0x333333, 0x6e6e6e };
   U8 *base = (U8 *) t->raw_ptr;
   int offset = offset_table[choice] & (t->bytes-1) & ~3;
   return (U32 *) (base + offset);
}

static void debug_check_overlap_one(GDrawHandle *t, U8 *ptr, S32 len)
{
   assert(len >= 0);
   if (t->raw_ptr && t->raw_ptr != ptr) {
      assert(t->raw_ptr < ptr || t->raw_ptr >= ptr+len);  
   }
}

static void debug_check_overlap(GDrawHandleCache *c, U8 *ptr, S32 len)
{
   GDrawHandle *t = c->head;
   while (t) {
      debug_check_overlap_one(t, ptr, len);
      t = t->next;
   }
   t = c->active;
   while (t) {
      debug_check_overlap_one(t, ptr, len);
      t = t->next;
   }
}

static void debug_check_raw_values(GDrawHandleCache *c)
{
   GDrawHandle *t = c->head;
   while (t) {
      if (t->raw_ptr && t->has_check_value) {
         int i;
         for (i=0; i < 4; ++i) {
            if (*debug_raw_address(t, i) != t->cached_raw_value[i]) {
               //zlog("!Iggy texture corruption found\n");
               //zlog("t=%p, t->raw_ptr=%p\n", t, t->raw_ptr);
               //zlog("Cached values: %08x %08x %08x %08x\n", t->cached_raw_value[0], t->cached_raw_value[1], t->cached_raw_value[2], t->cached_raw_value[3]);
               //zlog("Current values: %08x %08x %08x %08x\n", *debug_raw_address(t,0), *debug_raw_address(t,1), *debug_raw_address(t,2), *debug_raw_address(t,3));
               assert(0);
            }
         }
         #if 0
         GDrawHandle *s;
         check_block_alloc(c->alloc, t->raw_ptr, 1);
         s = c->head;
         while (s != t) {
            assert(s->raw_ptr != t->raw_ptr);
            s = s->next;
         }
         s = c->active;
         while (s != NULL) {
            assert(s->raw_ptr != t->raw_ptr);
            s = s->next;
         }
         #endif
      }
      t = t->next;
   }
   t = c->active;
   while (t) {
      if (t->raw_ptr && t->has_check_value) {
         int i;
         for (i=0; i < 4; ++i) {
            if (*debug_raw_address(t, i) != t->cached_raw_value[i]) {
               //zlog("!Iggy texture corruption found\n");
               //zlog("t=%p, t->raw_ptr=%p\n", t, t->raw_ptr);
               //zlog("Cached values: %08x %08x %08x %08x\n", t->cached_raw_value[0], t->cached_raw_value[1], t->cached_raw_value[2], t->cached_raw_value[3]);
               //zlog("Current values: %08x %08x %08x %08x\n", *debug_raw_address(t,0), *debug_raw_address(t,1), *debug_raw_address(t,2), *debug_raw_address(t,3));
               assert(0);
            }
         }
         #if 0
         GDrawHandle *s;
         check_block_alloc(c->alloc, t->raw_ptr, 1);
         s = c->active;
         while (s != t) {
            assert(s->raw_ptr != t->raw_ptr);
            s = s->next;
         }
         #endif
      }
      t = t->next;
   }
}

#ifndef GDRAW_CORRUPTION_MASK
#define GDRAW_CORRUPTION_MASK 0
#endif
#define debug_check_raw_values_if(c,v) \
   if ((GDRAW_CORRUPTION_CHECK & ~GDRAW_CORRUPTION_MASK) == ((v) & ~GDRAW_CORRUPTION_MASK)) \
      debug_check_raw_values(c);                                                            \
   else

static void debug_set_raw_value(GDrawHandle *t)
{
   if (t->raw_ptr) {
      int i;
      for (i=0; i < 4; ++i)
         t->cached_raw_value[i] = *debug_raw_address(t, i);
      t->has_check_value = true;
   }
}

static void debug_unset_raw_value(GDrawHandle *t)
{
   t->has_check_value = false;
}

static void debug_check_value_is_unreferenced(GDrawHandleCache *c, void *ptr)
{
   GDrawHandle *t = c->head;
   while (t) {
      assert(t->raw_ptr != ptr);
      t = t->next;
   }
   t = c->active;
   while (t) {
      assert(t->raw_ptr != ptr);
      t = t->next;
   }
}

#else

#define debug_check_overlap(c,p,len)
#define debug_set_raw_value(t)
#define debug_check_value_is_unreferenced(c,p)
#define debug_unset_raw_value(t)
#define debug_check_raw_values(c)
#define debug_check_raw_values_if(c,v)
#endif

#ifdef SUPERDEBUG
static void check_lists(GDrawHandleCache *c)
{
   GDrawHandle *sentinel, *t;
   U32 state;

   // for all lists, verify that they are consistent and
   // properly linked
   for (state = 0; state < GDRAW_HANDLE_STATE__count; state++) {
      S32 count = 0;
      sentinel = &c->state[state];

      assert(!sentinel->cache);
      assert(sentinel->state == GDRAW_HANDLE_STATE_sentinel);
      for (t = sentinel->next; t != sentinel; t = t->next) {
         count++;
         assert(t->cache == c);
         assert(t->state == state);
         assert(t->prev->next == t);
         assert(t->next->prev == t);
         assert(count < 50000);
      }
   }

   // for dead list, additionally verify that it's in the right
   // order (namely, sorted by ascending fence index)
   sentinel = &c->state[GDRAW_HANDLE_STATE_dead];
   for (t = sentinel->next; t != sentinel; t = t->next) {
      assert(t->prev == sentinel || t->fence.value >= t->prev->fence.value);
   }
}

#include <stdio.h>

static const char *gdraw_StateName(U32 state)
{
   switch (state) {
   case GDRAW_HANDLE_STATE_free:       return "free";
   case GDRAW_HANDLE_STATE_live:       return "live";
   case GDRAW_HANDLE_STATE_locked:     return "locked";
   case GDRAW_HANDLE_STATE_dead:       return "dead";
   case GDRAW_HANDLE_STATE_pinned:     return "pinned";
   case GDRAW_HANDLE_STATE_user_owned: return "user-owned";
   case GDRAW_HANDLE_STATE_alloc:      return "alloc";
   case GDRAW_HANDLE_STATE_sentinel:   return "<sentinel>";
   default:                            return "???";
   }
}

#else
static RADINLINE void check_lists(GDrawHandleCache *c)
{
   RR_UNUSED_VARIABLE(c);
}
#endif

static void gdraw_HandleTransitionInsertBefore(GDrawHandle *t, GDrawHandleState new_state, GDrawHandle *succ)
{
   check_lists(t->cache);
   assert(t->state != GDRAW_HANDLE_STATE_sentinel); // sentinels should never get here!
   assert(t->state != (U32) new_state); // code should never call "transition" if it's not transitioning!
   // unlink from prev state
   t->prev->next = t->next;
   t->next->prev = t->prev;
   // add to list for new state
   t->next = succ;
   t->prev = succ->prev;
   t->prev->next = t;
   t->next->prev = t;
#ifdef SUPERDEBUG
   printf("GD %chandle %p %s->%s\n", t->cache->is_vertex ? 'v' : 't', t, gdraw_StateName(t->state), gdraw_StateName(new_state));
#endif
   t->state = new_state;
   check_lists(t->cache);
}

static RADINLINE void gdraw_HandleTransitionTo(GDrawHandle *t, GDrawHandleState new_state)
{
   gdraw_HandleTransitionInsertBefore(t, new_state, &t->cache->state[new_state]);
}

#ifdef GDRAW_MANAGE_MEM_TWOPOOL
static rrbool gdraw_MigrateResource(GDrawHandle *t, GDrawStats *stats);
static void gdraw_res_free(GDrawHandle *t, GDrawStats *stats);
#endif

static rrbool gdraw_HandleCacheLockStats(GDrawHandle *t, void *owner, GDrawStats *stats)
{
   RR_UNUSED_VARIABLE(stats);

   // if the GPU memory is owned by the user, then we never spontaneously
   // free it, and we can always report true. moreover, Iggy doesn't bother
   // keeping 'owner' consistent in this case, so we must check this before
   // verifying t->owner.
   if (t->state == GDRAW_HANDLE_STATE_user_owned)
      return true;

   // if t->owner has changed, then Iggy is trying to lock an old version
   // of this handle from before (the handle has already been recycled to
   // point to a new resource)
   if (t->owner != owner)
      return false;

   // otherwise, it's a valid resource and we should lock it until the next
   // unlock call
   assert(t->state == GDRAW_HANDLE_STATE_live || t->state == GDRAW_HANDLE_STATE_locked || t->state == GDRAW_HANDLE_STATE_pinned);
   if (t->state == GDRAW_HANDLE_STATE_live) {
#ifdef GDRAW_MANAGE_MEM_TWOPOOL
      // if we defragmented this frame, we can't just make resources live;
      // we need to migrate them to their new location. (which might fail
      // if we don't have enough memory left in the new pool)
      if (t->cache->did_defragment) {
         if (!gdraw_MigrateResource(t, stats)) {
            gdraw_res_free(t, stats);
            return false;
         }
      }
#endif
      gdraw_HandleTransitionTo(t, GDRAW_HANDLE_STATE_locked);
   }
   return true;
}

static rrbool gdraw_HandleCacheLock(GDrawHandle *t, void *owner)
{
   return gdraw_HandleCacheLockStats(t, owner, NULL);
}

static void gdraw_HandleCacheUnlock(GDrawHandle *t)
{
   assert(t->state == GDRAW_HANDLE_STATE_locked || t->state == GDRAW_HANDLE_STATE_pinned || t->state == GDRAW_HANDLE_STATE_user_owned);
   if (t->state == GDRAW_HANDLE_STATE_locked)
      gdraw_HandleTransitionTo(t, GDRAW_HANDLE_STATE_live);
}

static void gdraw_HandleCacheUnlockAll(GDrawHandleCache *c)
{
   GDrawHandle *sentinel = &c->state[GDRAW_HANDLE_STATE_locked];
   while (sentinel->next != sentinel)
      gdraw_HandleTransitionTo(sentinel->next, GDRAW_HANDLE_STATE_live);
}

static void gdraw_HandleCacheInit(GDrawHandleCache *c, S32 num_handles, S32 bytes)
{
   S32 i;
   assert(num_handles > 0);
   c->max_handles = num_handles;
   c->total_bytes = bytes;
   c->bytes_free = c->total_bytes;
   c->is_vertex = false;
   c->is_thrashing = false;
   c->did_defragment = false;
   for (i=0; i < GDRAW_HANDLE_STATE__count; i++) {
      c->state[i].owner = NULL;
      c->state[i].cache = NULL; // should never follow cache link from sentinels!
      c->state[i].next = c->state[i].prev = &c->state[i];
#ifdef GDRAW_MANAGE_MEM
      c->state[i].raw_ptr = NULL;
#endif
      c->state[i].fence.value = 0;
      c->state[i].bytes = 0;
      c->state[i].state = GDRAW_HANDLE_STATE_sentinel;
   }
   for (i=0; i < num_handles; ++i) {
      c->handle[i].cache  = c;
      c->handle[i].prev = (i == 0) ? &c->state[GDRAW_HANDLE_STATE_free] : &c->handle[i-1];
      c->handle[i].next = (i == num_handles - 1) ? &c->state[GDRAW_HANDLE_STATE_free] : &c->handle[i+1];
      c->handle[i].bytes = 0;
      c->handle[i].state = GDRAW_HANDLE_STATE_free;
#ifdef GDRAW_MANAGE_MEM
      c->handle[i].raw_ptr = NULL;
#endif
   }
   c->state[GDRAW_HANDLE_STATE_free].next = &c->handle[0];
   c->state[GDRAW_HANDLE_STATE_free].prev = &c->handle[num_handles - 1];
   c->prev_frame_start.value = 0;
   c->prev_frame_end.value = 0;
#ifdef GDRAW_MANAGE_MEM
   c->alloc = NULL;
#endif
#ifdef GDRAW_MANAGE_MEM_TWOPOOL
   c->alloc_other = NULL;
#endif
   check_lists(c);
}

static GDrawHandle *gdraw_HandleCacheAllocateBegin(GDrawHandleCache *c)
{
   GDrawHandle *free_list = &c->state[GDRAW_HANDLE_STATE_free];
   GDrawHandle *t = NULL;
   if (free_list->next != free_list) {
      t = free_list->next;
      gdraw_HandleTransitionTo(t, GDRAW_HANDLE_STATE_alloc);
      t->bytes = 0;
      t->owner = 0;
#ifdef GDRAW_MANAGE_MEM
      t->raw_ptr = NULL;
#endif
#ifdef GDRAW_CORRUPTION_CHECK
      t->has_check_value = false;
#endif
   }
   return t;
}

static void gdraw_HandleCacheAllocateEnd(GDrawHandle *t, S32 bytes, void *owner, GDrawHandleState new_state)
{
   assert(t->cache);
   assert(t->bytes == 0);
   assert(t->owner == 0);
   assert(t->state == GDRAW_HANDLE_STATE_alloc);
   // 4J Stu - Need to keep the braces here because of our version of assert
   if (bytes == 0)
   {  
      assert(new_state == GDRAW_HANDLE_STATE_user_owned);
   }
   else
   {
      assert(new_state == GDRAW_HANDLE_STATE_locked || new_state == GDRAW_HANDLE_STATE_pinned);
   }
   t->bytes = bytes;
   t->owner = owner;
   t->cache->bytes_free -= bytes;

   gdraw_HandleTransitionTo(t, new_state);
}

static void gdraw_HandleCacheFree(GDrawHandle *t)
{
   GDrawHandleCache *c = t->cache;
   assert(t->state != GDRAW_HANDLE_STATE_alloc && t->state != GDRAW_HANDLE_STATE_sentinel);
   c->bytes_free += t->bytes;
   t->bytes = 0;
   t->owner = 0;
#ifdef GDRAW_MANAGE_MEM
   t->raw_ptr = 0;
#endif
#ifdef GDRAW_CORRUPTION_CHECK
   t->has_check_value = false;
#endif
   gdraw_HandleTransitionTo(t, GDRAW_HANDLE_STATE_free);
}

static void gdraw_HandleCacheAllocateFail(GDrawHandle *t)
{
   assert(t->state == GDRAW_HANDLE_STATE_alloc);
   gdraw_HandleTransitionTo(t, GDRAW_HANDLE_STATE_free);
}

static GDrawHandle *gdraw_HandleCacheGetLRU(GDrawHandleCache *c)
{
   // TransitionTo always inserts at the end, which means that the resources
   // at the front of the LRU list are the oldest ones, since in-use resources
   // will get appended on every transition from "locked" to "live".
   GDrawHandle *sentinel = &c->state[GDRAW_HANDLE_STATE_live];
   return (sentinel->next != sentinel) ? sentinel->next : NULL;
}

static void gdraw_HandleCacheTick(GDrawHandleCache *c, GDrawFence now)
{
   c->prev_frame_start = c->prev_frame_end;
   c->prev_frame_end = now;

   // reset these flags every frame
   c->is_thrashing = false;
   c->did_defragment = false;
}

#ifdef GDRAW_MANAGE_MEM

static void gdraw_HandleCacheInsertDead(GDrawHandle *t)
{
   GDrawHandle *s, *sentinel;

   assert(t->state == GDRAW_HANDLE_STATE_live || t->state == GDRAW_HANDLE_STATE_locked || t->state == GDRAW_HANDLE_STATE_pinned);

   // figure out where t belongs in the dead list in "chronological order"
   // do this by finding its (chronological) successor s
   sentinel = &t->cache->state[GDRAW_HANDLE_STATE_dead];
   s = sentinel->next;
   while (s != sentinel && s->fence.value <= t->fence.value)
      s = s->next;

   // and then insert it there
   gdraw_HandleTransitionInsertBefore(t, GDRAW_HANDLE_STATE_dead, s);
}

#endif

////////////////////////////////////////////////////////////////////////
//
//   Set transformation matrices
//

// Our vertex shaders use this convention:
//   world: our world matrices always look like this
//       m00  m01  0  t0
//       m10  m11  0  t1
//        0    0   0   d
//        0    0   0   1
//
//      we just store the first two rows and insert d
//      in the first row, third column. our input position vectors are
//      always (x,y,0,1) or (x,y,0,0), so we can still just use dp4 to
//      compute final x/y. after that it's a single move to set the
//      correct depth value.
//
//   viewproj: our view-projection matrix is always just a 2D scale+translate,
//     i.e. the matrix looks like this:
//
//       p[0]  0    0   p[2]
//         0  p[1]  0   p[3]
//         0   0    1    0
//         0   0    0    1
//
//     just store (p[0],p[1],p[2],p[3]) in a 4-component vector and the projection
//     transform is a single multiply-add.
//
// The output is volatile since it's often in Write-Combined memory where we
// really don't want compiler reordering.

static RADINLINE void gdraw_PixelSpace(volatile F32 * RADRESTRICT vvec)
{
   // 1:1 pixel mapping - just identity since our "view space" is pixels
   vvec[0] = 1.0f;   vvec[1] = 0.0f;   vvec[2] = 0.0f;   vvec[3] = 0.0f;
   vvec[4] = 0.0f;   vvec[5] = 1.0f;   vvec[6] = 0.0f;   vvec[7] = 0.0f;
}

static RADINLINE void gdraw_WorldSpace(volatile F32 * RADRESTRICT vvec, F32 * RADRESTRICT world_to_pixel, F32 depth, F32 misc)
{
   // World->pixel space transform is just a scale
   vvec[0] = world_to_pixel[0];  vvec[1] = 0.0f;               vvec[2] = depth;  vvec[3] = 0.0f;
   vvec[4] = 0.0f;               vvec[5] = world_to_pixel[1];  vvec[6] = misc;   vvec[7] = 0.0f;
}

static RADINLINE void gdraw_ObjectSpace(volatile F32 * RADRESTRICT vvec, gswf_matrix * RADRESTRICT xform, F32 depth, F32 misc)
{
   // Object->pixel transform is a 2D homogeneous matrix transform
   F32 m00 = xform->m00;
   F32 m01 = xform->m01;
   F32 m10 = xform->m10;
   F32 m11 = xform->m11;
   F32 trans0 = xform->trans[0];
   F32 trans1 = xform->trans[1];

   vvec[0] = m00; vvec[1] = m01; vvec[2] = depth;  vvec[3] = trans0;
   vvec[4] = m10; vvec[5] = m11; vvec[6] = misc;   vvec[7] = trans1;
}

static void gdraw_GetObjectSpaceMatrix(F32 * RADRESTRICT mat, gswf_matrix * RADRESTRICT xform, F32 * RADRESTRICT proj, F32 depth, int out_col_major)
{
   int row = out_col_major ? 1 : 4;
   int col = out_col_major ? 4 : 1;

   F32 xs = proj[0];
   F32 ys = proj[1];

   mat[0*row+0*col] = xform->m00      * xs;
   mat[0*row+1*col] = xform->m01      * xs;
   mat[0*row+2*col] = 0.0f;
   mat[0*row+3*col] = xform->trans[0] * xs + proj[2];

   mat[1*row+0*col] = xform->m10      * ys;
   mat[1*row+1*col] = xform->m11      * ys;
   mat[1*row+2*col] = 0.0f;
   mat[1*row+3*col] = xform->trans[1] * ys + proj[3];

   mat[2*row+0*col] = 0.0f;
   mat[2*row+1*col] = 0.0f;
   mat[2*row+2*col] = 0.0f;
   mat[2*row+3*col] = depth;

   mat[3*row+0*col] = 0.0f;
   mat[3*row+1*col] = 0.0f;
   mat[3*row+2*col] = 0.0f;
   mat[3*row+3*col] = 1.0f;
}


////////////////////////////////////////////////////////////////////////
//
//   Blurs
//
// symmetrically expand a rectangle by ex/ey pixels on both sides, then clamp to tile bounds
static void gdraw_ExpandRect(gswf_recti *out, gswf_recti const *in, S32 ex, S32 ey, S32 w, S32 h)
{
   out->x0 = RR_MAX(in->x0 - ex, 0);
   out->y0 = RR_MAX(in->y0 - ey, 0);
   out->x1 = RR_MIN(in->x1 + ex, w);
   out->y1 = RR_MIN(in->y1 + ey, h);
}

static void gdraw_ShiftRect(gswf_recti *out, gswf_recti const *in, S32 dx, S32 dy)
{
   out->x0 = in->x0 + dx;
   out->y0 = in->y0 + dy;
   out->x1 = in->x1 + dx;
   out->y1 = in->y1 + dy;
}

#define MAX_TAPS  9  // max # of bilinear samples in one 'convolution' step

enum
{
   // basic shader family
   VAR_tex0 = 0,
   VAR_tex1,
   VAR_cmul,
   VAR_cadd,
   VAR_focal,

   // filter family
   VAR_filter_tex0  = 0,
   VAR_filter_tex1,
   VAR_filter_color,
   VAR_filter_tc_off,
   VAR_filter_tex2,
   VAR_filter_clamp0,
   VAR_filter_clamp1,
   VAR_filter_color2,
   MAX_VARS,

   // blur family
   VAR_blur_tex0 = 0,
   VAR_blur_tap,
   VAR_blur_clampv,

   // color matrix family
   VAR_colormatrix_tex0 = 0,
   VAR_colormatrix_data,

   // ihud family
   VAR_ihudv_worldview = 0,
   VAR_ihudv_material,
   VAR_ihudv_textmode,
};

typedef struct
{
   S32 w,h, frametex_width, frametex_height;
   void (*BlurPass)(GDrawRenderState *r, int taps,  float *data, gswf_recti *s, float *tc, float height_max, float *clampv, GDrawStats *gstats);
} GDrawBlurInfo;

static GDrawTexture *gdraw_BlurPass(GDrawFunctions *g, GDrawBlurInfo *c, GDrawRenderState *r, int taps, float *data, gswf_recti *draw_bounds, gswf_recti *sample_bounds, GDrawStats *gstats)
{
   F32 tc[4];
   F32 clamp[4];
   F32 t=0;
   F32 texel_scale_s = 1.0f / c->frametex_width;
   F32 texel_scale_t = 1.0f / c->frametex_height;
   S32 i;
   for (i=0; i < taps; ++i)
      t += data[4*i+2];
   assert(t >= 0.99f && t <= 1.01f);

   tc[0] = texel_scale_s * draw_bounds->x0;
   tc[1] = texel_scale_t * draw_bounds->y0;
   tc[2] = texel_scale_s * draw_bounds->x1;
   tc[3] = texel_scale_t * draw_bounds->y1;

   // sample_bounds is (x0,y0) inclusive, (x1,y1) exclusive
   // texel centers are offset by 0.5 from integer coordinates and we don't want to sample outside sample_bounds
   clamp[0] = texel_scale_s * (sample_bounds->x0 + 0.5f);
   clamp[1] = texel_scale_t * (sample_bounds->y0 + 0.5f);
   clamp[2] = texel_scale_s * (sample_bounds->x1 - 0.5f);
   clamp[3] = texel_scale_t * (sample_bounds->y1 - 0.5f);

   if (!g->TextureDrawBufferBegin(draw_bounds, GDRAW_TEXTURE_FORMAT_rgba32, GDRAW_TEXTUREDRAWBUFFER_FLAGS_needs_color | GDRAW_TEXTUREDRAWBUFFER_FLAGS_needs_alpha, 0, gstats))
      return r->tex[0];

   c->BlurPass(r, taps, data, draw_bounds, tc, (F32) c->h / c->frametex_height, clamp, gstats);
   return g->TextureDrawBufferEnd(gstats);
}

static GDrawTexture *gdraw_BlurPassDownsample(GDrawFunctions *g, GDrawBlurInfo *c, GDrawRenderState *r, int taps, float *data, gswf_recti *draw_bounds, int axis, int divisor, int tex_w, int tex_h, gswf_recti *sample_bounds, GDrawStats *gstats)
{
   S32 i;
   F32 t=0;
   F32 tc[4];
   F32 clamp[4];
   F32 texel_scale_s = 1.0f / tex_w;
   F32 texel_scale_t = 1.0f / tex_h;
   gswf_recti z;

   for (i=0; i < taps; ++i)
      t += data[4*i+2];
   assert(t >= 0.99f && t <= 1.01f);

   // following must be integer divides!
   if (axis == 0) {
      z.x0 = draw_bounds->x0 / divisor;
      z.x1 = (draw_bounds->x1-1) / divisor + 1;
      z.y0 = draw_bounds->y0;
      z.y1 = draw_bounds->y1;

      tc[0] = ((z.x0 - 0.5f)*divisor+0.5f)*texel_scale_s;
      tc[2] = ((z.x1 - 0.5f)*divisor+0.5f)*texel_scale_s;
      tc[1] = z.y0*texel_scale_t;
      tc[3] = z.y1*texel_scale_t;
   } else {
      z.x0 = draw_bounds->x0;
      z.x1 = draw_bounds->x1;
      z.y0 = draw_bounds->y0 / divisor;
      z.y1 = (draw_bounds->y1-1) / divisor + 1;

      tc[0] = z.x0*texel_scale_s;
      tc[2] = z.x1*texel_scale_s;
      tc[1] = ((z.y0 - 0.5f)*divisor+0.5f)*texel_scale_t;
      tc[3] = ((z.y1 - 0.5f)*divisor+0.5f)*texel_scale_t;
   }

   if (!g->TextureDrawBufferBegin(&z, GDRAW_TEXTURE_FORMAT_rgba32, GDRAW_TEXTUREDRAWBUFFER_FLAGS_needs_color | GDRAW_TEXTUREDRAWBUFFER_FLAGS_needs_alpha, 0, gstats))
      return r->tex[0];

   clamp[0] = texel_scale_s * (sample_bounds->x0 + 0.5f);
   clamp[1] = texel_scale_t * (sample_bounds->y0 + 0.5f);
   clamp[2] = texel_scale_s * (sample_bounds->x1 - 0.5f);
   clamp[3] = texel_scale_t * (sample_bounds->y1 - 0.5f);

   assert(clamp[0] <= clamp[2]);
   assert(clamp[1] <= clamp[3]);

   c->BlurPass(r, taps, data, &z, tc, (F32) c->h / c->frametex_height, clamp, gstats);
   return g->TextureDrawBufferEnd(gstats);
}

#define unmap(t,a,b)             (((t)-(a))/(F32) ((b)-(a)))
#define linear_remap(t,a,b,c,d)  ((c) + unmap(t,a,b)*((d)-(c)))

static void gdraw_BlurAxis(S32 axis, GDrawFunctions *g, GDrawBlurInfo *c, GDrawRenderState *r, F32 blur_width, F32 texel, gswf_recti *draw_bounds, gswf_recti *sample_bounds, GDrawTexture *protect, GDrawStats *gstats)
{
   GDrawTexture *t;
   F32 data[MAX_TAPS][4];
   S32 off_axis = 1-axis;
   S32 w = ((S32) ceil((blur_width-1)/2))*2+1;  // 1.2 => 3, 2.8 => 3, 3.2 => 5
   F32 edge_weight = 1 - (w - blur_width)/2;   // 3 => 0 => 1;   1.2 => 1.8 => 0.9 => 0.1
   F32 inverse_weight = 1.0f / blur_width;

   w = ((w-1) >> 1) + 1; // 3 => 2, 5 => 3, 7 => 4   (number of texture samples)

   if (!r->tex[0])
      return;

   // horizontal filter
   if (w > 1) {
      if (w <= MAX_TAPS) {
         // we have enough taps to just do it
         // use 'w' taps
         S32 i, expand;

         // just go through and place all the taps in the right place

         // if w is 2 (sample from -1,0,1)
         //    0 => -0.5
         //    1 =>  1

         // if w is 3:
         //    0 => -1.5   samples from -2,-1
         //    1 =>  0.5   samples from 0,1
         //    2 =>  2     samples from 2

         // if w is 4:
         //    0 => -2.5   samples from -3,-2
         //    1 => -0.5   samples from -1,0
         //    2 =>  1.5   samples from  1,2
         //    3 =>  3     samples from  3

         for (i=0; i < w; ++i) {
            // first texsample samples from -w+1 and -w+2, e.g. w=2 => -1,0,1
            data[i][axis] = (-w+1.5f + i*2)*texel;
            data[i][off_axis] = 0;
            data[i][2] = 2*inverse_weight; // 2 full-weight samples
            data[i][3] = 0;
         }
         // now reweight the last one
         data[i-1][axis] = (w-1)*texel;
         data[i-1][2] = edge_weight*inverse_weight;
         // now reweight the first one
         //     (ew*0 + 1*1)/(1+ew) = 1/(1+ew)
         data[0][axis] = (-w + 1.0f + 1/(edge_weight+1)) * texel;
         data[0][2] = (edge_weight+1)*inverse_weight;

         expand = w-1;
         gdraw_ExpandRect(draw_bounds, draw_bounds, axis ? 0 : expand, axis ? expand : 0, c->w, c->h);

         t = gdraw_BlurPass(g, c, r, w, data[0], draw_bounds, sample_bounds, gstats);
         if (r->tex[0] != protect && r->tex[0] != t)
            g->FreeTexture(r->tex[0], 0, gstats);
         r->tex[0] = t;
         gdraw_ExpandRect(sample_bounds, draw_bounds, 1, 1, c->w, c->h); // for next pass
      } else {
         // @OPTIMIZE: for symmetrical blurs we can get a 2-wide blur in the *off* axis at the same
         // time we get N-wide in the on axis, which could double our max width
         S32 i, expand;
         // @HACK: this is really a dumb way to do it, i kind of had a brain fart, you could get
         // the exact same result by just doing the downsample the naive way and then the
         // final sample uses texture samples spaced by a texel rather than spaced by two
         // texels -- the current method is just as inefficient, it just puts the inefficiency
         // in the way the downsampled texture is self-overlapping, so the downsampled texture
         // is twice as larger as it should be.

         // we COULD be exact by generating a mipmap, then sampling some number of samples
         // from the mipmap and some from the original, but that would require being polyphase.
         // instead we just are approximate. the mipmap weights the edge pixels by one half
         // and overlaps them by one sample, so then in phase two we sample N slightly-overlapping
         // mipmap samples
         //
         // instead we do the following.
         //    divide the source data up into clusters that are K samples long.
         //            ...K0...   ...K1...  ...K2...   ...K3...
         //
         // Suppose K[i] is the average of all the items in cluster i.
         //
         // We compute a downsampled texture where T[i] = K[i] + K[i+1].
         //
         // Now, we sample N taps from adjacent elements of T, allowing the texture unit
         // to bilerp. Suppose a given sample falls at coordinate i with sub-position p.
         // Then tap #j will compute:
         //      T[i+j]*(1-p) + T[i+j+1]*p
         // But tap #j+1 will compute:
         //      T[i+j+1]*(1-p) + T[i+j+2]*p
         // so we end up computing:
         //      sum(T[i+j]) except for the end samples.
         //
         // So, how do we create these initial clusters? That's easy, we use K taps
         // to sample 2K texels.
         //
         // What value of k do we use? Well, we're constrained to using MAX_TAPS
         // on each pass. So at the high end, we're bounded by:
         //      K = MAX_TAPS
         //      S = MAX_TAPS (S is number of samples in second pass)
         // S addresses S*2-1 texels of T, and each texel adds K more samples,
         // so (ignoring the edges) we basically have w = K*S

         // if w == MAX_TAPS*MAX_TAPS, then k = MAX_TAPS
         // if w == MAX_TAPS+1, then k = 2
         //
         // suppose we have 3 taps, then we can sample 5 samples in one pass, so then our
         // max coverage is 25 samples, or a filter width of 13. with 7 taps, we sample
         // 13 samples in one pass, max coverage is 13*13 samples or (13*13-1)/2 width,
         // which is ((2T-1)*(2T-1)-1)/2 or (4T^2 - 4T + 1 -1)/2 or 2T^2 - 2T or 2T*(T-1)
         S32 w_mip = (S32) ceil(linear_remap(w, MAX_TAPS+1, MAX_TAPS*MAX_TAPS, 2, MAX_TAPS));
         S32 downsample = w_mip;
         F32 sample_spacing = texel;
         if (downsample < 2) downsample = 2;
         if (w_mip > MAX_TAPS) {
            // if w_mip > MAX_TAPS, then we ought to use more than one mipmap pass, but
            // since that's a huge filter ( > 80 pixels) let's just try subsampling and
            // see if it's good enough.
            sample_spacing *= w_mip / MAX_TAPS;
            w_mip = MAX_TAPS;
         } else {
            assert(w / downsample <= MAX_TAPS);
         }
         inverse_weight = 1.0f / (2*w_mip);
         for (i=0; i < w_mip; ++i) {
            data[i][axis] = (-w_mip+1 + i*2+0.5f)*sample_spacing;
            data[i][off_axis] = 0;
            data[i][2] = 2*inverse_weight;
            data[i][3] = 0;
         }
         w = w*2 / w_mip;

         // @TODO: compute the correct bboxes for this size
         // the downsampled texture samples from -w_mip+1 to w_mip
         // the sample from within that samples w spots within that,
         // or w/2 of those, but they're overlapping by 50%.
         // so if a sample is a point i, it samples from the original
         // from -w_mip+1 to w_mip + i*w_mip.
         // So then the minimum is: -w_mip+1 + (w/2)*w_mip, and
         // the maximum is w_mip + (w/2)*w_mip
         expand = (((w+1)>>1)+1)*w_mip+1;
         gdraw_ExpandRect(draw_bounds, draw_bounds, axis ? 0 : expand, axis ? expand : 0, c->w, c->h);
         
         t = gdraw_BlurPassDownsample(g, c, r, w_mip, data[0], draw_bounds, axis, downsample, c->frametex_width, c->frametex_height, sample_bounds, gstats);
         if (r->tex[0] != protect && r->tex[0] != t)
            g->FreeTexture(r->tex[0], 0, gstats);
         r->tex[0] = t;
         gdraw_ExpandRect(sample_bounds, draw_bounds, 1, 1, c->w, c->h);
         if (!r->tex[0])
            return;

         // now do a regular blur pass sampling from that
         // the raw texture now contains 'downsample' samples per texel
         if (w > 2*MAX_TAPS) {
            sample_spacing = texel * (w-1) / (2*MAX_TAPS-1);
            w = 2*MAX_TAPS;
         } else {
            sample_spacing = texel;
         }
         //sample_spacing *= 1.0f/2;
         assert(w >= 2 && w <= 2*MAX_TAPS);

         if (w & 1) {
            // we just want to evenly weight even-spaced samples
            inverse_weight = 1.0f / w;

            // just go through and place all the taps in the right place

            w = (w+1)>>1;
            for (i=0; i < w; ++i) {
               data[i][axis] = (-w+1.0f + 0.5f + i*2)*sample_spacing;
               data[i][off_axis] = 0;
               data[i][2] = 2*inverse_weight; // 2 full-weight samples
               data[i][3] = 0;
            }

            // fix up the last tap

            // the following test is always true, but we're testing it here
            // explicitly so as to make VS2012's static analyzer not complain
            if (i > 0) {
               data[i-1][axis] = (-w+1.0f+(i-1)*2)*sample_spacing;
               data[i-1][2] = inverse_weight;
            }
         } else {
            // we just want to evenly weight even-spaced samples
            inverse_weight = 1.0f / w;

            // just go through and place all the taps in the right place
            w >>= 1;
            for (i=0; i < w; ++i) {
               data[i][axis] = (-w+1.0f + i*2)*sample_spacing;
               data[i][off_axis] = 0;
               data[i][2] = 2*inverse_weight; // 2 full-weight samples
               data[i][3] = 0;
            }
         }

         t = gdraw_BlurPassDownsample(g, c, r, w, data[0], draw_bounds, axis, 1,
                axis==0 ? c->frametex_width*downsample : c->frametex_width,
                axis==1 ? c->frametex_height*downsample : c->frametex_height, sample_bounds, gstats);
         if (r->tex[0] != protect && r->tex[0] != t)
            g->FreeTexture(r->tex[0], 0, gstats);
         r->tex[0] = t;
         gdraw_ExpandRect(sample_bounds, draw_bounds, 1, 1, c->w, c->h);
      }
   }
}

static void gdraw_Blur(GDrawFunctions *g, GDrawBlurInfo *c, GDrawRenderState *r, gswf_recti *draw_bounds, gswf_recti *sample_bounds, GDrawStats *gstats)
{
   S32 p;
   GDrawTexture *protect = r->tex[0];
   gswf_recti sbounds;

   // compute texel offset size
   F32 dx = 1.0f / c->frametex_width;
   F32 dy = 1.0f / c->frametex_height;

   // blur = 1 => 1 tap
   // blur = 1.2 => 3 taps (0.1, 1, 0.1)
   // blur = 2.2 => 3 taps (0.6, 1, 0.6)
   // blur = 2.8 => 3 taps (0.9, 1, 0.9)
   // blur = 3   => 3 taps (1  , 1, 1  )
   // blur = 3.2 => 5 taps (0.1, 1, 1, 1, 0.1)

   //S32 w = ((S32) ceil((r->blur_x-1)/2))*2+1;   // 1.2 => (1.2-1)/2 => 0.1 => 1.0 => 1 => 2 => 3
   //S32 h = ((S32) ceil((r->blur_y-1)/2))*2+1;   // 3   => (3-1)/2 => 1.0 => 1 => 2 => 3

   // gdraw puts 1 border pixel around everything when producing rendertargets and we use this
   // so expand the input sample bounds accordingly
   gdraw_ExpandRect(&sbounds, sample_bounds, 1, 1, c->w, c->h);

   for (p=0; p < r->blur_passes; ++p) {
      #if 0 // @OPTIMIZE do the filter in one pass
      if (w*h <= MAX_TAPS) {
      } else
      #endif
      {
         // do the filter separably
         gdraw_BlurAxis(0,g,c,r,r->blur_x,dx, draw_bounds, &sbounds, protect, gstats);
         gdraw_BlurAxis(1,g,c,r,r->blur_y,dy, draw_bounds, &sbounds, protect, gstats);
      }
   }
}

#ifdef GDRAW_MANAGE_MEM

static void make_pool_aligned(void **start, S32 *num_bytes, U32 alignment)
{
   UINTa addr_orig = (UINTa) *start;
   UINTa addr_aligned = (addr_orig + alignment-1) & ~((UINTa) alignment - 1);

   if (addr_aligned != addr_orig) {
      S32 diff = (S32) (addr_aligned - addr_orig);
      if (*num_bytes < diff) { 
         *start = NULL;
         *num_bytes = 0;
         return;
      } else {
         *start = (void *)addr_aligned;
         *num_bytes -= diff;
      }
   }
}

// Very simple arena allocator
typedef struct
{
   U8 *begin;
   U8 *current;
   U8 *end;
} GDrawArena;

static void gdraw_arena_init(GDrawArena *arena, void *start, U32 size)
{
   arena->begin = (U8 *)start;
   arena->current = (U8 *)start;
   arena->end = (U8 *)start + size;
}

static GDRAW_MAYBE_UNUSED void gdraw_arena_reset(GDrawArena *arena)
{
   arena->current = arena->begin;
}

static void *gdraw_arena_alloc(GDrawArena *arena, U32 size, U32 align)
{
   UINTa start_addr = ((UINTa)arena->current + align-1) & ~((UINTa) align - 1);
   U8 *ptr = (U8 *)start_addr;
   UINTa remaining = arena->end - arena->current;
   UINTa total_size = (ptr - arena->current) + size;
   if (remaining < total_size) // doesn't fit
      return NULL;

   arena->current = ptr + size;
   return ptr;
}

// Allocator for graphics memory.
// Graphics memory is assumed to be write-combined and slow to read for the
// CPU, so we keep all heap management information separately in main memory.
//
// There's a constant management of about 1k (2k for 64bit) to create a heap,
// plus a per-block overhead. The maximum number of blocks the allocator can
// ever use is bounded by 2*max_allocs+1; since GDraw manages a limited
// amount of handles, max_allocs is a known value at heap creation time.
//
// The allocator uses a best-fit heuristic to minimize fragmentation.
// Currently, there are no size classes or other auxiliary data structures to
// speed up this process, since the number of free blocks at any point in time
// is assumed to be fairly low.
//
// The allocator maintains a number of invariants:
// - The free list and physical block list are proper double-linked lists.
//   (i.e. block->next->prev == block->prev->next == block)
// - All allocated blocks are also kept in a hash table, indexed by their
//   pointer (to allow free to locate the corresponding block_info quickly).
//   There's a single-linked, NULL-terminated list of elements in each hash
//   bucket.
// - The physical block list is ordered. It always contains all currently
//   active blocks and spans the whole managed memory range. There are no
//   gaps between blocks, and all blocks have nonzero size.
// - There are no two adjacent free blocks; if two such blocks would be created,
//   they are coalesced immediately.
// - The maximum number of blocks that could ever be necessary is allocated
//   on initialization. All block_infos not currently in use are kept in a
//   single-linked, NULL-terminated list of unused blocks. Every block is either
//   in the physical block list or the unused list, and the total number of
//   blocks is constant.
// These invariants always hold before and after an allocation/free.

#ifndef GFXALLOC_ASSERT
#define GFXALLOC_ASSERT(x)
#endif

typedef struct gfx_block_info
{
   U8 *ptr;
   gfx_block_info *prev, *next; // for free blocks this is the free list, for allocated blocks it's a (single-linked!) list of elements in the corresponding hash bucket
   gfx_block_info *prev_phys, *next_phys;
   U32 is_free : 1;
   U32 is_unused : 1;
   U32 size : 30;
} gfx_block_info;
// 24 bytes/block on 32bit, 48 bytes/block on 64bit.

#define GFXALLOC_HASH_SIZE 256

typedef struct gfx_allocator
{
   U8 *mem_base;
   U8 *mem_end;
   U32 max_allocs;
   U32 block_align;
   U32 block_shift;
   S32 actual_bytes_free;

#ifdef GFXALLOC_CHECK
   int num_blocks;
   int num_unused;
   int num_alloc;
   int num_free;
#endif

   GDrawHandleCache *cache;

   gfx_block_info *unused_list; // next unused block_info (single-linked list)
   gfx_block_info *hash[GFXALLOC_HASH_SIZE]; // allocated blocks
   gfx_block_info blocks[1]; // first block is head of free list AND head of physical block list (sentinel)
} gfx_allocator;
// about 1k (32bit), 2k (64bit) with 256 hash buckets (the default). dominated by hash table.

#ifdef GFXALLOC_CHECK
#define GFXALLOC_IF_CHECK(x)  x
#else
#define GFXALLOC_IF_CHECK(x)
#endif

static U32 gfxalloc_get_hash_code(gfx_allocator *alloc, void *ptr)
{
   U32 a = (U32) (((U8 *) ptr - alloc->mem_base) >> alloc->block_shift);

   // integer hash function by Bob Jenkins (http://burtleburtle.net/bob/hash/integer.html)
   // I use this function because integer mults are slow on PPC and large literal constants
   // take multiple instrs to set up on all RISC CPUs.
   a -= (a<<6);
   a ^= (a>>17);
   a -= (a<<9);
   a ^= (a<<4);
   a -= (a<<3);
   a ^= (a<<10);
   a ^= (a>>15);

   return a & (GFXALLOC_HASH_SIZE - 1);
}

#if defined(SUPERDEBUG) || defined(COMPLETE_DEBUG)
#include <stdlib.h>
#define MAX_REGIONS  8192
typedef struct
{
   U32 begin,end;
} gfx_region;
static gfx_region region[MAX_REGIONS];

static int region_sort(const void *p, const void *q)
{
   U32 a = *(U32*)p;
   U32 b = *(U32*)q;
   if (a < b) return -1;
   if (a > b) return  1;
   return 0;
}

static void gfxalloc_check1(gfx_allocator *alloc)
{
   assert(alloc->max_allocs*2+1 < MAX_REGIONS);
   int i,n=0;
   for (i=0; i < GFXALLOC_HASH_SIZE; ++i) {
      gfx_block_info *b = alloc->hash[i];
      while (b) {
         region[n].begin = (UINTa) b->ptr;
         region[n].end = region[n].begin + b->size;
         ++n;
         b = b->next;
      }
   }
   gfx_block_info *b = alloc->blocks[0].next;
   while (b != &alloc->blocks[0]) {
      region[n].begin = (UINTa) b->ptr;
      region[n].end = region[n].begin + b->size;
      ++n;
      b = b->next;
   }
   qsort(region, n, sizeof(region[0]), region_sort);
   for (i=0; i+1 < n; ++i) {
      assert(region[i].end == region[i+1].begin);
   }
}
#else
#define gfxalloc_check1(a)
#endif

#ifdef COMPLETE_DEBUG
static void verify_against_blocks(int num_regions, void *vptr, S32 len)
{
   U32 *ptr = (U32 *) vptr;
   // binary search for ptr amongst regions
   S32 s=0,e=num_regions-1;
   assert(len != 0);
   while (s < e) {
      S32 i = (s+e+1)>>1;
      // invariant:  b[s] <= ptr <= b[e]
      if (region[i].begin <= (UINTa) ptr)
         s = i;
      else
         e = i-1;

      // consider cases:
      //    s=0,e=1: i = 0, how do we get i to be 1?
   }
   // at this point, s >= e
   assert(s < num_regions && region[s].begin == (UINTa) ptr && (UINTa) ptr+len <= region[s].end);
}

static void debug_complete_check(gfx_allocator *alloc, void *ptr, S32 len, void *skip)
{
   GDrawHandleCache *c = alloc->cache;
   assert(alloc->max_allocs*2+1 < MAX_REGIONS);
   int i,n=0;
   for (i=0; i < GFXALLOC_HASH_SIZE; ++i) {
      gfx_block_info *b = alloc->hash[i];
      while (b) {
         region[n].begin = (UINTa) b->ptr;
         region[n].end = region[n].begin + b->size;
         ++n;
         b = b->next;
      }
   }
   gfx_block_info *b = alloc->blocks[0].next;
   while (b != &alloc->blocks[0]) {
      region[n].begin = (UINTa) b->ptr;
      region[n].end = region[n].begin + b->size;
      ++n;
      b = b->next;
   }
   for (i=0; i < n; ++i)
      assert(region[i].end > region[i].begin);
   qsort(region, n, sizeof(region[0]), region_sort);
   for (i=0; i+1 < n; ++i) {
      assert(region[i].end == region[i+1].begin);
   }

   if (ptr)
      verify_against_blocks(n, ptr, len);

   if (c) {
      GDrawHandle *t = c->head;
      while (t) {
         if (t->raw_ptr && t->raw_ptr != skip)
            verify_against_blocks(n, t->raw_ptr, t->bytes);
         t = t->next;
      }
      t = c->active;
      while (t) {
         if (t->raw_ptr && t->raw_ptr != skip)
            verify_against_blocks(n, t->raw_ptr, t->bytes);
         t = t->next;
      }
   }
}
#else
#define debug_complete_check(a,p,len,s)
#endif

#ifdef GFXALLOC_CHECK
static void gfxalloc_check2(gfx_allocator *alloc)
{
   int n=0;
   gfx_block_info *b = alloc->unused_list;
   while (b) {
      ++n;
      b = b->next;
   }
   GFXALLOC_ASSERT(n == alloc->num_unused);
   b = alloc->blocks->next;
   n = 0;
   while (b != alloc->blocks) {
      ++n;
      b = b->next;
   }
   GFXALLOC_ASSERT(n == alloc->num_free);
   GFXALLOC_ASSERT(alloc->num_blocks == alloc->num_unused + alloc->num_free + alloc->num_alloc);
}
#define gfxalloc_check(a) do { gfxalloc_check1(a); gfxalloc_check2(a); } while(0)
#else
#define gfxalloc_check2(a)
#define gfxalloc_check(a)
#endif


static gfx_block_info *gfxalloc_pop_unused(gfx_allocator *alloc)
{
   GFXALLOC_ASSERT(alloc->unused_list != NULL);
   GFXALLOC_ASSERT(alloc->unused_list->is_unused);
   GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_unused);)

   gfx_block_info *b = alloc->unused_list;
   alloc->unused_list = b->next;
   GFXALLOC_ASSERT(alloc->unused_list);
   b->is_unused = 0;
   GFXALLOC_IF_CHECK(--alloc->num_unused;)
   return b;
}

static void gfxalloc_push_unused(gfx_allocator *alloc, gfx_block_info *b)
{
   GFXALLOC_ASSERT(!b->is_unused);
   b->is_unused = 1;
   b->next = alloc->unused_list;
   alloc->unused_list = b;
   GFXALLOC_IF_CHECK(++alloc->num_unused);
}

static void gfxalloc_add_free(gfx_allocator *alloc, gfx_block_info *b)
{
   gfx_block_info *head = alloc->blocks;

   b->is_free = 1;
   b->next = head->next;
   b->prev = head;
   head->next->prev = b;
   head->next = b;
   GFXALLOC_IF_CHECK(++alloc->num_free;)
}

static void gfxalloc_rem_free(gfx_allocator *alloc, gfx_block_info *b)
{
   RR_UNUSED_VARIABLE(alloc);
   b->is_free = 0;
   b->prev->next = b->next;
   b->next->prev = b->prev;
   GFXALLOC_IF_CHECK(--alloc->num_free;)
}

static void gfxalloc_split_free(gfx_allocator *alloc, gfx_block_info *b, U32 pos)
{
   gfx_block_info *n = gfxalloc_pop_unused(alloc);

   GFXALLOC_ASSERT(b->is_free);
   GFXALLOC_ASSERT(pos > 0 && pos < b->size);

   // set up new free block
   n->ptr = b->ptr + pos;
   n->prev_phys = b;
   n->next_phys = b->next_phys;
   n->next_phys->prev_phys = n;
   n->size = b->size - pos;
   assert(n->size != 0);
   gfxalloc_add_free(alloc, n);

   // fix original block
   b->next_phys = n;
   b->size = pos;
   assert(b->size != 0);

debug_complete_check(alloc, n->ptr, n->size,0);
debug_complete_check(alloc, b->ptr, b->size,0);
}

static gfx_allocator *gfxalloc_create(void *mem, U32 mem_size, U32 align, U32 max_allocs)
{
   gfx_allocator *a;
   U32 i, max_blocks, size;

   if (!align || (align & (align - 1)) != 0) // align must be >0 and a power of 2
      return NULL;

   // for <= max_allocs live allocs, there's <= 2*max_allocs+1 blocks. worst case:
   // [free][used][free] .... [free][used][free]
   max_blocks = max_allocs * 2 + 1;
   size = sizeof(gfx_allocator) + max_blocks * sizeof(gfx_block_info);
   a = (gfx_allocator *) IggyGDrawMalloc(size);
   if (!a)
      return NULL;

   memset(a, 0, size);

   GFXALLOC_IF_CHECK(a->num_blocks = max_blocks;)
   GFXALLOC_IF_CHECK(a->num_alloc  = 0;)
   GFXALLOC_IF_CHECK(a->num_free   = 1;)
   GFXALLOC_IF_CHECK(a->num_unused = max_blocks-1;)

   GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(a->num_blocks == a->num_alloc + a->num_free + a->num_unused);)
   GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(a->num_free <= a->num_blocks+1);)

   a->actual_bytes_free = mem_size;
   a->mem_base = (U8 *) mem;
   a->mem_end = a->mem_base + mem_size;
   a->max_allocs = max_allocs;
   a->block_align = align;
   a->block_shift = 0;
   while ((1u << a->block_shift) < a->block_align)
      a->block_shift++;

   // init sentinel block
   a->blocks[0].prev = a->blocks[0].next = &a->blocks[1]; // point to free block
   a->blocks[0].prev_phys = a->blocks[0].next_phys = &a->blocks[1]; // same

   // init first free block
   a->blocks[1].ptr = a->mem_base;
   a->blocks[1].prev = a->blocks[1].next = &a->blocks[0];
   a->blocks[1].prev_phys = a->blocks[1].next_phys = &a->blocks[0];
   a->blocks[1].is_free = 1;
   a->blocks[1].size = mem_size;

   // init "unused" list
   a->unused_list = a->blocks + 2;
   for (i=2; i < max_blocks; i++) {
      a->blocks[i].is_unused = 1;
      a->blocks[i].next = a->blocks + (i + 1);
   }
   a->blocks[i].is_unused = 1;

   gfxalloc_check(a);
   debug_complete_check(a, NULL, 0,0);
   return a;
}

static void *gfxalloc_alloc(gfx_allocator *alloc, U32 size_in_bytes)
{
   gfx_block_info *cur, *best = NULL;
   U32 i, best_wasted = ~0u;
   U32 size = size_in_bytes;
debug_complete_check(alloc, NULL, 0,0);
gfxalloc_check(alloc);
   GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_blocks == alloc->num_alloc + alloc->num_free + alloc->num_unused);)
   GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_free <= alloc->num_blocks+1);)


   // round up to multiple of our block alignment
   size = (size + alloc->block_align-1) & ~(alloc->block_align - 1);
   assert(size >= size_in_bytes);
   assert(size != 0);

   // find best fit among all free blocks. this is O(N)!
   for (cur = alloc->blocks[0].next; cur != alloc->blocks; cur = cur->next) {
      if (cur->size >= size) {
         U32 wasted = cur->size - size;
         if (wasted < best_wasted) {
            best_wasted = wasted;
            best = cur;
            if (!wasted) break; // can't get better than perfect
         }
      }
   }

   // return the best fit, if we found any suitable block
   if (best) {
debug_check_overlap(alloc->cache, best->ptr, best->size);
      // split off allocated part
      if (size != best->size)
         gfxalloc_split_free(alloc, best, size);
debug_complete_check(alloc, best->ptr, best->size,0);

      // remove from free list and add to allocated hash table
      GFXALLOC_ASSERT(best->size == size);
      gfxalloc_rem_free(alloc, best);

      i = gfxalloc_get_hash_code(alloc, best->ptr);
      best->next = alloc->hash[i];
      alloc->hash[i] = best;
      alloc->actual_bytes_free -= size;
      GFXALLOC_ASSERT(alloc->actual_bytes_free >= 0);

      GFXALLOC_IF_CHECK(++alloc->num_alloc;)
      GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_blocks == alloc->num_alloc + alloc->num_free + alloc->num_unused);)
      GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_free <= alloc->num_blocks+1);)

debug_complete_check(alloc, best->ptr, best->size,0);
gfxalloc_check(alloc);
debug_check_overlap(alloc->cache, best->ptr, best->size);
      return best->ptr;
   } else
      return NULL; // not enough space!
}

static void gfxalloc_free(gfx_allocator *alloc, void *ptr)
{
   GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_blocks == alloc->num_alloc + alloc->num_free + alloc->num_unused);)
   GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_free <= alloc->num_blocks+1);)

   // find the block in the hash table
   gfx_block_info *b, *t, **prevnext;
   U32 i = gfxalloc_get_hash_code(alloc, ptr);

   prevnext = &alloc->hash[i];
   b = alloc->hash[i];

   while (b) {
      if (b->ptr == ptr) break;
      prevnext = &b->next;
      b = b->next;
   }

   if (!b) {
      GFXALLOC_ASSERT(0); // trying to free a non-allocated block
      return;
   }

debug_complete_check(alloc, b->ptr, b->size, 0);
   GFXALLOC_IF_CHECK(--alloc->num_alloc;)

   // remove it from the hash table
   *prevnext = b->next;

   alloc->actual_bytes_free += b->size;

   // merge with previous block if it's free, else add it to free list
   t = b->prev_phys;
   if (t->is_free) {
      t->size += b->size;
      t->next_phys = b->next_phys;
      t->next_phys->prev_phys = t;
      gfxalloc_push_unused(alloc, b);
      b = t;
   } else
      gfxalloc_add_free(alloc, b);

   // try to merge with next block
   t = b->next_phys;
   if (t->is_free) {
      b->size += t->size;
      b->next_phys = t->next_phys;
      t->next_phys->prev_phys = b;
      gfxalloc_rem_free(alloc, t);
      gfxalloc_push_unused(alloc, t);
   }
debug_complete_check(alloc, 0, 0, ptr);
gfxalloc_check(alloc);
   GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_blocks == alloc->num_alloc + alloc->num_free + alloc->num_unused);)
   GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_free <= alloc->num_blocks+1);)
}

#ifdef GDRAW_MANAGE_MEM_TWOPOOL

static rrbool gfxalloc_is_empty(gfx_allocator *alloc)
{
   gfx_block_info *first_free = alloc->blocks[0].next;

   // we want to check whether there's exactly one free block that
   // covers the entire pool.
   if (first_free == alloc->blocks) // 0 free blocks
      return false;

   if (first_free->next != alloc->blocks) // >1 free block
      return false;

   return first_free->ptr == alloc->mem_base && first_free->ptr + first_free->size == alloc->mem_end;
}

static rrbool gfxalloc_mem_contains(gfx_allocator *alloc, void *ptr)
{
   return alloc->mem_base <= (U8*)ptr && (U8*)ptr < alloc->mem_end;
}

#endif

#ifdef GDRAW_DEBUG

static void gfxalloc_dump(gfx_allocator *alloc)
{
   static const char *type[] = {
      "allocated",
      "free",
   };

   for (gfx_block_info *b = alloc->blocks[0].next_phys; b != alloc->blocks; b=b->next_phys) {
      U8 *start = b->ptr;
      U8 *end = b->ptr + b->size;
      printf("%p-%p: %s (%d bytes)\n",  start, end, type[b->is_free], b->size);
   }
}

#endif

#endif

#ifdef GDRAW_DEFRAGMENT

#define GDRAW_DEFRAGMENT_may_overlap   1  // self-overlap for individual copies is OK

// Defragmentation code for graphics memory.
// The platform implementation must provide a GPU memcpy function and handle all necessary
// synchronization. It must also adjust its resource descriptors to match the new addresses
// after defragmentation.

static void gdraw_gpu_memcpy(GDrawHandleCache *c, void *dst, void *src, U32 num_bytes);

static void gdraw_Defragment_memmove(GDrawHandleCache *c, U8 *dst, U8 *src, U32 num_bytes, U32 flags, GDrawStats *stats)
{
   if (dst == src)
      return;

   assert(num_bytes != 0);

   stats->nonzero_flags |= GDRAW_STATS_defrag;
   stats->defrag_objects += 1;
   stats->defrag_bytes += num_bytes;

   if ((flags & GDRAW_DEFRAGMENT_may_overlap) || dst + num_bytes <= src || src + num_bytes <= dst) // no problematic overlap
      gdraw_gpu_memcpy(c, dst, src, num_bytes);
   else {
      // need to copy in multiple chunks
      U32 chunk_size, pos=0;
      if (dst < src)
         chunk_size = (U32) (src - dst);
      else
         chunk_size = (U32) (dst - src);

      while (pos < num_bytes) {
         U32 amount = num_bytes - pos;
         if (amount > chunk_size) amount = chunk_size;
         gdraw_gpu_memcpy(c, dst + pos, src + pos, amount);
         pos += amount;
      }
   }
}

static rrbool gdraw_CanDefragment(GDrawHandleCache *c)
{
   // we can defragment (and extract some gain from it) if and only if there's more
   // than one free block. since gfxalloc coalesces free blocks immediately and keeps
   // them in a circular linked list, this is very easy to detect: just check if the
   // "next" pointer of the first free block points to the sentinel. (this is only
   // the case if there are 0 or 1 free blocks)
   gfx_allocator *alloc = c->alloc;
   return alloc->blocks[0].next->next != alloc->blocks;
}

static void gdraw_DefragmentMain(GDrawHandleCache *c, U32 flags, GDrawStats *stats)
{
   gfx_allocator *alloc = c->alloc;
   gfx_block_info *b, *n;
   U8 *p;
   S32 i;

   GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_blocks == alloc->num_alloc + alloc->num_free + alloc->num_unused);)
   GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_free <= alloc->num_blocks+1);)

   // go over all allocated memory blocks and clear the "prev" pointer
   // (unused for allocated blocks, we'll use it to store a back-pointer to the corresponding handle)
   for (b = alloc->blocks[0].next_phys; b != alloc->blocks; b=b->next_phys)
      if (!b->is_free)
         b->prev = NULL;

   // go through all handles and store a pointer to the handle in the corresponding memory block
   for (i=0; i < c->max_handles; i++)
      if (c->handle[i].raw_ptr) {
         assert(c->handle[i].bytes != 0);
         for (b=alloc->hash[gfxalloc_get_hash_code(alloc, c->handle[i].raw_ptr)]; b; b=b->next)
            if (b->ptr == c->handle[i].raw_ptr) {
               void *block = &c->handle[i];
               b->prev = (gfx_block_info *) block;
               break;
            }

         GFXALLOC_ASSERT(b != NULL); // didn't find this block anywhere!
      }

   // clear alloc hash table (we rebuild it during defrag)
   memset(alloc->hash, 0, sizeof(alloc->hash));

   // defragmentation proper: go over all blocks again, remove all free blocks from the physical
   // block list and compact the remaining blocks together.
   p = alloc->mem_base;
   for (b = alloc->blocks[0].next_phys; b != alloc->blocks; b=n) {
      n = b->next_phys;

      if (!b->is_free) {
         U32 h;

         // move block if necessary
         if (p != b->ptr) {
            assert(b->size != 0);
            gdraw_Defragment_memmove(c, p, b->ptr, b->size, flags, stats);
            b->ptr = p;
            assert(b->prev);
            if (b->prev)
               ((GDrawHandle *) b->prev)->raw_ptr = p;
         }

         // re-insert into hash table
         h = gfxalloc_get_hash_code(alloc, p);
         b->next = alloc->hash[h];
         alloc->hash[h] = b;

         p += b->size;
      } else {
         // free block: remove it from the physical block list
         b->prev_phys->next_phys = b->next_phys;
         b->next_phys->prev_phys = b->prev_phys;
         gfxalloc_rem_free(alloc, b);
         gfxalloc_push_unused(alloc, b);
      }
   }
   // the free list should be empty now
   assert(alloc->blocks[0].next == &alloc->blocks[0]);

   // unless all memory is allocated, we now need to add a new block for the free space at the end
   if (p != alloc->mem_end) {
      b = gfxalloc_pop_unused(alloc);

      b->ptr = p;
      b->prev_phys = alloc->blocks[0].prev_phys;
      b->next_phys = &alloc->blocks[0];
      b->prev_phys->next_phys = b;
      b->next_phys->prev_phys = b;
      b->size = alloc->mem_end - p;
      gfxalloc_add_free(alloc, b);
   }

   GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_blocks == alloc->num_alloc + alloc->num_free + alloc->num_unused);)
   GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_free <= alloc->num_blocks+1);)
}

#endif

#ifdef GDRAW_MANAGE_MEM_TWOPOOL

// Defragmentation code for graphics memory, using two-pool strategy.
//
// The platform implementation must provide a GPU memcpy function and handle
// all necessary synchronization. It must also adjust its resource descriptors
// to match the new addresses after defragmentation.
//
// The high concept for two-pool is that we can't update the resource pools
// mid-frame; instead, while preparing for a frame, we need to produce a memory
// configuration that is suitable for rendering a whole frame at once (in
// contrast to our normal incremental strategy, where we can decide to
// defragment mid-frame if things are getting desperate). This is for tiled
// renderers.
//
// Two-pool works like this:
// - As the name suggests, each handle cache has two memory pools and corresponding backing
//   allocators. The currently used allocator, "alloc", and a second allocator, "alloc_other".
// - Any resource used in a command buffer gets locked and *stays locked* until we're done
//   preparing that command buffer (i.e. no unlocking after every draw as in the normal
//   incremental memory management).
// - All allocations happen from "alloc", always. We mostly do our normal LRU cache freeing
//   to make space when required.
// - We can still run out of space (no surprise) and get into a configuration where we have
//   to defragment. This is the only tricky part, and where the second pool comes in. To
//   defragment, we switch the roles of "alloc" and "alloc_other", and allocate new backing
//   storage for all currently "locked" and "pinned" resources (i.e. everything we've used
//   in the currently pending frame). 
// - In general, we have the invariant that all resources we're using for batches we're
//   working on must be in the "alloc" (fresh) pool, not in the "other" (stale) pool.
//   Therefore, after a defragment/pool switch, any "live" resource (which means it's
//   present in the stale pool) has to be copied to the "fresh" pool as it's getting
//   locked to maintain this invariant.
//
// What this does is give us a guarantee that any given frame either only
// references resources in one pool (the common case), or does a defragment, in
// which case it looks like this:
//
//   +------------------------------+
//   |                              |
//   |                              |  pool A is fresh (=alloc), pool B is stale (=alloc_other)
//   |                              |  all resources referenced in here are in pool A
//   |                              |
//   |                              |
//   |                              |
//   +------------------------------+  <-- defragment! pools flip roles here
//   |                              |
//   |                              |
//   |                              |  pool B is fresh (=alloc), pool A is stale (=alloc_other)
//   |                              |  all resources referenced in here are in pool B
//   |                              |
//   +------------------------------+
//
// Now, at the end of the frame, we need to decide what to do with the
// resources that remain "live" (i.e. they're in the old pool but weren't
// referenced in the current frame so they didn't get copied). As of this
// writing, we simply free them, to maximize the amount of free memory in the
// new pool (and hopefully minimize the chance that we'll have to defragment
// again soon). It would also be possible to copy some of them though, assuming
// there's enough space.
//
// Freeing resources is an interesting case. When the CPU side of GDraw does a
// "free", we can't immediately reclaim the resource memory, since the GPU will
// generally still have outstanding commands that reference that resource. So
// our freed resources first enter the "Dead" state and only actually get freed
// once the GPU is done with them. What this means is that the list of
// resources in the "dead" state can end up holding references to both the
// fresh and the stale pool; the free implementation needs to be aware of this
// and return the memory to the right allocator.
//
// When we defragment, it's important to make sure that the pool we're flipping
// to is actually empty. What this means is that right before a defragment, we
// need to wait for all stale "dead" resources to actually become free. If the
// last defragment was several frames ago, this is fast - we haven't generated
// any new commands referencing the stale resources in several frames, so most
// likely they're all immediately free-able.  By contrast, if we just
// defragmented last frame, this will be a slow operation since we need to wait
// for the GPU pipeline to drain - but if you're triggering defragments in
// several consecutive frames, you're thrashing the resource pools badly and
// are getting really bad performance anyway.

static void gdraw_gpu_memcpy(GDrawHandleCache *c, void *dst, void *src, U32 num_bytes);
static void gdraw_gpu_wait_for_transfer_completion();
static void gdraw_resource_moved(GDrawHandle *t);

static rrbool gdraw_CanDefragment(GDrawHandleCache *c)
{
   // we can defragment (and extract some gain from it) if and only if there's more
   // than one free block. since gfxalloc coalesces free blocks immediately and keeps
   // them in a circular linked list, this is very easy to detect: just check if the
   // "next" pointer of the first free block points to the sentinel. (this is only
   // the case if there are 0 or 1 free blocks)
   gfx_allocator *alloc = c->alloc;
   if (!c->alloc_other) // if we don't have a second pool, we can't defrag at all.
      return false;
   return alloc->blocks[0].next->next != alloc->blocks;
}

static rrbool gdraw_MigrateResource(GDrawHandle *t, GDrawStats *stats)
{
   GDrawHandleCache *c = t->cache;
   void *ptr = NULL;

   assert(t->state == GDRAW_HANDLE_STATE_live || t->state == GDRAW_HANDLE_STATE_locked || t->state == GDRAW_HANDLE_STATE_pinned);
   // anything we migrate should be in the "other" (old) pool
   assert(gfxalloc_mem_contains(c->alloc_other, t->raw_ptr));

   ptr = gfxalloc_alloc(c->alloc, t->bytes);
   if (ptr) {
      // update stats
      stats->nonzero_flags |= GDRAW_STATS_defrag;
      stats->defrag_objects += 1;
      stats->defrag_bytes += t->bytes;

      // copy contents to new storage
      gdraw_gpu_memcpy(c, ptr, t->raw_ptr, t->bytes);

      // free old storage
      gfxalloc_free(c->alloc_other, t->raw_ptr);

      // adjust pointers to point to new location
      t->raw_ptr = ptr;
      gdraw_resource_moved(t);

      return true;
   } else
      return false;
}

static rrbool gdraw_MigrateAllResources(GDrawHandle *sentinel, GDrawStats *stats)
{
   GDrawHandle *h;
   for (h = sentinel->next; h != sentinel; h = h->next) {
      if (!gdraw_MigrateResource(h, stats))
         return false;
   }
   return true;
}

static rrbool gdraw_TwoPoolDefragmentMain(GDrawHandleCache *c, GDrawStats *stats)
{
   gfx_allocator *t;

   // swap allocators
   t = c->alloc;
   c->alloc = c->alloc_other;
   c->alloc_other = t;

   // immediately migrate all currently pinned and locked resources
   rrbool ok = true;
   ok = ok && gdraw_MigrateAllResources(&c->state[GDRAW_HANDLE_STATE_pinned], stats);
   ok = ok && gdraw_MigrateAllResources(&c->state[GDRAW_HANDLE_STATE_locked], stats);

   return ok;
}

static rrbool gdraw_StateListIsEmpty(GDrawHandle *head)
{
   // a list is empty when the head sentinel is the only node
   return head->next == head;
}

static void gdraw_CheckAllPointersUpdated(GDrawHandle *head)
{
#ifdef GDRAW_DEBUG
   GDrawHandle *h;
   for (h = head->next; h != head; h = h->next) {
      assert(gfxalloc_mem_contains(h->cache->alloc, h->raw_ptr));
   }
#endif
}

static void gdraw_PostDefragmentCleanup(GDrawHandleCache *c, GDrawStats *stats)
{
   // if we defragmented during this scene, this is the spot where
   // we need to nuke all references to resources that weren't
   // carried over into the new pool.
   if (c->did_defragment) {
      GDrawHandle *h;

      // alloc list should be empty at this point
      assert(gdraw_StateListIsEmpty(&c->state[GDRAW_HANDLE_STATE_alloc]));

      // free all remaining live resources (these are the resources we didn't
      // touch this frame, hence stale)
      h = &c->state[GDRAW_HANDLE_STATE_live];
      while (!gdraw_StateListIsEmpty(h))
         gdraw_res_free(h->next, stats);

      // "live" is now empty, and we already checked that "alloc" was empty
      // earlier. "dead" may hold objects on the old heap still (that were freed
      // before we swapped allocators). "user owned" is not managed by us.
      // that leaves "locked" and "pinned" resources, both of which better be
      // only pointing into the new heap now!
      gdraw_CheckAllPointersUpdated(&c->state[GDRAW_HANDLE_STATE_locked]);
      gdraw_CheckAllPointersUpdated(&c->state[GDRAW_HANDLE_STATE_pinned]);

      gdraw_gpu_wait_for_transfer_completion();
   }
}

#endif

// Image processing code

// Compute average of 4 RGBA8888 pixels passed as U32.
// Variables are named assuming the values are stored as big-endian, but all bytes
// are treated equally, so this code will work just fine on little-endian data.
static U32 gdraw_Avg4_rgba8888(U32 p0, U32 p1, U32 p2, U32 p3)
{
   U32 mask = 0x00ff00ff;
   U32 bias = 0x00020002;
   
   U32 gasum = ((p0 >> 0) & mask) + ((p1 >> 0) & mask) + ((p2 >> 0) & mask) + ((p3 >> 0) & mask) + bias;
   U32 rbsum = ((p0 >> 8) & mask) + ((p1 >> 8) & mask) + ((p2 >> 8) & mask) + ((p3 >> 8) & mask) + bias;

   return ((gasum >> 2) & mask) | ((rbsum << 6) & ~mask);
}

// Compute average of 2 RGBA8888 pixels passed as U32
static U32 gdraw_Avg2_rgba8888(U32 p0, U32 p1)
{
   return (p0 | p1) - (((p0 ^ p1) >> 1) & 0x7f7f7f7f);
}

// 2:1 downsample in both horizontal and vertical direction, for one line.
// width is width of destination line.
static void gdraw_Downsample_2x2_line(U8 *dst, U8 *line0, U8 *line1, U32 width, U32 bpp)
{
   U32 x;
   if (bpp == 4) {
      U32 *in0 = (U32 *) line0;
      U32 *in1 = (U32 *) line1;
      U32 *out = (U32 *) dst;
      for (x=0; x < width; x++, in0 += 2, in1 += 2)
         *out++ = gdraw_Avg4_rgba8888(in0[0], in0[1], in1[0], in1[1]);
   } else if (bpp == 1) {
      for (x=0; x < width; x++, line0 += 2, line1 += 2)
         *dst++ = (line0[0] + line0[1] + line1[0] + line1[1] + 2) / 4;
   } else
      RR_BREAK();
}

// 2:1 downsample in horizontal but not vertical direction.
static void gdraw_Downsample_2x1_line(U8 *dst, U8 *src, U32 width, U32 bpp)
{
   U32 x;
   if (bpp == 4) {
      U32 *in = (U32 *) src;
      U32 *out = (U32 *) dst;
      for (x=0; x < width; x++, in += 2)
         *out++ = gdraw_Avg2_rgba8888(in[0], in[1]);
   } else if (bpp == 1) {
      for (x=0; x < width; x++, src += 2)
         *dst++ = (src[0] + src[1] + 1) / 2;
   } else
      RR_BREAK();
}

// 2:1 downsample in vertical but not horizontal direction.
static void gdraw_Downsample_1x2(U8 *dst, S32 dstpitch, U8 *src, S32 srcpitch, U32 height, U32 bpp)
{
   U32 y;
   if (bpp == 4) {
      for (y=0; y < height; y++, dst += dstpitch, src += 2*srcpitch)
         *((U32 *) dst) = gdraw_Avg2_rgba8888(*((U32 *) src), *((U32 *) (src + srcpitch)));
   } else if (bpp == 1) {
      for (y=0; y < height; y++, dst += dstpitch, src += 2*srcpitch)
         *dst = (src[0] + src[srcpitch] + 1) / 2;
   } else
      RR_BREAK();
}

// 2:1 downsample (for mipmaps)
// dst: Pointer to destination buffer
// dstpitch: Pitch for destination buffer
// width: Width of *destination* image (i.e. downsampled version)
// height: Height of *destination* image (i.e. downsampled version)
// src: Pointer to source buffer
// srcpitch: Pitch of source buffer
// bpp: Bytes per pixel for image data
//
// can be used for in-place resizing if src==dst and dstpitch <= srcpitch!
static GDRAW_MAYBE_UNUSED void gdraw_Downsample(U8 *dst, S32 dstpitch, U32 width, U32 height, U8 *src, S32 srcpitch, U32 bpp)
{
   U32 y;
   assert(bpp == 1 || bpp == 4);

   // @TODO gamma?
   if (!height) // non-square texture, height was reduced to 1 in a previous step
      gdraw_Downsample_2x1_line(dst, src, width, bpp);
   else if (!width) // non-square texture, width was reduced to 1 in a previous step
      gdraw_Downsample_1x2(dst, dstpitch, src, srcpitch, height, bpp);
   else {
      for (y=0; y < height; y++) {
         gdraw_Downsample_2x2_line(dst, src, src + srcpitch, width, bpp);
         dst += dstpitch;
         src += 2*srcpitch;
      }
   }
}

#ifndef GDRAW_NO_STREAMING_MIPGEN

#define GDRAW_MAXMIPS 16 // maximum number of mipmaps supported.

typedef struct GDrawMipmapContext {
   U32 width; // width of the texture being mipmapped
   U32 height; // height of the texture being mipmapped
   U32 mipmaps; // number of mipmaps
   U32 bpp; // bytes per pixel

   U32 partial_row; // bit N: is mipmap N currently storing a partial row?
   U32 bheight; // height of the buffer at miplevel 0
   U8 *pixels[GDRAW_MAXMIPS];
   U32 pitch[GDRAW_MAXMIPS];
} GDrawMipmapContext;

static rrbool gdraw_MipmapBegin(GDrawMipmapContext *c, U32 width, U32 height, U32 mipmaps, U32 bpp, U8 *buffer, U32 buffer_size)
{
   U32 i;
   U8 *p;

   if (mipmaps > GDRAW_MAXMIPS)
      return false;

   c->width = width;
   c->height = height;
   c->mipmaps = mipmaps;
   c->bpp = bpp;
   c->partial_row = 0;

   // determine how many lines to buffer
   // we try to use roughly 2/3rds of the buffer for the first miplevel (less than 3/4 since with our
   // partial line buffers, we have extra buffer space for lower mip levels).
   c->bheight = (2 * buffer_size) / (3 * width * bpp);

   // round down to next-smaller power of 2 (in case we need to swizzle; swizzling works on pow2-sized blocks)
   while (c->bheight & (c->bheight-1)) // while not a power of 2...
      c->bheight &= c->bheight - 1; // clear least significant bit set

   // then keep lowering the number of buffered lines until they fit (or we reach zero, i.e. it doesn't fit)
   while (c->bheight) {
      p = buffer;
      for (i=0; i < c->mipmaps; i++) {
         U32 mw = c->width >> i;
         U32 bh = c->bheight >> i;
         if (!mw) mw++;
         if (!bh) mw *= 2, bh++; // need space for line of previous miplevel

         c->pixels[i] = p;
         c->pitch[i] = mw * bpp;
         p += c->pitch[i] * bh;
      }

      // if it fits, we're done
      if (p <= buffer + buffer_size) {
         if (c->bheight > height) // buffer doesn't need to be larger than the image!
            c->bheight = height;
         return true;
      }

      // need to try a smaller line buffer...
      c->bheight >>= 1;
   }

   // can't fit even one line into our buffer. ouch!
   return false;
}

// returns true if there was data generated for this miplevel, false otherwise.
static rrbool gdraw_MipmapAddLines(GDrawMipmapContext *c, U32 level)
{
   U32 bw,bh;

   assert(level > 0); // doesn't make sense to call this on level 0
   if (level == 0 || level >= c->mipmaps)
      return false; // this level doesn't exist

   bw = c->width >> level;    // buffer width at this level
   bh = c->bheight >> level;  // buffer height at this level

   if (bh) { // we can still do regular downsampling
      gdraw_Downsample(c->pixels[level], c->pitch[level], bw, bh, c->pixels[level-1], c->pitch[level-1], c->bpp);
      return true;
   } else if (c->height >> level) { // need to buffer partial lines, but still doing vertical 2:1 downsampling
      if ((c->partial_row ^= (1 << level)) & (1 << level)) { // no buffered partial row for this miplevel yet, make one
         memcpy(c->pixels[level], c->pixels[level-1], bw * 2 * c->bpp);
         return false;
      } else { // have one buffered row, can generate output pixels
         gdraw_Downsample_2x2_line(c->pixels[level], c->pixels[level], c->pixels[level-1], bw, c->bpp);
         return true;
      }
   } else { // finish off with a chain of Nx1 miplevels
      gdraw_Downsample_2x1_line(c->pixels[level], c->pixels[level-1], bw, c->bpp);
      return true;
   }
}

#endif // GDRAW_NO_STREAMING_MIPGEN

#ifdef GDRAW_CHECK_BLOCK
static void check_block_alloc(gfx_allocator *alloc, void *ptr, rrbool allocated)
{
   int i,n=0,m=0;
   for (i=0; i < GFXALLOC_HASH_SIZE; ++i) {
      gfx_block_info *b = alloc->hash[i];
      while (b) {
         if (b->ptr == ptr)
            ++n;
         b = b->next;
      }
   }
   gfx_block_info *b = alloc->blocks[0].next;
   while (b != &alloc->blocks[0]) {
      if (b->ptr == ptr)
         ++m;
      b = b->next;
   }
   if (allocated)
      assert(n == 1 && m == 0);
   else
      assert(n == 0 && m == 1);
}
#else
#define check_block_alloc(a,p,f)
#endif

#ifdef GDRAW_BUFFER_RING

////////////////////////////////////////////////////////////////////////
//
//   Buffer ring
//

// Implements a dynamic buffer backed by multiple physical buffers, with
// the usual append-only, DISCARD/NOOVERWRITE semantics.
//
// This can be used for dynamic vertex buffers, constant buffers, etc.
#define GDRAW_BUFRING_MAXSEGS 4     // max number of backing segments

typedef struct gdraw_bufring_seg {
   struct gdraw_bufring_seg *next;  // next segment in ring
   U8 *data;                        // pointer to the allocation
   GDrawFence fence;                // fence for this segment
   U32 used;                        // number of bytes used
} gdraw_bufring_seg;

typedef struct gdraw_bufring {
   gdraw_bufring_seg *cur;          // active ring segment
   U32 seg_size;                    // size of one segment
   U32 align;                       // alignment of segment allocations
   gdraw_bufring_seg all_segs[GDRAW_BUFRING_MAXSEGS];
} gdraw_bufring;

// forwards
static GDrawFence put_fence();
static void wait_on_fence(GDrawFence fence);

static void gdraw_bufring_init(gdraw_bufring * RADRESTRICT ring, void *ptr, U32 size, U32 nsegs, U32 align)
{
   U32 i, seg_size;

   ring->seg_size = 0;
   if (!ptr || nsegs < 1 || size < nsegs * align) // bail if no ring buffer memory or too small
      return;

   if (nsegs > GDRAW_BUFRING_MAXSEGS)
      nsegs = GDRAW_BUFRING_MAXSEGS;

   // align needs to be a positive power of two
   assert(align >= 1 && (align & (align - 1)) == 0);

   // buffer really needs to be properly aligned
   assert(((UINTa)ptr & (align - 1)) == 0);

   seg_size = (size / nsegs) & ~(align - 1);
   for (i=0; i < nsegs; ++i) {
      ring->all_segs[i].next = &ring->all_segs[(i + 1) % nsegs];
      ring->all_segs[i].data = (U8 *) ptr + i * seg_size;
      ring->all_segs[i].fence.value = 0;
      ring->all_segs[i].used = 0;
   }

   ring->cur = ring->all_segs;
   ring->seg_size = seg_size;
   ring->align = align;
}

static void gdraw_bufring_shutdown(gdraw_bufring * RADRESTRICT ring)
{
   ring->cur = NULL;
   ring->seg_size = 0;
}

static void *gdraw_bufring_alloc(gdraw_bufring * RADRESTRICT ring, U32 size, U32 align)
{
   U32 align_up;
   gdraw_bufring_seg *seg;

   if (size > ring->seg_size)
      return NULL; // nope, won't fit

   assert(align <= ring->align);

   // check if it fits in the active segment first
   seg = ring->cur;
   align_up = (seg->used + align - 1) & -align;

   if ((align_up + size) <= ring->seg_size) {
      void *ptr = seg->data + align_up;
      seg->used = align_up + size;
      return ptr;
   }

   // doesn't fit, we have to start a new ring segment.
   seg->fence = put_fence();

   // switch to the next segment, wait till GPU is done with it
   seg = ring->cur = seg->next;
   wait_on_fence(seg->fence);

   // allocate from the new segment. we assume that segment offsets
   // satisfy the highest alignment requirements we ever ask for!
   seg->used = size;
   return seg->data;
}

#endif

////////////////////////////////////////////////////////////////////////
//
//   General resource manager
//

#ifndef GDRAW_FENCE_FLUSH
#define GDRAW_FENCE_FLUSH()
#endif

#ifdef GDRAW_MANAGE_MEM
// functions the platform must implement
#ifndef GDRAW_BUFFER_RING // avoid "redundant redeclaration" warning
static void wait_on_fence(GDrawFence fence);
#endif
static rrbool is_fence_pending(GDrawFence fence);
static void gdraw_defragment_cache(GDrawHandleCache *c, GDrawStats *stats);

// functions we implement
static void gdraw_res_reap(GDrawHandleCache *c, GDrawStats *stats);
#endif

// If GDRAW_MANAGE_MEM is not #defined, this needs to perform the
// actual free using whatever API we're targeting.
//
// If GDRAW_MANAGE_MEM is #defined, the shared code handles the
// memory management part, but you might still need to update
// your state caching.
static void api_free_resource(GDrawHandle *r);

// Actually frees a resource and releases all allocated resources
static void gdraw_res_free(GDrawHandle *r, GDrawStats *stats)
{
   assert(r->state == GDRAW_HANDLE_STATE_live || r->state == GDRAW_HANDLE_STATE_locked || r->state == GDRAW_HANDLE_STATE_dead ||
      r->state == GDRAW_HANDLE_STATE_pinned || r->state == GDRAW_HANDLE_STATE_user_owned);

#ifdef GDRAW_MANAGE_MEM
   GDRAW_FENCE_FLUSH();

   // make sure resource isn't in use before we actually free the memory
   wait_on_fence(r->fence);
   if (r->raw_ptr) {
#ifndef GDRAW_MANAGE_MEM_TWOPOOL
      gfxalloc_free(r->cache->alloc, r->raw_ptr);
#else
      GDrawHandleCache *c = r->cache;
      if (gfxalloc_mem_contains(c->alloc, r->raw_ptr))
         gfxalloc_free(c->alloc, r->raw_ptr);
      else {
         assert(gfxalloc_mem_contains(c->alloc_other, r->raw_ptr));
         gfxalloc_free(c->alloc_other, r->raw_ptr);
      }
#endif
   }
#endif

   api_free_resource(r);

   stats->nonzero_flags |= GDRAW_STATS_frees;
   stats->freed_objects += 1;
   stats->freed_bytes += r->bytes;

   gdraw_HandleCacheFree(r);
}

// Frees the LRU resource in the given cache.
static rrbool gdraw_res_free_lru(GDrawHandleCache *c, GDrawStats *stats)
{
   GDrawHandle *r = gdraw_HandleCacheGetLRU(c);
   if (!r) return false;

   if (c->is_vertex && r->owner) // check for r->owner since it may already be killed (if player destroyed first)
      IggyDiscardVertexBufferCallback(r->owner, r);

   // was it referenced since end of previous frame (=in this frame)?
   // if some, we're thrashing; report it to the user, but only once per frame.
   if (c->prev_frame_end.value < r->fence.value && !c->is_thrashing) {
      IggyGDrawSendWarning(NULL, c->is_vertex ? "GDraw Thrashing vertex memory" : "GDraw Thrashing texture memory");
      c->is_thrashing = true;
   }

   gdraw_res_free(r, stats);
   return true;
}

static void gdraw_res_flush(GDrawHandleCache *c, GDrawStats *stats)
{
   c->is_thrashing = true; // prevents warnings being generated from free_lru
   gdraw_HandleCacheUnlockAll(c);
   while (gdraw_res_free_lru(c, stats))
      ;
}

static GDrawHandle *gdraw_res_alloc_outofmem(GDrawHandleCache *c, GDrawHandle *t, char const *failed_type)
{
   if (t)
      gdraw_HandleCacheAllocateFail(t);
   IggyGDrawSendWarning(NULL, c->is_vertex ? "GDraw Out of static vertex buffer %s" : "GDraw Out of texture %s", failed_type);
   return NULL;
}

#ifndef GDRAW_MANAGE_MEM

static GDrawHandle *gdraw_res_alloc_begin(GDrawHandleCache *c, S32 size, GDrawStats *stats)
{
   GDrawHandle *t;
   if (size > c->total_bytes)
      gdraw_res_alloc_outofmem(c, NULL, "memory (single resource larger than entire pool)");
   else {
      // given how much data we're going to allocate, throw out
      // data until there's "room" (this basically lets us use
      // managed memory and just bound our usage, without actually
      // packing it and being exact)
      while (c->bytes_free < size) {
         if (!gdraw_res_free_lru(c, stats)) {
            gdraw_res_alloc_outofmem(c, NULL, "memory");
            break;
         }
      }
   }

   // now try to allocate a handle
   t = gdraw_HandleCacheAllocateBegin(c);
   if (!t) {
      // it's possible we have no free handles, because all handles
      // are in use without exceeding the max storage above--in that
      // case, just free one texture to give us a free handle (ideally
      // we'd trade off cost of regenerating)
      if (gdraw_res_free_lru(c, stats)) {
         t = gdraw_HandleCacheAllocateBegin(c);
         if (t == NULL) {
            gdraw_res_alloc_outofmem(c, NULL, "handles");
         }
      }
   }
   return t;
}

#else

// Returns whether this resource holds pointers to one of the GDraw-managed
// pools.
static rrbool gdraw_res_is_managed(GDrawHandle *r)
{
   return r->state == GDRAW_HANDLE_STATE_live ||
      r->state == GDRAW_HANDLE_STATE_locked ||
      r->state == GDRAW_HANDLE_STATE_dead ||
      r->state == GDRAW_HANDLE_STATE_pinned;
}

// "Reaps" dead resources. Even if the user requests that a
// resource be freed, it might still be in use in a pending
// command buffer. So we can't free the associated memory
// immediately; instead, we flag the resource as "dead" and
// periodically check whether we can actually free the
// pending memory of dead resources ("reap" them).
static void gdraw_res_reap(GDrawHandleCache *c, GDrawStats *stats)
{
   GDrawHandle *sentinel = &c->state[GDRAW_HANDLE_STATE_dead];
   GDrawHandle *t;
   GDRAW_FENCE_FLUSH();

   // reap all dead resources that aren't in use anymore
   while ((t = sentinel->next) != sentinel && !is_fence_pending(t->fence))
      gdraw_res_free(t, stats);
}

// "Kills" a resource. This means GDraw won't use it anymore
// (it's dead), but there might still be outstanding references
// to it in a pending command buffer, so we can't physically
// free the associated memory until that's all processed.
static void gdraw_res_kill(GDrawHandle *r, GDrawStats *stats)
{
   GDRAW_FENCE_FLUSH(); // dead list is sorted by fence index - make sure all fence values are current.

   r->owner = NULL;
   gdraw_HandleCacheInsertDead(r);
   gdraw_res_reap(r->cache, stats);
}

static GDrawHandle *gdraw_res_alloc_begin(GDrawHandleCache *c, S32 size, GDrawStats *stats)
{
   GDrawHandle *t;
   void *ptr = NULL;

   gdraw_res_reap(c, stats); // NB this also does GDRAW_FENCE_FLUSH();
   if (size > c->total_bytes)
      return gdraw_res_alloc_outofmem(c, NULL, "memory (single resource larger than entire pool)");
   
   // now try to allocate a handle
   t = gdraw_HandleCacheAllocateBegin(c);
   if (!t) {
      // it's possible we have no free handles, because all handles
      // are in use without exceeding the max storage above--in that
      // case, just free one texture to give us a free handle (ideally
      // we'd trade off cost of regenerating)
      gdraw_res_free_lru(c, stats);
      t = gdraw_HandleCacheAllocateBegin(c);
      if (!t)
         return gdraw_res_alloc_outofmem(c, NULL, "handles");
   }

   // try to allocate first
   if (size) {
      ptr = gfxalloc_alloc(c->alloc, size);
      if (!ptr) {
         // doesn't currently fit. try to free some allocations to get space to breathe.
         S32 want_free = RR_MAX(size + (size / 2), GDRAW_MIN_FREE_AMOUNT);
         if (want_free > c->total_bytes)
            want_free = size; // okay, *really* big resource, just try to allocate its real size

         // always keep freeing textures until want_free bytes are free.
         while (c->alloc->actual_bytes_free < want_free) {
            if (!gdraw_res_free_lru(c, stats))
               return gdraw_res_alloc_outofmem(c, t, "memory");
         }

         // now, keep trying to allocate and free some more memory when it still doesn't fit
         while (!(ptr = gfxalloc_alloc(c->alloc, size))) {
            if (c->alloc->actual_bytes_free >= 3 * size || // if we should have enough free bytes to satisfy the request by now
               (c->alloc->actual_bytes_free >= size && size * 2 >= c->total_bytes)) // or the resource is very big and the alloc doesn't fit
            {
               // before we actually consider defragmenting, we want to free all stale resources (not
               // referenced in the previous 2 frames). and if that frees up enough memory so we don't have
               // to defragment, all the better!
               // also, never defragment twice in a frame, just assume we're thrashing when we get in that
               // situation and free up as much as possible.
               if (!c->did_defragment &&
                   c->prev_frame_start.value <= c->handle->fence.value) {

                  // defragment.
   defrag:
                  if (gdraw_CanDefragment(c)) { // only try defrag if it has a chance of helping.
                     gdraw_defragment_cache(c, stats);
                     c->did_defragment = true;
                  }
                  ptr = gfxalloc_alloc(c->alloc, size);
                  if (!ptr)
                     return gdraw_res_alloc_outofmem(c, t, "memory (fragmentation)");
                  break;
               }
            }

            // keep trying to free some more
            if (!gdraw_res_free_lru(c, stats)) {
               if (c->alloc->actual_bytes_free >= size) // nothing left to free but we should be good - defrag again, even if it's the second time in a frame
                  goto defrag;

               return gdraw_res_alloc_outofmem(c, t, "memory");
            }
         }
      }
   }

   t->fence.value = 0; // hasn't been used yet
   t->raw_ptr = ptr;
   return t;
}

#endif