Files
mc-lce/Minecraft.Client/Durango/Iggy/gdraw/gdraw_shared.inl
2026-03-01 02:38:58 +02:00

2596 lines
88 KiB
C++

// gdraw_shared.inl - author: Sean Barrett - copyright 2010 RAD Game Tools
//
// This file implements some common code that can be shared across
// all the sample implementations of GDraw.
// Assertion support: defining IGGY_DISABLE_GDRAW_ASSERT replaces assert()
// with an empty macro (its argument is then never evaluated); otherwise
// the standard <assert.h> version is used.
#ifdef IGGY_DISABLE_GDRAW_ASSERT
#define assert(x)
#else
#include <assert.h>
#endif
// GDRAW_MAYBE_UNUSED expands to nothing unless it was already defined
// before this file was included.
#ifndef GDRAW_MAYBE_UNUSED
#define GDRAW_MAYBE_UNUSED
#endif
///////////////////////////////////////////////////////////////
//
// GDrawHandleCache manages resource "handles" used by Iggy
// (i.e. these handles wrap the platform resource handles,
// and this file provides those wrappers and facilities for
// LRU tracking them). Moreover, for console platforms, we
// actually implement our own managed resource pools.
//
// This is the main state machine when GDRAW_MANAGE_MEM is defined:
// (which covers all console platforms)
//
//       +------+         +--------+    |
//       | Live |<------->| Locked |    |
//       +------+         +--------+    |
//       /      \                ^      |
//      /        \                \     |
//     v          v                \    |
// +------+    +------+    +------+ |   |
// | Dead |--->| Free |<---| User | |   |
// +------+    +------+    +------+ |   |
//    ^        ^      ^       ^     |   |
//     \      /        \      |     |   |
//      \    /          v     |     |   |
//    +--------+         +-------+ /    |
//    | Pinned |<--------| Alloc |/     |
//    +--------+         +-------+      |
//
// "Free" handles are not in use and available for allocation.
// "Alloc" handles have been assigned by GDraw, but do not yet
// have a system resource backing them. Resources stay in
// this state until we know for sure that we're going
// to be able to successfully complete creation, at which
// point the resource transitions to one of the regular states.
// "Live" handles correspond to resources that may be used
// for rendering. They are kept in LRU order. Old resources
// may be evicted to make space.
// "Locked" handles cover resources that are going to be used
// in the next draw command. Once a resource is marked locked,
// it may not be evicted until it's back to "Live".
// "Dead" handles describe resources that have been freed on the
// CPU side, but are still in use by the GPU. Their memory may
// only be reclaimed once the GPU is done with them, at which
// point they are moved to the "Free" list. Items on the "Dead"
// list appear ordered by the last time they were used by the
// GPU - "most stale" first.
// "Pinned" resources can be used in any draw call without getting
// locked first. They can never be LRU-freed, but their memory
// is still managed by GDraw. Currently this is only used for
// the Iggy font cache.
// "User" (user-owned) resources are exactly that. They act much like
// pinned resources, but their memory isn't managed by GDraw.
// When a user-owned resource is freed, we really need to free
// it immediately (instead of marking it as "dead"), which might
// necessitate stalling the CPU until the GPU is finished using
// that resource. Since we don't own the memory, delayed frees
// are not an option.
//
// Without GDRAW_MANAGE_MEM, there's no "Dead" resources, and all
// frees are performed immediately.
// Forward declarations: handles point back at their cache, and the cache
// embeds handles, so both tags must be known up front.
typedef struct GDrawHandleCache GDrawHandleCache;
typedef struct GDrawHandle GDrawHandle;
// Fence wrapper holding a single 64-bit value. Within this file fence
// values are only compared numerically (to keep the dead list ordered)
// and copied around for per-frame bookkeeping.
typedef struct
{
U64 value;
} GDrawFence;
// States a GDrawHandle can be in. Each value (except the sentinel) also
// indexes GDrawHandleCache::state[], which holds the sentinel node of the
// doubly-linked list containing every handle currently in that state.
typedef enum
{
GDRAW_HANDLE_STATE_free = 0,
GDRAW_HANDLE_STATE_live,
GDRAW_HANDLE_STATE_locked,
GDRAW_HANDLE_STATE_dead,
GDRAW_HANDLE_STATE_pinned,
GDRAW_HANDLE_STATE_user_owned,
GDRAW_HANDLE_STATE_alloc,
GDRAW_HANDLE_STATE__count, // number of real states (size of the per-state list array)
// not an actual state! it is the value stored in the 'state' field of list sentinel nodes
GDRAW_HANDLE_STATE_sentinel = GDRAW_HANDLE_STATE__count,
} GDrawHandleState;
// One resource handle. Every handle is always linked into exactly one of
// its cache's per-state lists via next/prev; the 'state' bitfield records
// which one. The trailing size comments give 32-bit/64-bit field sizes.
struct GDrawHandle
{
GDrawNativeHandle handle; // platform handle to a resource (variable size)
void * owner; // 4/8 // opaque handle used to allow freeing resources without calling back to owner
GDrawHandleCache * cache; // 4/8 // which cache this handle came from (NULL for list sentinels)
GDrawHandle * next,*prev; // 8/16 // doubly-linked list
#ifdef GDRAW_MANAGE_MEM
void * raw_ptr; // 4/8 // pointer to allocation - when you're managing memory manually
#ifdef GDRAW_CORRUPTION_CHECK
U32 cached_raw_value[4]; // snapshot of four probe words taken by debug_set_raw_value
rrbool has_check_value; // true while cached_raw_value holds a valid snapshot
#endif
#endif
GDrawFence fence; // 8 // (optional) platform fence for resource
// 4
U32 bytes:28; // estimated storage cost to allow setting a loose limit
U32 state:4; // state the handle is in (a GDrawHandleState value)
};
// validate alignment to make sure structure will pack correctly:
// sizeof(GDrawHandle) must be a multiple of 8 when __RAD64__ is defined
// and a multiple of 4 otherwise.
#ifdef __RAD64__
RR_COMPILER_ASSERT((sizeof(GDrawHandle) & 7) == 0);
#else
RR_COMPILER_ASSERT((sizeof(GDrawHandle) & 3) == 0);
#endif
// A pool of handles plus the bookkeeping needed to budget and recycle
// them. 'handle' is declared with one element but gdraw_HandleCacheInit
// indexes it up to max_handles-1, so the containing structure must
// reserve storage for the remaining handles directly after this struct.
struct GDrawHandleCache
{
S32 bytes_free; // remaining budget: total_bytes minus the 'bytes' of all allocated handles
S32 total_bytes; // budget passed to gdraw_HandleCacheInit
S32 max_handles; // number of entries in handle[]
U32 is_vertex : 1; // vertex buffers have different warning codes and generate discard callbacks
U32 is_thrashing : 1; // cleared every frame by gdraw_HandleCacheTick
U32 did_defragment : 1; // cleared every frame by gdraw_HandleCacheTick
// 29 unused bits
GDrawHandle state[GDRAW_HANDLE_STATE__count]; // sentinel nodes for all of the state lists
#ifdef GDRAW_MANAGE_MEM
struct gfx_allocator *alloc;
#endif
#ifdef GDRAW_MANAGE_MEM_TWOPOOL
struct gfx_allocator *alloc_other;
#endif
GDrawFence prev_frame_start, prev_frame_end; // fence value at start/end of previous frame, for thrashing detection
GDrawHandle handle[1]; // the rest of the handles must be stored right after this in the containing structure
};
#ifdef GDRAW_CORRUPTION_CHECK
// values for corruption checking
// Each constant names a checkpoint. debug_check_raw_values_if(c, v) runs
// the full check only when v matches the value GDRAW_CORRUPTION_CHECK was
// defined to (after clearing the bits set in GDRAW_CORRUPTION_MASK).
#define GDRAW_CORRUPTIONCHECK_renderbegin 0x10
#define GDRAW_CORRUPTIONCHECK_renderend 0x20
#define GDRAW_CORRUPTIONCHECK_nomoregdraw 0x30
#define GDRAW_CORRUPTIONCHECK_maketexbegin 0x40
#define GDRAW_CORRUPTIONCHECK_maketexend 0x50
#define GDRAW_CORRUPTIONCHECK_wrappedcreateend 0x60
#define GDRAW_CORRUPTIONCHECK_wrappedcreatebegin 0x61
#define GDRAW_CORRUPTIONCHECK_wrappeddestroyend 0x70
#define GDRAW_CORRUPTIONCHECK_wrappeddestroybegin 0x71
#define GDRAW_CORRUPTIONCHECK_allochandle 0x80
#define GDRAW_CORRUPTIONCHECK_allochandle_begin 0x81
#define GDRAW_CORRUPTIONCHECK_allochandle_postreap 0x82
#define GDRAW_CORRUPTIONCHECK_allochandle_postfree1 0x83
#define GDRAW_CORRUPTIONCHECK_allochandle_postfree2 0x84
#define GDRAW_CORRUPTIONCHECK_allochandle_postfree3 0x85
#define GDRAW_CORRUPTIONCHECK_allochandle_postalloc1 0x86
#define GDRAW_CORRUPTIONCHECK_allochandle_postalloc2 0x87
#define GDRAW_CORRUPTIONCHECK_allochandle_postalloc3 0x88
#define GDRAW_CORRUPTIONCHECK_allochandle_defrag 0x89
#define GDRAW_CORRUPTIONCHECK_freetex 0x90
// Returns the address of probe word number 'choice' (0..3) inside the
// allocation that 't' points at. The byte offset is a fixed bit pattern
// masked by (bytes-1) and with its low two bits cleared.
static U32 *debug_raw_address(GDrawHandle *t, int choice)
{
    static int offset_table[4] = { 0x555555, 0xaaaaaa, 0x333333, 0x6e6e6e };
    int probe_offset = offset_table[choice] & (t->bytes-1);

    probe_offset &= ~3;
    return (U32 *) ((U8 *) t->raw_ptr + probe_offset);
}
// Asserts that the allocation owned by 't' does not start inside the
// range [ptr, ptr+len). Handles with no allocation, or whose allocation
// starts exactly at 'ptr', are not checked.
static void debug_check_overlap_one(GDrawHandle *t, U8 *ptr, S32 len)
{
    assert(len >= 0);
    if (!t->raw_ptr || t->raw_ptr == ptr)
        return;
    assert(t->raw_ptr < ptr || t->raw_ptr >= ptr+len);
}
// Runs debug_check_overlap_one against every handle reachable from two
// NULL-terminated lists of the cache.
// NOTE(review): this reads c->head and c->active, but GDrawHandleCache as
// declared in this file has neither member (it keeps circular per-state
// lists in c->state[]). This looks like stale code that will not compile
// when GDRAW_CORRUPTION_CHECK is defined - confirm which state lists it
// should walk before enabling it.
static void debug_check_overlap(GDrawHandleCache *c, U8 *ptr, S32 len)
{
GDrawHandle *t = c->head;
while (t) {
debug_check_overlap_one(t, ptr, len);
t = t->next;
}
t = c->active;
while (t) {
debug_check_overlap_one(t, ptr, len);
t = t->next;
}
}
// For every handle that has a recorded snapshot (has_check_value), re-read
// the four probe words from its allocation and assert that they still
// match the values cached by debug_set_raw_value.
// NOTE(review): like debug_check_overlap, this walks c->head and c->active,
// which are not members of GDrawHandleCache as declared in this file -
// confirm the intended lists before enabling GDRAW_CORRUPTION_CHECK.
static void debug_check_raw_values(GDrawHandleCache *c)
{
GDrawHandle *t = c->head;
while (t) {
if (t->raw_ptr && t->has_check_value) {
int i;
for (i=0; i < 4; ++i) {
if (*debug_raw_address(t, i) != t->cached_raw_value[i]) {
//zlog("!Iggy texture corruption found\n");
//zlog("t=%p, t->raw_ptr=%p\n", t, t->raw_ptr);
//zlog("Cached values: %08x %08x %08x %08x\n", t->cached_raw_value[0], t->cached_raw_value[1], t->cached_raw_value[2], t->cached_raw_value[3]);
//zlog("Current values: %08x %08x %08x %08x\n", *debug_raw_address(t,0), *debug_raw_address(t,1), *debug_raw_address(t,2), *debug_raw_address(t,3));
assert(0);
}
}
// disabled extra check: no other handle may share this raw_ptr
#if 0
GDrawHandle *s;
check_block_alloc(c->alloc, t->raw_ptr, 1);
s = c->head;
while (s != t) {
assert(s->raw_ptr != t->raw_ptr);
s = s->next;
}
s = c->active;
while (s != NULL) {
assert(s->raw_ptr != t->raw_ptr);
s = s->next;
}
#endif
}
t = t->next;
}
// same check again for the second list
t = c->active;
while (t) {
if (t->raw_ptr && t->has_check_value) {
int i;
for (i=0; i < 4; ++i) {
if (*debug_raw_address(t, i) != t->cached_raw_value[i]) {
//zlog("!Iggy texture corruption found\n");
//zlog("t=%p, t->raw_ptr=%p\n", t, t->raw_ptr);
//zlog("Cached values: %08x %08x %08x %08x\n", t->cached_raw_value[0], t->cached_raw_value[1], t->cached_raw_value[2], t->cached_raw_value[3]);
//zlog("Current values: %08x %08x %08x %08x\n", *debug_raw_address(t,0), *debug_raw_address(t,1), *debug_raw_address(t,2), *debug_raw_address(t,3));
assert(0);
}
}
#if 0
GDrawHandle *s;
check_block_alloc(c->alloc, t->raw_ptr, 1);
s = c->active;
while (s != t) {
assert(s->raw_ptr != t->raw_ptr);
s = s->next;
}
#endif
}
t = t->next;
}
}
// Bits set in GDRAW_CORRUPTION_MASK are ignored when comparing checkpoint
// ids below; the default of 0 requires an exact match.
#ifndef GDRAW_CORRUPTION_MASK
#define GDRAW_CORRUPTION_MASK 0
#endif
// Runs debug_check_raw_values(c) only when checkpoint id 'v' equals the
// value GDRAW_CORRUPTION_CHECK is defined to (modulo the mask). The macro
// ends in a dangling "else", so the ';' written by the caller becomes the
// empty else-branch and the macro is safe inside an unbraced if/else.
#define debug_check_raw_values_if(c,v) \
if ((GDRAW_CORRUPTION_CHECK & ~GDRAW_CORRUPTION_MASK) == ((v) & ~GDRAW_CORRUPTION_MASK)) \
debug_check_raw_values(c); \
else
// Snapshots the four probe words of t's allocation into
// t->cached_raw_value and marks the snapshot valid. Does nothing for
// handles without an allocation.
static void debug_set_raw_value(GDrawHandle *t)
{
    int probe;

    if (!t->raw_ptr)
        return;

    for (probe = 0; probe < 4; ++probe)
        t->cached_raw_value[probe] = *debug_raw_address(t, probe);
    t->has_check_value = true;
}
// Invalidates t's probe snapshot so debug_check_raw_values skips it.
static void debug_unset_raw_value(GDrawHandle *t)
{
t->has_check_value = false;
}
// Asserts that no handle on the two walked lists has raw_ptr == ptr.
// NOTE(review): walks c->head and c->active, which are not members of
// GDrawHandleCache as declared in this file - confirm the intended lists
// before enabling GDRAW_CORRUPTION_CHECK.
static void debug_check_value_is_unreferenced(GDrawHandleCache *c, void *ptr)
{
GDrawHandle *t = c->head;
while (t) {
assert(t->raw_ptr != ptr);
t = t->next;
}
t = c->active;
while (t) {
assert(t->raw_ptr != ptr);
t = t->next;
}
}
#else
// Corruption checking disabled: every debug hook expands to nothing, so
// call sites compile away (and their arguments are never evaluated).
#define debug_check_overlap(c,p,len)
#define debug_set_raw_value(t)
#define debug_check_value_is_unreferenced(c,p)
#define debug_unset_raw_value(t)
#define debug_check_raw_values(c)
#define debug_check_raw_values_if(c,v)
#endif
#ifdef SUPERDEBUG
// SUPERDEBUG consistency check over every per-state list of cache 'c':
// sentinels must be tagged as sentinels and have no cache link, every
// member must belong to 'c', carry the state of the list it is on and be
// correctly doubly-linked. The dead list must additionally be sorted by
// non-decreasing fence value.
static void check_lists(GDrawHandleCache *c)
{
    GDrawHandle *head, *node;
    U32 which;

    for (which = 0; which < GDRAW_HANDLE_STATE__count; which++) {
        S32 num_nodes = 0;

        head = &c->state[which];
        assert(!head->cache);
        assert(head->state == GDRAW_HANDLE_STATE_sentinel);

        node = head->next;
        while (node != head) {
            ++num_nodes;
            assert(node->cache == c);
            assert(node->state == which);
            assert(node->prev->next == node);
            assert(node->next->prev == node);
            assert(num_nodes < 50000); // catches accidental cycles
            node = node->next;
        }
    }

    // dead list only: ascending fence order
    head = &c->state[GDRAW_HANDLE_STATE_dead];
    node = head->next;
    while (node != head) {
        assert(node->prev == head || node->fence.value >= node->prev->fence.value);
        node = node->next;
    }
}
#include <stdio.h>
// Maps a handle state value to a short printable name for the SUPERDEBUG
// trace output; unknown values yield "???".
static const char *gdraw_StateName(U32 state)
{
    if (state == GDRAW_HANDLE_STATE_free)       return "free";
    if (state == GDRAW_HANDLE_STATE_live)       return "live";
    if (state == GDRAW_HANDLE_STATE_locked)     return "locked";
    if (state == GDRAW_HANDLE_STATE_dead)       return "dead";
    if (state == GDRAW_HANDLE_STATE_pinned)     return "pinned";
    if (state == GDRAW_HANDLE_STATE_user_owned) return "user-owned";
    if (state == GDRAW_HANDLE_STATE_alloc)      return "alloc";
    if (state == GDRAW_HANDLE_STATE_sentinel)   return "<sentinel>";
    return "???";
}
#else
// Non-SUPERDEBUG builds: list validation is a no-op.
static RADINLINE void check_lists(GDrawHandleCache *c)
{
RR_UNUSED_VARIABLE(c);
}
#endif
// Moves handle 't' out of the list for its current state and links it in
// directly in front of 'succ', which must be a node (or the sentinel) of
// the list for 'new_state'. Updates t->state and validates all lists
// before and after.
static void gdraw_HandleTransitionInsertBefore(GDrawHandle *t, GDrawHandleState new_state, GDrawHandle *succ)
{
    GDrawHandle *old_prev, *old_next, *pred;

    check_lists(t->cache);
    assert(t->state != GDRAW_HANDLE_STATE_sentinel); // sentinels should never get here!
    assert(t->state != (U32) new_state); // code should never call "transition" if it's not transitioning!

    // detach from the current list
    old_prev = t->prev;
    old_next = t->next;
    old_prev->next = old_next;
    old_next->prev = old_prev;

    // splice in ahead of succ; succ->prev is sampled only after the detach
    pred = succ->prev;
    t->next = succ;
    t->prev = pred;
    pred->next = t;
    succ->prev = t;

#ifdef SUPERDEBUG
    printf("GD %chandle %p %s->%s\n", t->cache->is_vertex ? 'v' : 't', t, gdraw_StateName(t->state), gdraw_StateName(new_state));
#endif
    t->state = new_state;
    check_lists(t->cache);
}
// Moves 't' to the tail of the list for 'new_state' (inserting before the
// sentinel of a circular list appends at the end).
static RADINLINE void gdraw_HandleTransitionTo(GDrawHandle *t, GDrawHandleState new_state)
{
    GDrawHandle *list_sentinel = &t->cache->state[new_state];

    gdraw_HandleTransitionInsertBefore(t, new_state, list_sentinel);
}
// Two-pool builds only: these are defined later/elsewhere but are needed by
// gdraw_HandleCacheLockStats below, hence the forward declarations.
#ifdef GDRAW_MANAGE_MEM_TWOPOOL
static rrbool gdraw_MigrateResource(GDrawHandle *t, GDrawStats *stats);
static void gdraw_res_free(GDrawHandle *t, GDrawStats *stats);
#endif
// Tries to lock handle 't' on behalf of 'owner' for the upcoming draw.
// Returns true when the resource is usable, false when the handle no longer
// belongs to 'owner' (it was recycled) or, in two-pool builds, when a
// required post-defragment migration failed (the resource is then freed).
static rrbool gdraw_HandleCacheLockStats(GDrawHandle *t, void *owner, GDrawStats *stats)
{
    RR_UNUSED_VARIABLE(stats);

    // User-owned GPU memory is never freed behind the user's back, so it is
    // always lockable. Iggy does not keep 'owner' consistent for such
    // handles, so this test has to come before the owner comparison.
    if (t->state == GDRAW_HANDLE_STATE_user_owned)
        return true;

    // A different owner means Iggy holds a stale reference: the handle has
    // since been recycled for another resource.
    if (t->owner != owner)
        return false;

    // Valid resource: keep it locked until the matching unlock.
    assert(t->state == GDRAW_HANDLE_STATE_live || t->state == GDRAW_HANDLE_STATE_locked || t->state == GDRAW_HANDLE_STATE_pinned);

    // Already locked or pinned: nothing to change.
    if (t->state != GDRAW_HANDLE_STATE_live)
        return true;

#ifdef GDRAW_MANAGE_MEM_TWOPOOL
    // After a defragment this frame, a live resource must first be migrated
    // to its new location, which can fail if the new pool is out of memory.
    if (t->cache->did_defragment && !gdraw_MigrateResource(t, stats)) {
        gdraw_res_free(t, stats);
        return false;
    }
#endif

    gdraw_HandleTransitionTo(t, GDRAW_HANDLE_STATE_locked);
    return true;
}
// Convenience wrapper around gdraw_HandleCacheLockStats that passes a NULL
// stats pointer.
static rrbool gdraw_HandleCacheLock(GDrawHandle *t, void *owner)
{
return gdraw_HandleCacheLockStats(t, owner, NULL);
}
// Releases a lock taken with gdraw_HandleCacheLock*. Locked handles go back
// to the tail of the live list; pinned and user-owned handles are left
// untouched.
static void gdraw_HandleCacheUnlock(GDrawHandle *t)
{
    assert(t->state == GDRAW_HANDLE_STATE_locked || t->state == GDRAW_HANDLE_STATE_pinned || t->state == GDRAW_HANDLE_STATE_user_owned);

    if (t->state != GDRAW_HANDLE_STATE_locked)
        return;

    gdraw_HandleTransitionTo(t, GDRAW_HANDLE_STATE_live);
}
// Returns every locked handle of cache 'c' to the live list, oldest lock
// first, until the locked list is empty.
static void gdraw_HandleCacheUnlockAll(GDrawHandleCache *c)
{
    GDrawHandle *locked_list = &c->state[GDRAW_HANDLE_STATE_locked];
    GDrawHandle *first;

    // each transition removes 'first' from the list, so always re-read the head
    for (first = locked_list->next; first != locked_list; first = locked_list->next)
        gdraw_HandleTransitionTo(first, GDRAW_HANDLE_STATE_live);
}
// Initializes handle cache 'c' with 'num_handles' handles (which must be
// > 0 and must have storage reserved right after the struct) and a budget
// of 'bytes'. All per-state lists start out empty except the free list,
// which receives every handle in index order.
//
// Every bookkeeping field of every handle is given a defined value here.
// 'owner' and 'fence' (and 'has_check_value' in corruption-check builds)
// used to be left indeterminate until first allocation; they are now
// cleared so a never-allocated handle can be inspected safely. The
// platform 'handle' member is not touched since its type is opaque here.
static void gdraw_HandleCacheInit(GDrawHandleCache *c, S32 num_handles, S32 bytes)
{
    S32 i;
    assert(num_handles > 0);
    c->max_handles = num_handles;
    c->total_bytes = bytes;
    c->bytes_free = c->total_bytes;
    c->is_vertex = false;
    c->is_thrashing = false;
    c->did_defragment = false;

    // set up one empty circular list (sentinel pointing at itself) per state
    for (i=0; i < GDRAW_HANDLE_STATE__count; i++) {
        c->state[i].owner = NULL;
        c->state[i].cache = NULL; // should never follow cache link from sentinels!
        c->state[i].next = c->state[i].prev = &c->state[i];
#ifdef GDRAW_MANAGE_MEM
        c->state[i].raw_ptr = NULL;
#endif
        c->state[i].fence.value = 0;
        c->state[i].bytes = 0;
        c->state[i].state = GDRAW_HANDLE_STATE_sentinel;
    }

    // chain all handles together in index order; the free sentinel is the
    // neighbor of both the first and the last handle
    for (i=0; i < num_handles; ++i) {
        c->handle[i].owner = NULL;
        c->handle[i].cache = c;
        c->handle[i].prev = (i == 0) ? &c->state[GDRAW_HANDLE_STATE_free] : &c->handle[i-1];
        c->handle[i].next = (i == num_handles - 1) ? &c->state[GDRAW_HANDLE_STATE_free] : &c->handle[i+1];
        c->handle[i].fence.value = 0;
        c->handle[i].bytes = 0;
        c->handle[i].state = GDRAW_HANDLE_STATE_free;
#ifdef GDRAW_MANAGE_MEM
        c->handle[i].raw_ptr = NULL;
#ifdef GDRAW_CORRUPTION_CHECK
        c->handle[i].has_check_value = false;
#endif
#endif
    }
    c->state[GDRAW_HANDLE_STATE_free].next = &c->handle[0];
    c->state[GDRAW_HANDLE_STATE_free].prev = &c->handle[num_handles - 1];

    c->prev_frame_start.value = 0;
    c->prev_frame_end.value = 0;
#ifdef GDRAW_MANAGE_MEM
    c->alloc = NULL;
#endif
#ifdef GDRAW_MANAGE_MEM_TWOPOOL
    c->alloc_other = NULL;
#endif
    check_lists(c);
}
// First half of a two-step allocation: takes the handle at the front of
// the free list, moves it to the "alloc" state and clears its bookkeeping.
// Returns NULL when no free handle is available. Follow up with
// gdraw_HandleCacheAllocateEnd on success or gdraw_HandleCacheAllocateFail
// on failure.
static GDrawHandle *gdraw_HandleCacheAllocateBegin(GDrawHandleCache *c)
{
    GDrawHandle *free_list = &c->state[GDRAW_HANDLE_STATE_free];
    GDrawHandle *t = free_list->next;

    if (t == free_list)
        return NULL; // free list is empty

    gdraw_HandleTransitionTo(t, GDRAW_HANDLE_STATE_alloc);
    t->bytes = 0;
    t->owner = 0;
#ifdef GDRAW_MANAGE_MEM
    t->raw_ptr = NULL;
#endif
#ifdef GDRAW_CORRUPTION_CHECK
    t->has_check_value = false;
#endif
    return t;
}
// Second half of a successful allocation: records size and owner on a
// handle obtained from gdraw_HandleCacheAllocateBegin, charges 'bytes'
// against the cache budget and moves the handle to 'new_state'.
//
// bytes == 0 is only valid for user-owned resources; any other size must
// end up locked or pinned.
//
// 'bytes' is stored in a 28-bit bitfield while the full value is deducted
// from bytes_free, so a negative or too-large value would silently
// desynchronize the accounting (gdraw_HandleCacheFree gives back only the
// truncated amount). The range assert catches that in checked builds.
static void gdraw_HandleCacheAllocateEnd(GDrawHandle *t, S32 bytes, void *owner, GDrawHandleState new_state)
{
    assert(t->cache);
    assert(t->bytes == 0);
    assert(t->owner == 0);
    assert(t->state == GDRAW_HANDLE_STATE_alloc);
    assert(bytes >= 0 && bytes < (1 << 28)); // must fit GDrawHandle::bytes
    if (bytes == 0)
        assert(new_state == GDRAW_HANDLE_STATE_user_owned);
    else
        assert(new_state == GDRAW_HANDLE_STATE_locked || new_state == GDRAW_HANDLE_STATE_pinned);
    t->bytes = bytes;
    t->owner = owner;
    t->cache->bytes_free -= bytes;
    gdraw_HandleTransitionTo(t, new_state);
}
// Returns handle 't' to the free list: its byte cost is credited back to
// the cache budget and its bookkeeping fields are cleared. Must not be
// called on handles that are still in the "alloc" state or on sentinels.
static void gdraw_HandleCacheFree(GDrawHandle *t)
{
    GDrawHandleCache *cache = t->cache;

    assert(t->state != GDRAW_HANDLE_STATE_alloc && t->state != GDRAW_HANDLE_STATE_sentinel);

    // refund the budget before the size is wiped
    cache->bytes_free += t->bytes;
    t->bytes = 0;

    t->owner = NULL;
#ifdef GDRAW_MANAGE_MEM
    t->raw_ptr = NULL;
#endif
#ifdef GDRAW_CORRUPTION_CHECK
    t->has_check_value = false;
#endif

    gdraw_HandleTransitionTo(t, GDRAW_HANDLE_STATE_free);
}
// Aborts an allocation started with gdraw_HandleCacheAllocateBegin: the
// handle (which must still be in the "alloc" state) goes straight back to
// the free list. No budget adjustment is needed because nothing was
// charged yet.
static void gdraw_HandleCacheAllocateFail(GDrawHandle *t)
{
assert(t->state == GDRAW_HANDLE_STATE_alloc);
gdraw_HandleTransitionTo(t, GDRAW_HANDLE_STATE_free);
}
// Returns the least-recently-used live handle of cache 'c', or NULL when
// the live list is empty.
static GDrawHandle *gdraw_HandleCacheGetLRU(GDrawHandleCache *c)
{
    // gdraw_HandleTransitionTo appends at the tail, and every unlock
    // ("locked" -> "live") re-appends the handle, so the head of the live
    // list is always the entry that has gone unused the longest.
    GDrawHandle *live_list = &c->state[GDRAW_HANDLE_STATE_live];
    GDrawHandle *oldest = live_list->next;

    if (oldest == live_list)
        return NULL;
    return oldest;
}
// Per-frame bookkeeping: clears the per-frame flags and shifts the frame
// fence window so that the previous "end" becomes the new "start" and
// 'now' becomes the new "end".
static void gdraw_HandleCacheTick(GDrawHandleCache *c, GDrawFence now)
{
    // per-frame flags start out cleared
    c->did_defragment = false;
    c->is_thrashing = false;

    // slide the [start, end] fence window forward by one frame
    c->prev_frame_start = c->prev_frame_end;
    c->prev_frame_end = now;
}
#ifdef GDRAW_MANAGE_MEM
// Moves a live, locked or pinned handle onto the dead list, keeping that
// list sorted by non-decreasing fence value. Among equal fence values the
// new handle is placed after the existing ones.
static void gdraw_HandleCacheInsertDead(GDrawHandle *t)
{
    GDrawHandle *dead_list, *succ;

    assert(t->state == GDRAW_HANDLE_STATE_live || t->state == GDRAW_HANDLE_STATE_locked || t->state == GDRAW_HANDLE_STATE_pinned);

    // find the first dead entry with a strictly newer fence; t goes right
    // in front of it (or at the tail if there is none)
    dead_list = &t->cache->state[GDRAW_HANDLE_STATE_dead];
    for (succ = dead_list->next; succ != dead_list; succ = succ->next) {
        if (succ->fence.value > t->fence.value)
            break;
    }

    gdraw_HandleTransitionInsertBefore(t, GDRAW_HANDLE_STATE_dead, succ);
}
#endif
////////////////////////////////////////////////////////////////////////
//
// Set transformation matrices
//
// Our vertex shaders use this convention:
// world: our world matrices always look like this
// m00 m01 0 t0
// m10 m11 0 t1
// 0 0 0 d
// 0 0 0 1
//
// we just store the first two rows and insert d
// in the first row, third column. our input position vectors are
// always (x,y,0,1) or (x,y,0,0), so we can still just use dp4 to
// compute final x/y. after that it's a single move to set the
// correct depth value.
//
// viewproj: our view-projection matrix is always just a 2D scale+translate,
// i.e. the matrix looks like this:
//
// p[0] 0 0 p[2]
// 0 p[1] 0 p[3]
// 0 0 1 0
// 0 0 0 1
//
// just store (p[0],p[1],p[2],p[3]) in a 4-component vector and the projection
// transform is a single multiply-add.
//
// The output is volatile since it's often in Write-Combined memory where we
// really don't want compiler reordering.
// Writes the two stored world-matrix rows for a 1:1 pixel mapping, i.e. an
// identity transform (our "view space" already is pixels). The eight
// floats are stored in ascending index order.
static RADINLINE void gdraw_PixelSpace(volatile F32 * RADRESTRICT vvec)
{
    // row 0
    vvec[0] = 1.0f;
    vvec[1] = 0.0f;
    vvec[2] = 0.0f;
    vvec[3] = 0.0f;
    // row 1
    vvec[4] = 0.0f;
    vvec[5] = 1.0f;
    vvec[6] = 0.0f;
    vvec[7] = 0.0f;
}
// Writes the two stored world-matrix rows for a pure scale: x is scaled by
// world_to_pixel[0] and y by world_to_pixel[1]. 'depth' goes into the third
// column of row 0 and 'misc' into the third column of row 1. The eight
// floats are stored in ascending index order.
static RADINLINE void gdraw_WorldSpace(volatile F32 * RADRESTRICT vvec, F32 * RADRESTRICT world_to_pixel, F32 depth, F32 misc)
{
    // row 0
    vvec[0] = world_to_pixel[0];
    vvec[1] = 0.0f;
    vvec[2] = depth;
    vvec[3] = 0.0f;
    // row 1
    vvec[4] = 0.0f;
    vvec[5] = world_to_pixel[1];
    vvec[6] = misc;
    vvec[7] = 0.0f;
}
// Writes the two stored world-matrix rows for a full 2D homogeneous
// transform taken from 'xform'. 'depth' goes into the third column of
// row 0 and 'misc' into the third column of row 1. All inputs are read
// into locals first; the eight floats are then stored in ascending index
// order.
static RADINLINE void gdraw_ObjectSpace(volatile F32 * RADRESTRICT vvec, gswf_matrix * RADRESTRICT xform, F32 depth, F32 misc)
{
    F32 a  = xform->m00;
    F32 b  = xform->m01;
    F32 c  = xform->m10;
    F32 d  = xform->m11;
    F32 tx = xform->trans[0];
    F32 ty = xform->trans[1];

    // row 0
    vvec[0] = a;
    vvec[1] = b;
    vvec[2] = depth;
    vvec[3] = tx;
    // row 1
    vvec[4] = c;
    vvec[5] = d;
    vvec[6] = misc;
    vvec[7] = ty;
}
// Builds the full 4x4 object->clip matrix in 'mat' (16 floats) by combining
// the 2D transform 'xform' with the scale+translate projection 'proj'
// (proj[0], proj[1] = x/y scale, proj[2], proj[3] = x/y offset). 'depth' is
// placed in row 2, column 3. A non-zero 'out_col_major' stores the matrix
// column-major, zero stores it row-major.
static void gdraw_GetObjectSpaceMatrix(F32 * RADRESTRICT mat, gswf_matrix * RADRESTRICT xform, F32 * RADRESTRICT proj, F32 depth, int out_col_major)
{
    int row_stride = out_col_major ? 1 : 4;
    int col_stride = out_col_major ? 4 : 1;
    F32 xs = proj[0];
    F32 ys = proj[1];
    F32 *r0 = mat;
    F32 *r1 = mat + 1*row_stride;
    F32 *r2 = mat + 2*row_stride;
    F32 *r3 = mat + 3*row_stride;

    r0[0*col_stride] = xform->m00 * xs;
    r0[1*col_stride] = xform->m01 * xs;
    r0[2*col_stride] = 0.0f;
    r0[3*col_stride] = xform->trans[0] * xs + proj[2];

    r1[0*col_stride] = xform->m10 * ys;
    r1[1*col_stride] = xform->m11 * ys;
    r1[2*col_stride] = 0.0f;
    r1[3*col_stride] = xform->trans[1] * ys + proj[3];

    r2[0*col_stride] = 0.0f;
    r2[1*col_stride] = 0.0f;
    r2[2*col_stride] = 0.0f;
    r2[3*col_stride] = depth;

    r3[0*col_stride] = 0.0f;
    r3[1*col_stride] = 0.0f;
    r3[2*col_stride] = 0.0f;
    r3[3*col_stride] = 1.0f;
}
////////////////////////////////////////////////////////////////////////
//
// Blurs
//
// symmetrically expand a rectangle by ex/ey pixels on both sides, then clamp to tile bounds
// Grows 'in' by ex pixels to the left and right and by ey pixels to the
// top and bottom, then clamps the result to [0,w] x [0,h] and stores it in
// 'out'. 'out' may be the same rectangle as 'in'.
static void gdraw_ExpandRect(gswf_recti *out, gswf_recti const *in, S32 ex, S32 ey, S32 w, S32 h)
{
    S32 left   = in->x0 - ex;
    S32 top    = in->y0 - ey;
    S32 right  = in->x1 + ex;
    S32 bottom = in->y1 + ey;

    out->x0 = RR_MAX(left, 0);
    out->y0 = RR_MAX(top, 0);
    out->x1 = RR_MIN(right, w);
    out->y1 = RR_MIN(bottom, h);
}
// Translates 'in' by (dx, dy) and stores the result in 'out'. 'out' may be
// the same rectangle as 'in'.
static void gdraw_ShiftRect(gswf_recti *out, gswf_recti const *in, S32 dx, S32 dy)
{
    // horizontal extent
    out->x0 = in->x0 + dx;
    out->x1 = in->x1 + dx;
    // vertical extent
    out->y0 = in->y0 + dy;
    out->y1 = in->y1 + dy;
}
#define MAX_TAPS 9 // max # of bilinear samples in one 'convolution' step
// Indices of shader variables, grouped by shader family. Numbering restarts
// at 0 for each family, so values are only meaningful together with the
// family they belong to. MAX_VARS follows the last entry of the filter
// family, which is the largest one (8 variables).
enum
{
// basic shader family
VAR_tex0 = 0,
VAR_tex1,
VAR_cmul,
VAR_cadd,
VAR_focal,
// filter family
VAR_filter_tex0 = 0,
VAR_filter_tex1,
VAR_filter_color,
VAR_filter_tc_off,
VAR_filter_tex2,
VAR_filter_clamp0,
VAR_filter_clamp1,
VAR_filter_color2,
MAX_VARS,
// blur family
VAR_blur_tex0 = 0,
VAR_blur_tap,
VAR_blur_clampv,
// color matrix family
VAR_colormatrix_tex0 = 0,
VAR_colormatrix_data,
// ihud family
VAR_ihudv_worldview = 0,
VAR_ihudv_material,
VAR_ihudv_textmode,
};
// Parameters the shared blur code needs from a platform implementation.
// w,h are the bounds rectangles get clamped to (see gdraw_ExpandRect
// calls); frametex_width/height are the texture dimensions used to convert
// pixel coordinates into texture coordinates. BlurPass is the platform
// callback that actually draws one pass with 'taps' samples; 'data' holds
// four floats per tap (x offset, y offset, weight, unused/0).
typedef struct
{
S32 w,h, frametex_width, frametex_height;
void (*BlurPass)(GDrawRenderState *r, int taps, float *data, gswf_recti *s, float *tc, float height_max, float *clampv, GDrawStats *gstats);
} GDrawBlurInfo;
// Renders one full-resolution blur pass over 'draw_bounds' into a new
// texture draw buffer and returns the resulting texture. 'data' holds
// 'taps' entries of four floats each; the weights (third float of each
// entry) must sum to roughly 1. Sampling is clamped to the texel centers
// inside 'sample_bounds'. If the draw buffer cannot be started, the
// current source texture r->tex[0] is returned unchanged.
static GDrawTexture *gdraw_BlurPass(GDrawFunctions *g, GDrawBlurInfo *c, GDrawRenderState *r, int taps, float *data, gswf_recti *draw_bounds, gswf_recti *sample_bounds, GDrawStats *gstats)
{
    F32 texel_scale_s = 1.0f / c->frametex_width;
    F32 texel_scale_t = 1.0f / c->frametex_height;
    F32 tc[4];
    F32 clamp[4];
    F32 t = 0;
    S32 tap = 0;

    // sanity check: tap weights should be normalized
    while (tap < taps) {
        t += data[4*tap+2];
        ++tap;
    }
    assert(t >= 0.99f && t <= 1.01f);

    // texture coordinates of the corners of the drawn rectangle
    tc[0] = texel_scale_s * draw_bounds->x0;
    tc[1] = texel_scale_t * draw_bounds->y0;
    tc[2] = texel_scale_s * draw_bounds->x1;
    tc[3] = texel_scale_t * draw_bounds->y1;

    // sample_bounds is (x0,y0) inclusive, (x1,y1) exclusive; texel centers
    // sit at +0.5, and nothing outside sample_bounds may be sampled
    clamp[0] = texel_scale_s * (sample_bounds->x0 + 0.5f);
    clamp[1] = texel_scale_t * (sample_bounds->y0 + 0.5f);
    clamp[2] = texel_scale_s * (sample_bounds->x1 - 0.5f);
    clamp[3] = texel_scale_t * (sample_bounds->y1 - 0.5f);

    if (!g->TextureDrawBufferBegin(draw_bounds, GDRAW_TEXTURE_FORMAT_rgba32, GDRAW_TEXTUREDRAWBUFFER_FLAGS_needs_color | GDRAW_TEXTUREDRAWBUFFER_FLAGS_needs_alpha, 0, gstats)) {
        return r->tex[0];
    }

    c->BlurPass(r, taps, data, draw_bounds, tc, (F32) c->h / c->frametex_height, clamp, gstats);
    return g->TextureDrawBufferEnd(gstats);
}
// Renders one blur pass whose output is shrunk by the integer factor
// 'divisor' along 'axis' (0 = x, anything else = y) and returns the
// resulting texture. tex_w/tex_h are the dimensions used to turn pixel
// positions into texture coordinates for this pass. The tap weights in
// 'data' must sum to roughly 1. If the draw buffer cannot be started, the
// current source texture r->tex[0] is returned unchanged.
static GDrawTexture *gdraw_BlurPassDownsample(GDrawFunctions *g, GDrawBlurInfo *c, GDrawRenderState *r, int taps, float *data, gswf_recti *draw_bounds, int axis, int divisor, int tex_w, int tex_h, gswf_recti *sample_bounds, GDrawStats *gstats)
{
    F32 texel_scale_s = 1.0f / tex_w;
    F32 texel_scale_t = 1.0f / tex_h;
    F32 tc[4];
    F32 clamp[4];
    F32 t = 0;
    gswf_recti z;
    S32 tap = 0;

    // sanity check: tap weights should be normalized
    while (tap < taps) {
        t += data[4*tap+2];
        ++tap;
    }
    assert(t >= 0.99f && t <= 1.01f);

    // Destination rectangle 'z': the blurred axis is divided by 'divisor'
    // (these must be integer divides!), the other axis is kept as is.
    if (axis != 0) {
        z.x0 = draw_bounds->x0;
        z.x1 = draw_bounds->x1;
        z.y0 = draw_bounds->y0 / divisor;
        z.y1 = (draw_bounds->y1-1) / divisor + 1;
        tc[0] = z.x0*texel_scale_s;
        tc[2] = z.x1*texel_scale_s;
        tc[1] = ((z.y0 - 0.5f)*divisor+0.5f)*texel_scale_t;
        tc[3] = ((z.y1 - 0.5f)*divisor+0.5f)*texel_scale_t;
    } else {
        z.x0 = draw_bounds->x0 / divisor;
        z.x1 = (draw_bounds->x1-1) / divisor + 1;
        z.y0 = draw_bounds->y0;
        z.y1 = draw_bounds->y1;
        tc[0] = ((z.x0 - 0.5f)*divisor+0.5f)*texel_scale_s;
        tc[2] = ((z.x1 - 0.5f)*divisor+0.5f)*texel_scale_s;
        tc[1] = z.y0*texel_scale_t;
        tc[3] = z.y1*texel_scale_t;
    }

    if (!g->TextureDrawBufferBegin(&z, GDRAW_TEXTURE_FORMAT_rgba32, GDRAW_TEXTUREDRAWBUFFER_FLAGS_needs_color | GDRAW_TEXTUREDRAWBUFFER_FLAGS_needs_alpha, 0, gstats)) {
        return r->tex[0];
    }

    // clamp sampling to the texel centers inside sample_bounds
    clamp[0] = texel_scale_s * (sample_bounds->x0 + 0.5f);
    clamp[1] = texel_scale_t * (sample_bounds->y0 + 0.5f);
    clamp[2] = texel_scale_s * (sample_bounds->x1 - 0.5f);
    clamp[3] = texel_scale_t * (sample_bounds->y1 - 0.5f);
    assert(clamp[0] <= clamp[2]);
    assert(clamp[1] <= clamp[3]);

    c->BlurPass(r, taps, data, &z, tc, (F32) c->h / c->frametex_height, clamp, gstats);
    return g->TextureDrawBufferEnd(gstats);
}
// unmap: where t lies within [a,b], as a fraction (0 at a, 1 at b); the
// division is done in F32. linear_remap: maps t from [a,b] to [c,d]
// linearly. Both macros evaluate some arguments more than once, so only
// pass side-effect-free expressions.
#define unmap(t,a,b) (((t)-(a))/(F32) ((b)-(a)))
#define linear_remap(t,a,b,c,d) ((c) + unmap(t,a,b)*((d)-(c)))
// Applies a box blur of width 'blur_width' (in pixels) along one axis
// (axis 0 = x, axis 1 = y) to r->tex[0], replacing r->tex[0] with the
// blurred result.
//
// texel          - size of one texel along 'axis' in texture coordinates
// draw_bounds    - in/out: region to draw; expanded here by the blur reach
// sample_bounds  - in/out: region that may be sampled; updated for the next pass
// protect        - texture that must never be freed here (the caller's original input)
//
// Small blurs (<= MAX_TAPS bilinear taps) are done in a single pass; larger
// ones use a downsampling pass followed by a regular pass over the
// downsampled result. Intermediate textures that are neither 'protect' nor
// the new result are freed via g->FreeTexture.
static void gdraw_BlurAxis(S32 axis, GDrawFunctions *g, GDrawBlurInfo *c, GDrawRenderState *r, F32 blur_width, F32 texel, gswf_recti *draw_bounds, gswf_recti *sample_bounds, GDrawTexture *protect, GDrawStats *gstats)
{
GDrawTexture *t;
F32 data[MAX_TAPS][4];
S32 off_axis = 1-axis;
S32 w = ((S32) ceil((blur_width-1)/2))*2+1; // 1.2 => 3, 2.8 => 3, 3.2 => 5
F32 edge_weight = 1 - (w - blur_width)/2; // 3 => 0 => 1; 1.2 => 1.8 => 0.9 => 0.1
F32 inverse_weight = 1.0f / blur_width;
w = ((w-1) >> 1) + 1; // 3 => 2, 5 => 3, 7 => 4 (number of texture samples)
// nothing to blur without a source texture
if (!r->tex[0])
return;
// filter along 'axis' (w == 1 means a single tap, i.e. no blur needed)
if (w > 1) {
if (w <= MAX_TAPS) {
// we have enough taps to just do it
// use 'w' taps
S32 i, expand;
// just go through and place all the taps in the right place
// if w is 2 (sample from -1,0,1)
// 0 => -0.5
// 1 => 1
// if w is 3:
// 0 => -1.5 samples from -2,-1
// 1 => 0.5 samples from 0,1
// 2 => 2 samples from 2
// if w is 4:
// 0 => -2.5 samples from -3,-2
// 1 => -0.5 samples from -1,0
// 2 => 1.5 samples from 1,2
// 3 => 3 samples from 3
for (i=0; i < w; ++i) {
// first texsample samples from -w+1 and -w+2, e.g. w=2 => -1,0,1
data[i][axis] = (-w+1.5f + i*2)*texel;
data[i][off_axis] = 0;
data[i][2] = 2*inverse_weight; // 2 full-weight samples
data[i][3] = 0;
}
// now reweight the last one
data[i-1][axis] = (w-1)*texel;
data[i-1][2] = edge_weight*inverse_weight;
// now reweight the first one
// (ew*0 + 1*1)/(1+ew) = 1/(1+ew)
data[0][axis] = (-w + 1.0f + 1/(edge_weight+1)) * texel;
data[0][2] = (edge_weight+1)*inverse_weight;
expand = w-1;
gdraw_ExpandRect(draw_bounds, draw_bounds, axis ? 0 : expand, axis ? expand : 0, c->w, c->h);
t = gdraw_BlurPass(g, c, r, w, data[0], draw_bounds, sample_bounds, gstats);
// release the previous intermediate, unless it is the caller's input or was handed back as the result
if (r->tex[0] != protect && r->tex[0] != t)
g->FreeTexture(r->tex[0], 0, gstats);
r->tex[0] = t;
gdraw_ExpandRect(sample_bounds, draw_bounds, 1, 1, c->w, c->h); // for next pass
} else {
// @OPTIMIZE: for symmetrical blurs we can get a 2-wide blur in the *off* axis at the same
// time we get N-wide in the on axis, which could double our max width
S32 i, expand;
// @HACK: this is really a dumb way to do it, i kind of had a brain fart, you could get
// the exact same result by just doing the downsample the naive way and then the
// final sample uses texture samples spaced by a texel rather than spaced by two
// texels -- the current method is just as inefficient, it just puts the inefficiency
// in the way the downsampled texture is self-overlapping, so the downsampled texture
// is twice as large as it should be.
// we COULD be exact by generating a mipmap, then sampling some number of samples
// from the mipmap and some from the original, but that would require being polyphase.
// instead we just are approximate. the mipmap weights the edge pixels by one half
// and overlaps them by one sample, so then in phase two we sample N slightly-overlapping
// mipmap samples
//
// instead we do the following.
// divide the source data up into clusters that are K samples long.
// ...K0... ...K1... ...K2... ...K3...
//
// Suppose K[i] is the average of all the items in cluster i.
//
// We compute a downsampled texture where T[i] = K[i] + K[i+1].
//
// Now, we sample N taps from adjacent elements of T, allowing the texture unit
// to bilerp. Suppose a given sample falls at coordinate i with sub-position p.
// Then tap #j will compute:
// T[i+j]*(1-p) + T[i+j+1]*p
// But tap #j+1 will compute:
// T[i+j+1]*(1-p) + T[i+j+2]*p
// so we end up computing:
// sum(T[i+j]) except for the end samples.
//
// So, how do we create these initial clusters? That's easy, we use K taps
// to sample 2K texels.
//
// What value of k do we use? Well, we're constrained to using MAX_TAPS
// on each pass. So at the high end, we're bounded by:
// K = MAX_TAPS
// S = MAX_TAPS (S is number of samples in second pass)
// S addresses S*2-1 texels of T, and each texel adds K more samples,
// so (ignoring the edges) we basically have w = K*S
// if w == MAX_TAPS*MAX_TAPS, then k = MAX_TAPS
// if w == MAX_TAPS+1, then k = 2
//
// suppose we have 3 taps, then we can sample 5 samples in one pass, so then our
// max coverage is 25 samples, or a filter width of 13. with 7 taps, we sample
// 13 samples in one pass, max coverage is 13*13 samples or (13*13-1)/2 width,
// which is ((2T-1)*(2T-1)-1)/2 or (4T^2 - 4T + 1 -1)/2 or 2T^2 - 2T or 2T*(T-1)
S32 w_mip = (S32) ceil(linear_remap(w, MAX_TAPS+1, MAX_TAPS*MAX_TAPS, 2, MAX_TAPS));
S32 downsample = w_mip;
F32 sample_spacing = texel;
if (downsample < 2) downsample = 2;
if (w_mip > MAX_TAPS) {
// if w_mip > MAX_TAPS, then we ought to use more than one mipmap pass, but
// since that's a huge filter ( > 80 pixels) let's just try subsampling and
// see if it's good enough.
// NOTE(review): w_mip / MAX_TAPS is an integer division (both operands are
// integers), so the spacing only grows in whole-number steps - confirm
// that the truncation is intended.
sample_spacing *= w_mip / MAX_TAPS;
w_mip = MAX_TAPS;
} else {
assert(w / downsample <= MAX_TAPS);
}
// first pass: w_mip evenly weighted taps, each covering two texels
inverse_weight = 1.0f / (2*w_mip);
for (i=0; i < w_mip; ++i) {
data[i][axis] = (-w_mip+1 + i*2+0.5f)*sample_spacing;
data[i][off_axis] = 0;
data[i][2] = 2*inverse_weight;
data[i][3] = 0;
}
// from here on w counts samples of the downsampled texture for the second pass
w = w*2 / w_mip;
// @TODO: compute the correct bboxes for this size
// the downsampled texture samples from -w_mip+1 to w_mip
// the sample from within that samples w spots within that,
// or w/2 of those, but they're overlapping by 50%.
// so if a sample is a point i, it samples from the original
// from -w_mip+1 to w_mip + i*w_mip.
// So then the minimum is: -w_mip+1 + (w/2)*w_mip, and
// the maximum is w_mip + (w/2)*w_mip
expand = (((w+1)>>1)+1)*w_mip+1;
gdraw_ExpandRect(draw_bounds, draw_bounds, axis ? 0 : expand, axis ? expand : 0, c->w, c->h);
t = gdraw_BlurPassDownsample(g, c, r, w_mip, data[0], draw_bounds, axis, downsample, c->frametex_width, c->frametex_height, sample_bounds, gstats);
if (r->tex[0] != protect && r->tex[0] != t)
g->FreeTexture(r->tex[0], 0, gstats);
r->tex[0] = t;
gdraw_ExpandRect(sample_bounds, draw_bounds, 1, 1, c->w, c->h);
// the downsample pass may have produced no texture; give up in that case
if (!r->tex[0])
return;
// now do a regular blur pass sampling from that
// the raw texture now contains 'downsample' samples per texel
if (w > 2*MAX_TAPS) {
// too many samples for one pass: spread 2*MAX_TAPS samples over the same span
sample_spacing = texel * (w-1) / (2*MAX_TAPS-1);
w = 2*MAX_TAPS;
} else {
sample_spacing = texel;
}
//sample_spacing *= 1.0f/2;
assert(w >= 2 && w <= 2*MAX_TAPS);
if (w & 1) {
// we just want to evenly weight even-spaced samples
inverse_weight = 1.0f / w;
// just go through and place all the taps in the right place
w = (w+1)>>1;
for (i=0; i < w; ++i) {
data[i][axis] = (-w+1.0f + 0.5f + i*2)*sample_spacing;
data[i][off_axis] = 0;
data[i][2] = 2*inverse_weight; // 2 full-weight samples
data[i][3] = 0;
}
// fix up the last tap
// the following test is always true, but we're testing it here
// explicitly so as to make VS2012's static analyzer not complain
if (i > 0) {
data[i-1][axis] = (-w+1.0f+(i-1)*2)*sample_spacing;
data[i-1][2] = inverse_weight;
}
} else {
// we just want to evenly weight even-spaced samples
inverse_weight = 1.0f / w;
// just go through and place all the taps in the right place
w >>= 1;
for (i=0; i < w; ++i) {
data[i][axis] = (-w+1.0f + i*2)*sample_spacing;
data[i][off_axis] = 0;
data[i][2] = 2*inverse_weight; // 2 full-weight samples
data[i][3] = 0;
}
}
// second pass: divisor 1 (no further shrinking); the texture size along
// 'axis' is scaled by 'downsample' for the texture-coordinate math
t = gdraw_BlurPassDownsample(g, c, r, w, data[0], draw_bounds, axis, 1,
axis==0 ? c->frametex_width*downsample : c->frametex_width,
axis==1 ? c->frametex_height*downsample : c->frametex_height, sample_bounds, gstats);
if (r->tex[0] != protect && r->tex[0] != t)
g->FreeTexture(r->tex[0], 0, gstats);
r->tex[0] = t;
gdraw_ExpandRect(sample_bounds, draw_bounds, 1, 1, c->w, c->h);
}
}
}
// Applies the blur described by 'r' as a separable filter: every pass runs
// gdraw_BlurAxis once along X (axis 0, width r->blur_x) and once along Y
// (axis 1, width r->blur_y), for r->blur_passes passes in total.
//
//   g             - GDraw function table, forwarded to gdraw_BlurAxis
//   c             - blur info; supplies the frame texture size and the w/h clamp
//   r             - render state; r->tex[0] is the input texture on entry
//   draw_bounds   - forwarded to gdraw_BlurAxis unchanged
//   sample_bounds - input sample rectangle; not modified (a padded copy is used)
//   gstats        - statistics block, forwarded to gdraw_BlurAxis
static void gdraw_Blur(GDrawFunctions *g, GDrawBlurInfo *c, GDrawRenderState *r, gswf_recti *draw_bounds, gswf_recti *sample_bounds, GDrawStats *gstats)
{
   // the texture we were handed is passed to every axis pass as 'protect';
   // the axis pass compares against it before freeing an intermediate texture
   GDrawTexture *protect = r->tex[0];
   gswf_recti padded_bounds;
   S32 pass;

   // size of a single texel step along each axis
   F32 texel_x = 1.0f / c->frametex_width;
   F32 texel_y = 1.0f / c->frametex_height;

   // relationship between blur size and tap count:
   //   blur = 1   => 1 tap
   //   blur = 1.2 => 3 taps (0.1, 1, 0.1)
   //   blur = 2.2 => 3 taps (0.6, 1, 0.6)
   //   blur = 2.8 => 3 taps (0.9, 1, 0.9)
   //   blur = 3   => 3 taps (1  , 1, 1  )
   //   blur = 3.2 => 5 taps (0.1, 1, 1, 1, 0.1)

   // gdraw puts 1 border pixel around everything when producing rendertargets
   // and we rely on it, so grow the input sample bounds by that pixel
   gdraw_ExpandRect(&padded_bounds, sample_bounds, 1, 1, c->w, c->h);

   // @OPTIMIZE: small kernels (w*h <= MAX_TAPS) could be filtered in a single
   // pass; for now every pass is done separably.
   pass = 0;
   while (pass < r->blur_passes) {
      gdraw_BlurAxis(0, g, c, r, r->blur_x, texel_x, draw_bounds, &padded_bounds, protect, gstats);
      gdraw_BlurAxis(1, g, c, r, r->blur_y, texel_y, draw_bounds, &padded_bounds, protect, gstats);
      ++pass;
   }
}
#ifdef GDRAW_MANAGE_MEM
static void make_pool_aligned(void **start, S32 *num_bytes, U32 alignment)
{
UINTa addr_orig = (UINTa) *start;
UINTa addr_aligned = (addr_orig + alignment-1) & ~((UINTa) alignment - 1);
if (addr_aligned != addr_orig) {
S32 diff = (S32) (addr_aligned - addr_orig);
if (*num_bytes < diff) {
*start = NULL;
*num_bytes = 0;
return;
} else {
*start = (void *)addr_aligned;
*num_bytes -= diff;
}
}
}
// Very simple arena allocator
// Bump-pointer arena over a caller-provided memory range [begin, end).
// Allocations advance 'current'; there is no per-allocation free, only a
// reset back to 'begin'.
typedef struct
{
U8 *begin;   // first byte of the managed range
U8 *current; // next byte that has not been handed out yet
U8 *end;     // one past the last byte of the managed range
} GDrawArena;
// Points 'arena' at the 'size' bytes starting at 'start' and marks all of
// them as available.
static void gdraw_arena_init(GDrawArena *arena, void *start, U32 size)
{
   U8 *base = (U8 *) start;

   arena->begin   = base;
   arena->current = base;
   arena->end     = base + size;
}
// Discards every allocation made from 'arena' by rewinding the bump pointer
// to the start of the managed range. The memory itself is not touched.
static GDRAW_MAYBE_UNUSED void gdraw_arena_reset(GDrawArena *arena)
{
   U8 *first_byte = arena->begin;

   arena->current = first_byte;
}
// Carves 'size' bytes aligned to 'align' out of the arena.
//   arena - arena to allocate from
//   size  - number of bytes requested
//   align - required alignment; the mask arithmetic assumes a power of two
// Returns a pointer to the aligned storage, or NULL if the request (including
// the padding needed to reach the alignment) does not fit in what is left.
// On failure the arena is left unchanged.
static void *gdraw_arena_alloc(GDrawArena *arena, U32 size, U32 align)
{
   UINTa cur_addr = (UINTa) arena->current;
   UINTa remaining = (UINTa) (arena->end - arena->current);
   // number of bytes we need to skip to reach the next aligned address
   UINTa padding = ((cur_addr + align-1) & ~((UINTa) align - 1)) - cur_addr;
   U8 *ptr;

   // Check padding and size separately: summing them first could wrap around
   // when UINTa is only 32 bits wide, making an oversized request look tiny.
   if (padding > remaining || size > remaining - padding) // doesn't fit
      return NULL;

   ptr = arena->current + padding;
   arena->current = ptr + size;
   return ptr;
}
// Allocator for graphics memory.
// Graphics memory is assumed to be write-combined and slow to read for the
// CPU, so we keep all heap management information separately in main memory.
//
// There's a constant management overhead of about 1k (2k for 64bit) to create a heap,
// plus a per-block overhead. The maximum number of blocks the allocator can
// ever use is bounded by 2*max_allocs+1; since GDraw manages a limited
// amount of handles, max_allocs is a known value at heap creation time.
//
// The allocator uses a best-fit heuristic to minimize fragmentation.
// Currently, there are no size classes or other auxiliary data structures to
// speed up this process, since the number of free blocks at any point in time
// is assumed to be fairly low.
//
// The allocator maintains a number of invariants:
// - The free list and physical block list are proper double-linked lists.
// (i.e. block->next->prev == block->prev->next == block)
// - All allocated blocks are also kept in a hash table, indexed by their
// pointer (to allow free to locate the corresponding block_info quickly).
// There's a single-linked, NULL-terminated list of elements in each hash
// bucket.
// - The physical block list is ordered. It always contains all currently
// active blocks and spans the whole managed memory range. There are no
// gaps between blocks, and all blocks have nonzero size.
// - There are no two adjacent free blocks; if two such blocks would be created,
// they are coalesced immediately.
// - The maximum number of blocks that could ever be necessary is allocated
// on initialization. All block_infos not currently in use are kept in a
// single-linked, NULL-terminated list of unused blocks. Every block is either
// in the physical block list or the unused list, and the total number of
// blocks is constant.
// These invariants always hold before and after an allocation/free.
// Assertion hook for the allocator's internal consistency checks.
// Compiles to nothing unless GFXALLOC_ASSERT was already defined before this
// point, so the argument must not have side effects the code depends on.
#ifndef GFXALLOC_ASSERT
#define GFXALLOC_ASSERT(x)
#endif
// Bookkeeping record for one contiguous range of managed graphics memory.
// A record is in exactly one of three roles: a free block (is_free set, linked
// into the free list), an allocated block (linked into a hash bucket), or an
// unused record (is_unused set, linked into the allocator's unused list).
typedef struct gfx_block_info
{
// start address of the range this record describes
U8 *ptr;
// Role-dependent links. The defragmenter additionally borrows 'prev' of
// allocated blocks as a temporary back-pointer to the owning handle.
gfx_block_info *prev, *next; // for free blocks this is the free list, for allocated blocks it's a (single-linked!) list of elements in the corresponding hash bucket
// neighbours in the address-ordered physical block list
gfx_block_info *prev_phys, *next_phys;
U32 is_free : 1;   // 1 while the block is on the free list
U32 is_unused : 1; // 1 while the record sits on the unused list
U32 size : 30;     // size of the range in bytes (30 bits, so strictly below 1 GiB)
} gfx_block_info;
// 24 bytes/block on 32bit, 48 bytes/block on 64bit.
// Number of buckets in the allocated-block hash table. The hash function masks
// with (GFXALLOC_HASH_SIZE - 1), so this has to stay a power of two.
#define GFXALLOC_HASH_SIZE 256
// State of one graphics-memory heap. All of this lives in main memory; the
// managed range itself is only ever referenced by address.
typedef struct gfx_allocator
{
U8 *mem_base;          // first byte of the managed range
U8 *mem_end;           // one past the last byte of the managed range
U32 max_allocs;        // maximum number of simultaneously live allocations, fixed at creation
U32 block_align;       // allocation granularity in bytes (power of two)
U32 block_shift;       // smallest shift with (1 << block_shift) >= block_align
S32 actual_bytes_free; // bytes not currently handed out (updated by alloc/free)
#ifdef GFXALLOC_CHECK
// block_info census, only maintained for consistency checking:
// num_blocks == num_unused + num_alloc + num_free is asserted throughout
int num_blocks;
int num_unused;
int num_alloc;
int num_free;
#endif
GDrawHandleCache *cache; // handle cache consulted by the debug validation helpers
gfx_block_info *unused_list; // next unused block_info (single-linked list)
gfx_block_info *hash[GFXALLOC_HASH_SIZE]; // allocated blocks
// Declared with one element, but the creation function allocates extra
// block_infos directly behind the struct and indexes them through this array.
gfx_block_info blocks[1]; // first block is head of free list AND head of physical block list (sentinel)
} gfx_allocator;
// about 1k (32bit), 2k (64bit) with 256 hash buckets (the default). dominated by hash table.
// GFXALLOC_IF_CHECK(x) emits x only when GFXALLOC_CHECK is defined. It wraps
// the statements that maintain or verify the num_* census counters, which only
// exist in the allocator struct under the same define.
#ifdef GFXALLOC_CHECK
#define GFXALLOC_IF_CHECK(x) x
#else
#define GFXALLOC_IF_CHECK(x)
#endif
// Maps a block address to its bucket in alloc->hash.
// The key is the block's offset from the start of the pool in units of the
// block alignment; it is scrambled and reduced to [0, GFXALLOC_HASH_SIZE).
static U32 gfxalloc_get_hash_code(gfx_allocator *alloc, void *ptr)
{
   U32 h = (U32) (((U8 *) ptr - alloc->mem_base) >> alloc->block_shift);

   // integer hash function by Bob Jenkins (http://burtleburtle.net/bob/hash/integer.html)
   // It only uses shifts, adds and xors: integer multiplies are slow on PPC and
   // large literal constants take several instructions to set up on RISC CPUs.
   h = h - (h << 6);
   h = h ^ (h >> 17);
   h = h - (h << 9);
   h = h ^ (h << 4);
   h = h - (h << 3);
   h = h ^ (h << 10);
   h = h ^ (h >> 15);

   return h & (GFXALLOC_HASH_SIZE - 1);
}
#if defined(SUPERDEBUG) || defined(COMPLETE_DEBUG)
#include <stdlib.h>
// Capacity of the scratch table used by the debug heap validators; they assert
// that 2*max_allocs+1 stays below this.
#define MAX_REGIONS 8192
// Address range [begin, end) of one block, as recorded by the validators.
// NOTE(review): addresses are stored as U32, so they are truncated when
// pointers are wider than 32 bits - confirm this is acceptable for the
// platforms that enable these checks.
typedef struct
{
U32 begin,end;
} gfx_region;
// Scratch table shared by the validators; refilled from scratch on every check.
static gfx_region region[MAX_REGIONS];
static int region_sort(const void *p, const void *q)
{
U32 a = *(U32*)p;
U32 b = *(U32*)q;
if (a < b) return -1;
if (a > b) return 1;
return 0;
}
// Debug validation: gathers the address range of every allocated block (via
// the hash table) and every free block (via the free list), sorts them by
// start address and asserts that each range ends exactly where the next one
// begins, i.e. that the recorded blocks neither overlap nor leave gaps.
static void gfxalloc_check1(gfx_allocator *alloc)
{
   int bucket, idx, count = 0;
   gfx_block_info *blk;

   assert(alloc->max_allocs*2+1 < MAX_REGIONS);

   // allocated blocks: walk every hash bucket chain
   for (bucket = 0; bucket < GFXALLOC_HASH_SIZE; ++bucket) {
      for (blk = alloc->hash[bucket]; blk; blk = blk->next) {
         region[count].begin = (UINTa) blk->ptr;
         region[count].end = region[count].begin + blk->size;
         ++count;
      }
   }

   // free blocks: walk the circular free list until we are back at the sentinel
   for (blk = alloc->blocks[0].next; blk != &alloc->blocks[0]; blk = blk->next) {
      region[count].begin = (UINTa) blk->ptr;
      region[count].end = region[count].begin + blk->size;
      ++count;
   }

   qsort(region, count, sizeof(region[0]), region_sort);

   for (idx = 0; idx+1 < count; ++idx) {
      assert(region[idx].end == region[idx+1].begin);
   }
}
#else
#define gfxalloc_check1(a)
#endif
#ifdef COMPLETE_DEBUG
static void verify_against_blocks(int num_regions, void *vptr, S32 len)
{
U32 *ptr = (U32 *) vptr;
// binary search for ptr amongst regions
S32 s=0,e=num_regions-1;
assert(len != 0);
while (s < e) {
S32 i = (s+e+1)>>1;
// invariant: b[s] <= ptr <= b[e]
if (region[i].begin <= (UINTa) ptr)
s = i;
else
e = i-1;
// consider cases:
// s=0,e=1: i = 0, how do we get i to be 1?
}
// at this point, s >= e
assert(s < num_regions && region[s].begin == (UINTa) ptr && (UINTa) ptr+len <= region[s].end);
}
// Exhaustive debug validation of the heap.
//   1. records every allocated and free block in the region table
//   2. asserts that no block is empty and that, once sorted, the blocks are
//      contiguous (each one ends where the next begins)
//   3. if 'ptr' is non-NULL, asserts that [ptr, ptr+len) matches a block
//   4. if the allocator has a handle cache, asserts the same for the storage
//      of every handle on the cache's 'head' and 'active' lists, except for
//      handles whose raw_ptr equals 'skip'
static void debug_complete_check(gfx_allocator *alloc, void *ptr, S32 len, void *skip)
{
   GDrawHandleCache *cache = alloc->cache;
   gfx_block_info *blk;
   GDrawHandle *handle;
   int bucket, idx, count = 0;

   assert(alloc->max_allocs*2+1 < MAX_REGIONS);

   // allocated blocks, found through the hash table
   for (bucket = 0; bucket < GFXALLOC_HASH_SIZE; ++bucket) {
      for (blk = alloc->hash[bucket]; blk; blk = blk->next) {
         region[count].begin = (UINTa) blk->ptr;
         region[count].end = region[count].begin + blk->size;
         ++count;
      }
   }

   // free blocks, found through the circular free list
   for (blk = alloc->blocks[0].next; blk != &alloc->blocks[0]; blk = blk->next) {
      region[count].begin = (UINTa) blk->ptr;
      region[count].end = region[count].begin + blk->size;
      ++count;
   }

   for (idx = 0; idx < count; ++idx)
      assert(region[idx].end > region[idx].begin);

   qsort(region, count, sizeof(region[0]), region_sort);

   for (idx = 0; idx+1 < count; ++idx) {
      assert(region[idx].end == region[idx+1].begin);
   }

   if (ptr)
      verify_against_blocks(count, ptr, len);

   if (cache) {
      for (handle = cache->head; handle; handle = handle->next) {
         if (handle->raw_ptr && handle->raw_ptr != skip)
            verify_against_blocks(count, handle->raw_ptr, handle->bytes);
      }
      for (handle = cache->active; handle; handle = handle->next) {
         if (handle->raw_ptr && handle->raw_ptr != skip)
            verify_against_blocks(count, handle->raw_ptr, handle->bytes);
      }
   }
}
#else
#define debug_complete_check(a,p,len,s)
#endif
#ifdef GFXALLOC_CHECK
// Census check (GFXALLOC_CHECK builds only): counts the entries on the unused
// list and on the free list and asserts that they match the allocator's
// num_unused / num_free counters, and that the counters add up to num_blocks.
static void gfxalloc_check2(gfx_allocator *alloc)
{
   gfx_block_info *blk;
   int count;

   // unused list: single-linked, NULL-terminated
   count = 0;
   for (blk = alloc->unused_list; blk; blk = blk->next)
      ++count;
   GFXALLOC_ASSERT(count == alloc->num_unused);

   // free list: circular, terminated by the sentinel
   count = 0;
   for (blk = alloc->blocks->next; blk != alloc->blocks; blk = blk->next)
      ++count;
   GFXALLOC_ASSERT(count == alloc->num_free);

   GFXALLOC_ASSERT(alloc->num_blocks == alloc->num_unused + alloc->num_free + alloc->num_alloc);
}
#define gfxalloc_check(a) do { gfxalloc_check1(a); gfxalloc_check2(a); } while(0)
#else
#define gfxalloc_check2(a)
#define gfxalloc_check(a)
#endif
// Takes the first block_info off the unused list, clears its is_unused flag
// and returns it. The caller must make sure the list is not empty; this is
// only asserted, not handled.
static gfx_block_info *gfxalloc_pop_unused(gfx_allocator *alloc)
{
   gfx_block_info *taken;

   GFXALLOC_ASSERT(alloc->unused_list != NULL);
   GFXALLOC_ASSERT(alloc->unused_list->is_unused);
   GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_unused);)

   taken = alloc->unused_list;
   alloc->unused_list = taken->next;
   // the list is expected to still hold at least one entry afterwards
   GFXALLOC_ASSERT(alloc->unused_list);

   taken->is_unused = 0;
   GFXALLOC_IF_CHECK(--alloc->num_unused;)
   return taken;
}
// Returns block_info 'b' to the front of the unused list and flags it as
// unused. 'b' must not already be on that list.
static void gfxalloc_push_unused(gfx_allocator *alloc, gfx_block_info *b)
{
   gfx_block_info *old_top = alloc->unused_list;

   GFXALLOC_ASSERT(!b->is_unused);

   b->next = old_top;
   b->is_unused = 1;
   alloc->unused_list = b;
   GFXALLOC_IF_CHECK(++alloc->num_unused;)
}
// Marks 'b' as free and links it into the circular free list directly behind
// the sentinel (alloc->blocks[0]), i.e. at the front of the list.
static void gfxalloc_add_free(gfx_allocator *alloc, gfx_block_info *b)
{
   gfx_block_info *sentinel = alloc->blocks;
   gfx_block_info *old_first = sentinel->next; // the sentinel itself when the list is empty

   b->is_free = 1;
   b->prev = sentinel;
   b->next = old_first;
   old_first->prev = b;
   sentinel->next = b;
   GFXALLOC_IF_CHECK(++alloc->num_free;)
}
// Clears the free flag of 'b' and unlinks it from the free list. The links of
// 'b' itself are left as they were.
static void gfxalloc_rem_free(gfx_allocator *alloc, gfx_block_info *b)
{
   gfx_block_info *before = b->prev;
   gfx_block_info *after = b->next;

   RR_UNUSED_VARIABLE(alloc); // only referenced when GFXALLOC_CHECK is defined

   b->is_free = 0;
   before->next = after;
   after->prev = before;
   GFXALLOC_IF_CHECK(--alloc->num_free;)
}
// Splits free block 'b' in two at byte offset 'pos' (0 < pos < b->size).
// 'b' keeps the first 'pos' bytes; the remainder becomes a new free block that
// follows 'b' in the physical block list and is added to the free list.
// Consumes one block_info from the unused list.
static void gfxalloc_split_free(gfx_allocator *alloc, gfx_block_info *b, U32 pos)
{
   gfx_block_info *rest = gfxalloc_pop_unused(alloc);
   gfx_block_info *after;

   GFXALLOC_ASSERT(b->is_free);
   GFXALLOC_ASSERT(pos > 0 && pos < b->size);

   // describe the remainder and splice it in between 'b' and its successor
   after = b->next_phys;
   rest->ptr = b->ptr + pos;
   rest->size = b->size - pos;
   rest->prev_phys = b;
   rest->next_phys = after;
   after->prev_phys = rest;
   assert(rest->size != 0);
   gfxalloc_add_free(alloc, rest);

   // shrink the original block to the front part
   b->next_phys = rest;
   b->size = pos;
   assert(b->size != 0);

   debug_complete_check(alloc, rest->ptr, rest->size,0);
   debug_complete_check(alloc, b->ptr, b->size,0);
}
// Creates an allocator that manages the 'mem_size' bytes starting at 'mem'.
//   mem        - start of the range to manage; only its address is used here
//   mem_size   - size of the range in bytes. Block sizes are stored in a
//                30-bit field, so the range has to be smaller than 1 GiB for
//                the initial free block to describe all of it.
//   align      - allocation granularity; must be a non-zero power of two
//   max_allocs - maximum number of allocations that will be live at once
// The bookkeeping is allocated with IggyGDrawMalloc. Returns NULL if 'align'
// is invalid or that allocation fails.
static gfx_allocator *gfxalloc_create(void *mem, U32 mem_size, U32 align, U32 max_allocs)
{
   gfx_allocator *a;
   U32 i, max_blocks, size;

   if (!align || (align & (align - 1)) != 0) // align must be >0 and a power of 2
      return NULL;

   // for <= max_allocs live allocs, there's <= 2*max_allocs+1 blocks. worst case:
   //   [free][used][free] .... [free][used][free]
   max_blocks = max_allocs * 2 + 1;

   // gfx_allocator already contains blocks[0] (the sentinel), so the array ends
   // up with max_blocks+1 entries: valid indices are 0..max_blocks.
   size = sizeof(gfx_allocator) + max_blocks * sizeof(gfx_block_info);
   a = (gfx_allocator *) IggyGDrawMalloc(size);
   if (!a)
      return NULL;

   memset(a, 0, size);

   GFXALLOC_IF_CHECK(a->num_blocks = max_blocks;)
   GFXALLOC_IF_CHECK(a->num_alloc = 0;)
   GFXALLOC_IF_CHECK(a->num_free = 1;)
   GFXALLOC_IF_CHECK(a->num_unused = max_blocks-1;)

   GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(a->num_blocks == a->num_alloc + a->num_free + a->num_unused);)
   GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(a->num_free <= a->num_blocks+1);)

   a->actual_bytes_free = mem_size;
   a->mem_base = (U8 *) mem;
   a->mem_end = a->mem_base + mem_size;
   a->max_allocs = max_allocs;
   a->block_align = align;
   a->block_shift = 0;

   // block_shift = log2(block_align)
   while ((1u << a->block_shift) < a->block_align)
      a->block_shift++;

   // init sentinel block
   a->blocks[0].prev = a->blocks[0].next = &a->blocks[1]; // point to free block
   a->blocks[0].prev_phys = a->blocks[0].next_phys = &a->blocks[1]; // same

   // init first free block
   a->blocks[1].ptr = a->mem_base;
   a->blocks[1].prev = a->blocks[1].next = &a->blocks[0];
   a->blocks[1].prev_phys = a->blocks[1].next_phys = &a->blocks[0];
   a->blocks[1].is_free = 1;
   a->blocks[1].size = mem_size;

   // init "unused" list from blocks[2..max_blocks].
   // With max_allocs == 0 there are no such entries (the array only holds the
   // sentinel and the initial free block), so the list has to stay empty;
   // touching blocks[2] in that case would write past the allocation.
   if (max_blocks >= 2) {
      a->unused_list = a->blocks + 2;
      for (i=2; i < max_blocks; i++) {
         a->blocks[i].is_unused = 1;
         a->blocks[i].next = a->blocks + (i + 1);
      }
      // last entry terminates the list; its 'next' is still NULL from the memset
      a->blocks[i].is_unused = 1;
   }

   gfxalloc_check(a);
   debug_complete_check(a, NULL, 0,0);
   return a;
}
// Allocates 'size_in_bytes' bytes (rounded up to the allocator's block
// alignment) using a best-fit search over the free list.
// Returns the start address of the allocation, or NULL if no free block is
// large enough, if the request is zero bytes, or if rounding the size up
// overflowed.
static void *gfxalloc_alloc(gfx_allocator *alloc, U32 size_in_bytes)
{
   gfx_block_info *cur, *best = NULL;
   U32 i, best_wasted = ~0u;
   U32 size = size_in_bytes;

   debug_complete_check(alloc, NULL, 0,0);
   gfxalloc_check(alloc);
   GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_blocks == alloc->num_alloc + alloc->num_free + alloc->num_unused);)
   GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_free <= alloc->num_blocks+1);)

   // round up to multiple of our block alignment
   size = (size + alloc->block_align-1) & ~(alloc->block_align - 1);
   assert(size >= size_in_bytes);
   assert(size != 0);

   // The asserts above may be compiled out. A zero-sized (or wrapped-around)
   // request must not reach the split below: splitting at position 0 would
   // leave a zero-sized block behind and break the heap invariants.
   if (size < size_in_bytes || size == 0)
      return NULL;

   // find best fit among all free blocks. this is O(N)!
   for (cur = alloc->blocks[0].next; cur != alloc->blocks; cur = cur->next) {
      if (cur->size >= size) {
         U32 wasted = cur->size - size;
         if (wasted < best_wasted) {
            best_wasted = wasted;
            best = cur;
            if (!wasted) break; // can't get better than perfect
         }
      }
   }

   // return the best fit, if we found any suitable block
   if (best) {
      debug_check_overlap(alloc->cache, best->ptr, best->size);
      // split off allocated part
      if (size != best->size)
         gfxalloc_split_free(alloc, best, size);
      debug_complete_check(alloc, best->ptr, best->size,0);

      // remove from free list and add to allocated hash table
      GFXALLOC_ASSERT(best->size == size);
      gfxalloc_rem_free(alloc, best);

      i = gfxalloc_get_hash_code(alloc, best->ptr);
      best->next = alloc->hash[i];
      alloc->hash[i] = best;
      alloc->actual_bytes_free -= size;
      GFXALLOC_ASSERT(alloc->actual_bytes_free >= 0);

      GFXALLOC_IF_CHECK(++alloc->num_alloc;)
      GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_blocks == alloc->num_alloc + alloc->num_free + alloc->num_unused);)
      GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_free <= alloc->num_blocks+1);)

      debug_complete_check(alloc, best->ptr, best->size,0);
      gfxalloc_check(alloc);
      debug_check_overlap(alloc->cache, best->ptr, best->size);
      return best->ptr;
   } else
      return NULL; // not enough space!
}
// Releases the allocation that starts at 'ptr'.
// The block is looked up in the allocated-block hash table, removed from it,
// and then merged with its physical neighbours where those are free, so that
// no two adjacent free blocks remain. If 'ptr' is not the start of a live
// allocation the call asserts (when GFXALLOC_ASSERT is enabled) and otherwise
// does nothing.
static void gfxalloc_free(gfx_allocator *alloc, void *ptr)
{
GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_blocks == alloc->num_alloc + alloc->num_free + alloc->num_unused);)
GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_free <= alloc->num_blocks+1);)
// find the block in the hash table
// (prevnext tracks the link that points at the current entry, so the entry
// can be unlinked from the single-linked bucket chain later)
gfx_block_info *b, *t, **prevnext;
U32 i = gfxalloc_get_hash_code(alloc, ptr);
prevnext = &alloc->hash[i];
b = alloc->hash[i];
while (b) {
if (b->ptr == ptr) break;
prevnext = &b->next;
b = b->next;
}
if (!b) {
GFXALLOC_ASSERT(0); // trying to free a non-allocated block
return;
}
debug_complete_check(alloc, b->ptr, b->size, 0);
GFXALLOC_IF_CHECK(--alloc->num_alloc;)
// remove it from the hash table
*prevnext = b->next;
alloc->actual_bytes_free += b->size;
// merge with previous block if it's free, else add it to free list
// (the sentinel is never flagged free, so the first and last physical blocks
// simply don't merge across the ends of the pool)
t = b->prev_phys;
if (t->is_free) {
// the previous block absorbs b; b's block_info becomes unused and we
// continue with the merged block
t->size += b->size;
t->next_phys = b->next_phys;
t->next_phys->prev_phys = t;
gfxalloc_push_unused(alloc, b);
b = t;
} else
gfxalloc_add_free(alloc, b);
// try to merge with next block
t = b->next_phys;
if (t->is_free) {
// b absorbs the next block, whose block_info leaves the free list and
// becomes unused
b->size += t->size;
b->next_phys = t->next_phys;
t->next_phys->prev_phys = b;
gfxalloc_rem_free(alloc, t);
gfxalloc_push_unused(alloc, t);
}
// 'ptr' is passed as the skip argument: the handle that owned this storage may
// still reference it at this point
debug_complete_check(alloc, 0, 0, ptr);
gfxalloc_check(alloc);
GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_blocks == alloc->num_alloc + alloc->num_free + alloc->num_unused);)
GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_free <= alloc->num_blocks+1);)
}
#ifdef GDRAW_MANAGE_MEM_TWOPOOL
// Returns true when nothing is allocated from 'alloc', i.e. when the free
// list consists of exactly one block that spans the entire pool.
static rrbool gfxalloc_is_empty(gfx_allocator *alloc)
{
   gfx_block_info *sentinel = alloc->blocks;
   gfx_block_info *only = sentinel->next;

   // reject both "no free block at all" and "more than one free block"
   if (only == sentinel || only->next != sentinel)
      return false;

   // the single free block has to cover [mem_base, mem_end)
   return only->ptr == alloc->mem_base && only->ptr + only->size == alloc->mem_end;
}
// Returns true when 'ptr' lies inside the memory range managed by 'alloc'
// (mem_base inclusive, mem_end exclusive).
static rrbool gfxalloc_mem_contains(gfx_allocator *alloc, void *ptr)
{
   U8 *addr = (U8 *) ptr;

   if (addr < alloc->mem_base)
      return false;
   return addr < alloc->mem_end;
}
#endif
#ifdef GDRAW_DEBUG
// Debug helper: prints one line per block in physical (address) order, giving
// the block's address range, whether it is allocated or free, and its size.
static void gfxalloc_dump(gfx_allocator *alloc)
{
   // indexed by the block's is_free flag
   static const char *label[] = {
      "allocated",
      "free",
   };
   gfx_block_info *blk = alloc->blocks[0].next_phys;

   while (blk != alloc->blocks) {
      U8 *first = blk->ptr;
      U8 *past_end = first + blk->size;
      printf("%p-%p: %s (%d bytes)\n", first, past_end, label[blk->is_free], blk->size);
      blk = blk->next_phys;
   }
}
#endif
#endif
#ifdef GDRAW_DEFRAGMENT
#define GDRAW_DEFRAGMENT_may_overlap 1 // self-overlap for individual copies is OK
// Defragmentation code for graphics memory.
// The platform implementation must provide a GPU memcpy function and handle all necessary
// synchronization. It must also adjust its resource descriptors to match the new addresses
// after defragmentation.
static void gdraw_gpu_memcpy(GDrawHandleCache *c, void *dst, void *src, U32 num_bytes);
// Moves 'num_bytes' bytes of graphics memory from 'src' to 'dst' using the
// platform's gdraw_gpu_memcpy and accounts for the move in 'stats'.
//   flags - if GDRAW_DEFRAGMENT_may_overlap is set, the platform copy is
//           trusted to handle overlapping ranges and a single copy is issued.
// Otherwise an overlapping move is broken into chunks no larger than the
// distance between src and dst, so that no individual copy overlaps itself.
// The chunks are issued in the order that keeps not-yet-copied source bytes
// intact; this relies on the copies taking effect in issue order.
// Does nothing when dst == src.
static void gdraw_Defragment_memmove(GDrawHandleCache *c, U8 *dst, U8 *src, U32 num_bytes, U32 flags, GDrawStats *stats)
{
   if (dst == src)
      return;

   assert(num_bytes != 0);

   stats->nonzero_flags |= GDRAW_STATS_defrag;
   stats->defrag_objects += 1;
   stats->defrag_bytes += num_bytes;

   if ((flags & GDRAW_DEFRAGMENT_may_overlap) || dst + num_bytes <= src || src + num_bytes <= dst) {
      // no problematic overlap
      gdraw_gpu_memcpy(c, dst, src, num_bytes);
   } else if (dst < src) {
      // moving down: copy front to back, each chunk only overwrites source
      // bytes that have already been copied
      U32 chunk_size = (U32) (src - dst);
      U32 pos = 0;
      while (pos < num_bytes) {
         U32 amount = num_bytes - pos;
         if (amount > chunk_size) amount = chunk_size;
         gdraw_gpu_memcpy(c, dst + pos, src + pos, amount);
         pos += amount;
      }
   } else {
      // moving up: a front-to-back copy would clobber the tail of the source
      // before it has been read, so copy back to front instead
      U32 chunk_size = (U32) (dst - src);
      U32 remaining = num_bytes;
      while (remaining > 0) {
         U32 amount = remaining;
         if (amount > chunk_size) amount = chunk_size;
         remaining -= amount;
         gdraw_gpu_memcpy(c, dst + remaining, src + remaining, amount);
      }
   }
}
// Returns true when defragmenting the cache's pool can actually gain
// something, which is the case exactly when there is more than one free block.
static rrbool gdraw_CanDefragment(GDrawHandleCache *c)
{
   // gfxalloc coalesces free blocks immediately and keeps them in a circular
   // list headed by the sentinel. With zero or one free block, following
   // "next" twice from the sentinel lands on the sentinel again; with two or
   // more it does not.
   gfx_block_info *sentinel = c->alloc->blocks;
   gfx_block_info *first_free = sentinel->next;

   return first_free->next != sentinel;
}
// Compacts all allocated blocks of the cache's pool towards mem_base, leaving
// at most one free block at the end of the pool.
//   c     - handle cache whose allocator (c->alloc) is defragmented
//   flags - forwarded to gdraw_Defragment_memmove for every block move
//   stats - receives the defrag counters updated by the moves
// Every handle whose storage moves gets its raw_ptr updated here; anything
// else that depends on the address is not handled by this function.
static void gdraw_DefragmentMain(GDrawHandleCache *c, U32 flags, GDrawStats *stats)
{
gfx_allocator *alloc = c->alloc;
gfx_block_info *b, *n;
U8 *p;
S32 i;
GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_blocks == alloc->num_alloc + alloc->num_free + alloc->num_unused);)
GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_free <= alloc->num_blocks+1);)
// go over all allocated memory blocks and clear the "prev" pointer
// (unused for allocated blocks, we'll use it to store a back-pointer to the corresponding handle)
for (b = alloc->blocks[0].next_phys; b != alloc->blocks; b=b->next_phys)
if (!b->is_free)
b->prev = NULL;
// go through all handles and store a pointer to the handle in the corresponding memory block
for (i=0; i < c->max_handles; i++)
if (c->handle[i].raw_ptr) {
assert(c->handle[i].bytes != 0);
// look the handle's storage up in its hash bucket
for (b=alloc->hash[gfxalloc_get_hash_code(alloc, c->handle[i].raw_ptr)]; b; b=b->next)
if (b->ptr == c->handle[i].raw_ptr) {
// the handle pointer is smuggled through 'prev' via void*; it is cast
// back to GDrawHandle* when the block is moved below
void *block = &c->handle[i];
b->prev = (gfx_block_info *) block;
break;
}
GFXALLOC_ASSERT(b != NULL); // didn't find this block anywhere!
}
// clear alloc hash table (we rebuild it during defrag)
memset(alloc->hash, 0, sizeof(alloc->hash));
// defragmentation proper: go over all blocks again, remove all free blocks from the physical
// block list and compact the remaining blocks together.
// p is the address where the next allocated block should end up.
p = alloc->mem_base;
for (b = alloc->blocks[0].next_phys; b != alloc->blocks; b=n) {
// fetch the successor first: b may be unlinked and recycled below
n = b->next_phys;
if (!b->is_free) {
U32 h;
// move block if necessary
if (p != b->ptr) {
assert(b->size != 0);
gdraw_Defragment_memmove(c, p, b->ptr, b->size, flags, stats);
b->ptr = p;
assert(b->prev);
// blocks without a recorded handle are still moved, there is just
// no raw_ptr to patch up
if (b->prev)
((GDrawHandle *) b->prev)->raw_ptr = p;
}
// re-insert into hash table
h = gfxalloc_get_hash_code(alloc, p);
b->next = alloc->hash[h];
alloc->hash[h] = b;
p += b->size;
} else {
// free block: remove it from the physical block list
b->prev_phys->next_phys = b->next_phys;
b->next_phys->prev_phys = b->prev_phys;
gfxalloc_rem_free(alloc, b);
gfxalloc_push_unused(alloc, b);
}
}
// the free list should be empty now
assert(alloc->blocks[0].next == &alloc->blocks[0]);
// unless all memory is allocated, we now need to add a new block for the free space at the end
if (p != alloc->mem_end) {
b = gfxalloc_pop_unused(alloc);
b->ptr = p;
// append behind the current last physical block
b->prev_phys = alloc->blocks[0].prev_phys;
b->next_phys = &alloc->blocks[0];
b->prev_phys->next_phys = b;
b->next_phys->prev_phys = b;
b->size = alloc->mem_end - p;
gfxalloc_add_free(alloc, b);
}
GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_blocks == alloc->num_alloc + alloc->num_free + alloc->num_unused);)
GFXALLOC_IF_CHECK(GFXALLOC_ASSERT(alloc->num_free <= alloc->num_blocks+1);)
}
#endif
#ifdef GDRAW_MANAGE_MEM_TWOPOOL
// Defragmentation code for graphics memory, using two-pool strategy.
//
// The platform implementation must provide a GPU memcpy function and handle
// all necessary synchronization. It must also adjust its resource descriptors
// to match the new addresses after defragmentation.
//
// The high concept for two-pool is that we can't update the resource pools
// mid-frame; instead, while preparing for a frame, we need to produce a memory
// configuration that is suitable for rendering a whole frame at once (in
// contrast to our normal incremental strategy, where we can decide to
// defragment mid-frame if things are getting desperate). This is for tiled
// renderers.
//
// Two-pool works like this:
// - As the name suggests, each handle cache has two memory pools and corresponding backing
// allocators. The currently used allocator, "alloc", and a second allocator, "alloc_other".
// - Any resource used in a command buffer gets locked and *stays locked* until we're done
// preparing that command buffer (i.e. no unlocking after every draw as in the normal
// incremental memory management).
// - All allocations happen from "alloc", always. We mostly do our normal LRU cache freeing
// to make space when required.
// - We can still run out of space (no surprise) and get into a configuration where we have
// to defragment. This is the only tricky part, and where the second pool comes in. To
// defragment, we switch the roles of "alloc" and "alloc_other", and allocate new backing
// storage for all currently "locked" and "pinned" resources (i.e. everything we've used
// in the currently pending frame).
// - In general, we have the invariant that all resources we're using for batches we're
// working on must be in the "alloc" (fresh) pool, not in the "other" (stale) pool.
// Therefore, after a defragment/pool switch, any "live" resource (which means it's
// present in the stale pool) has to be copied to the "fresh" pool as it's getting
// locked to maintain this invariant.
//
// What this does is give us a guarantee that any given frame either only
// references resources in one pool (the common case), or does a defragment, in
// which case it looks like this:
//
// +------------------------------+
// | |
// | | pool A is fresh (=alloc), pool B is stale (=alloc_other)
// | | all resources referenced in here are in pool A
// | |
// | |
// | |
// +------------------------------+ <-- defragment! pools flip roles here
// | |
// | |
// | | pool B is fresh (=alloc), pool A is stale (=alloc_other)
// | | all resources referenced in here are in pool B
// | |
// +------------------------------+
//
// Now, at the end of the frame, we need to decide what to do with the
// resources that remain "live" (i.e. they're in the old pool but weren't
// referenced in the current frame so they didn't get copied). As of this
// writing, we simply free them, to maximize the amount of free memory in the
// new pool (and hopefully minimize the chance that we'll have to defragment
// again soon). It would also be possible to copy some of them though, assuming
// there's enough space.
//
// Freeing resources is an interesting case. When the CPU side of GDraw does a
// "free", we can't immediately reclaim the resource memory, since the GPU will
// generally still have outstanding commands that reference that resource. So
// our freed resources first enter the "Dead" state and only actually get freed
// once the GPU is done with them. What this means is that the list of
// resources in the "dead" state can end up holding references to both the
// fresh and the stale pool; the free implementation needs to be aware of this
// and return the memory to the right allocator.
//
// When we defragment, it's important to make sure that the pool we're flipping
// to is actually empty. What this means is that right before a defragment, we
// need to wait for all stale "dead" resources to actually become free. If the
// last defragment was several frames ago, this is fast - we haven't generated
// any new commands referencing the stale resources in several frames, so most
// likely they're all immediately free-able. By contrast, if we just
// defragmented last frame, this will be a slow operation since we need to wait
// for the GPU pipeline to drain - but if you're triggering defragments in
// several consecutive frames, you're thrashing the resource pools badly and
// are getting really bad performance anyway.
static void gdraw_gpu_memcpy(GDrawHandleCache *c, void *dst, void *src, U32 num_bytes);
static void gdraw_gpu_wait_for_transfer_completion();
static void gdraw_resource_moved(GDrawHandle *t);
// Two-pool variant: returns true when a defragment is both possible (a second
// pool exists) and worthwhile (the current pool has more than one free block).
static rrbool gdraw_CanDefragment(GDrawHandleCache *c)
{
   gfx_block_info *sentinel = c->alloc->blocks;

   // without a second pool there is nothing to flip to
   if (!c->alloc_other)
      return false;

   // gfxalloc coalesces free blocks immediately and keeps them in a circular
   // list headed by the sentinel. With zero or one free block, following
   // "next" twice from the sentinel lands on the sentinel again; with two or
   // more it does not.
   return sentinel->next->next != sentinel;
}
// Moves the storage of handle 't' from the stale pool (alloc_other) into the
// fresh pool (alloc): allocates new storage, copies the contents with
// gdraw_gpu_memcpy, frees the old storage and notifies the platform through
// gdraw_resource_moved. Returns false, leaving the handle untouched, if the
// fresh pool has no room for it.
static rrbool gdraw_MigrateResource(GDrawHandle *t, GDrawStats *stats)
{
   GDrawHandleCache *cache = t->cache;
   void *fresh;

   assert(t->state == GDRAW_HANDLE_STATE_live || t->state == GDRAW_HANDLE_STATE_locked || t->state == GDRAW_HANDLE_STATE_pinned);
   // anything we migrate should be in the "other" (old) pool
   assert(gfxalloc_mem_contains(cache->alloc_other, t->raw_ptr));

   fresh = gfxalloc_alloc(cache->alloc, t->bytes);
   if (!fresh)
      return false;

   // account for the move
   stats->nonzero_flags |= GDRAW_STATS_defrag;
   stats->defrag_objects += 1;
   stats->defrag_bytes += t->bytes;

   // copy the contents over, then give the old storage back to the stale pool
   gdraw_gpu_memcpy(cache, fresh, t->raw_ptr, t->bytes);
   gfxalloc_free(cache->alloc_other, t->raw_ptr);

   // point the handle at its new home and let the platform fix up descriptors
   t->raw_ptr = fresh;
   gdraw_resource_moved(t);
   return true;
}
// Migrates every handle on the circular list headed by 'sentinel' into the
// fresh pool. Stops at the first handle that cannot be migrated and returns
// false; returns true once the whole list has been processed.
static rrbool gdraw_MigrateAllResources(GDrawHandle *sentinel, GDrawStats *stats)
{
   GDrawHandle *cur = sentinel->next;

   while (cur != sentinel) {
      if (!gdraw_MigrateResource(cur, stats))
         return false;
      cur = cur->next;
   }
   return true;
}
// Performs a two-pool "defragment": flips the roles of the cache's two
// allocators and immediately moves every pinned and locked resource into the
// pool that has just become current. Returns false as soon as one of those
// migrations fails (the allocators stay swapped in that case).
static rrbool gdraw_TwoPoolDefragmentMain(GDrawHandleCache *c, GDrawStats *stats)
{
   // swap allocators: the old current pool becomes the stale one
   gfx_allocator *stale = c->alloc;
   c->alloc = c->alloc_other;
   c->alloc_other = stale;

   // immediately migrate all currently pinned and locked resources; locked
   // resources are only attempted if every pinned one made it
   if (!gdraw_MigrateAllResources(&c->state[GDRAW_HANDLE_STATE_pinned], stats))
      return false;
   if (!gdraw_MigrateAllResources(&c->state[GDRAW_HANDLE_STATE_locked], stats))
      return false;
   return true;
}
// Returns true when the circular handle list headed by 'head' has no entries,
// i.e. when the sentinel's successor is the sentinel itself.
static rrbool gdraw_StateListIsEmpty(GDrawHandle *head)
{
   GDrawHandle *first = head->next;

   return first == head;
}
// Debug check (GDRAW_DEBUG builds only, otherwise a no-op): asserts that the
// storage of every handle on the list headed by 'head' lies inside the pool
// of its cache's current allocator.
static void gdraw_CheckAllPointersUpdated(GDrawHandle *head)
{
#ifdef GDRAW_DEBUG
   GDrawHandle *cur = head->next;

   while (cur != head) {
      assert(gfxalloc_mem_contains(cur->cache->alloc, cur->raw_ptr));
      cur = cur->next;
   }
#endif
}
// End-of-scene cleanup for the two-pool scheme. If a defragment happened
// during this scene (c->did_defragment), drops every resource that was not
// carried over into the new pool, sanity-checks the ones that were, and then
// waits for the outstanding GPU transfers. Does nothing otherwise.
static void gdraw_PostDefragmentCleanup(GDrawHandleCache *c, GDrawStats *stats)
{
   GDrawHandle *live_list;

   if (!c->did_defragment)
      return;

   // alloc list should be empty at this point
   assert(gdraw_StateListIsEmpty(&c->state[GDRAW_HANDLE_STATE_alloc]));

   // everything still "live" was not touched this frame, hence it is stale:
   // keep freeing the first entry until the list has drained
   live_list = &c->state[GDRAW_HANDLE_STATE_live];
   while (!gdraw_StateListIsEmpty(live_list))
      gdraw_res_free(live_list->next, stats);

   // "live" is now empty, and "alloc" was checked above. "dead" may still hold
   // objects on the old heap (freed before the allocators were swapped), and
   // "user owned" is not managed by us. That leaves "locked" and "pinned",
   // both of which had better point into the new heap only.
   gdraw_CheckAllPointersUpdated(&c->state[GDRAW_HANDLE_STATE_locked]);
   gdraw_CheckAllPointersUpdated(&c->state[GDRAW_HANDLE_STATE_pinned]);

   gdraw_gpu_wait_for_transfer_completion();
}
#endif
// Image processing code
// Compute average of 4 RGBA8888 pixels passed as U32.
// Variables are named assuming the values are stored as big-endian, but all bytes
// are treated equally, so this code will work just fine on little-endian data.
// Averages four packed 4x8-bit pixels channel by channel, rounding to nearest
// (a bias of 2 is added to every channel sum before dividing by 4).
// Two channels are processed at a time: bytes 0 and 2 in one word, bytes 1
// and 3 in another. Each channel sum has 16 bits of room and needs at most 10,
// so sums never spill into the neighbouring channel.
static U32 gdraw_Avg4_rgba8888(U32 p0, U32 p1, U32 p2, U32 p3)
{
   const U32 even_bytes = 0x00ff00ff;
   const U32 round_bias = 0x00020002;
   U32 even_sum = round_bias;
   U32 odd_sum = round_bias;

   // bytes 0 and 2 of every pixel
   even_sum += p0 & even_bytes;
   even_sum += p1 & even_bytes;
   even_sum += p2 & even_bytes;
   even_sum += p3 & even_bytes;

   // bytes 1 and 3 of every pixel, moved down into the even positions
   odd_sum += (p0 >> 8) & even_bytes;
   odd_sum += (p1 >> 8) & even_bytes;
   odd_sum += (p2 >> 8) & even_bytes;
   odd_sum += (p3 >> 8) & even_bytes;

   // divide by 4; the odd channels also move back up by 8, for a net shift of +6
   return ((even_sum >> 2) & even_bytes) | ((odd_sum << 6) & ~even_bytes);
}
// Compute average of 2 RGBA8888 pixels passed as U32
// Averages two packed 4x8-bit pixels channel by channel, rounding up.
// Uses the identity  a + b == 2*(a | b) - (a ^ b), so the rounded-up average
// is (a | b) - ((a ^ b) >> 1). The 0x7f mask removes the bit that the shift
// drags in from the neighbouring channel.
static U32 gdraw_Avg2_rgba8888(U32 p0, U32 p1)
{
   U32 either = p0 | p1;
   U32 differing = p0 ^ p1;

   return either - ((differing >> 1) & 0x7f7f7f7f);
}
// 2:1 downsample in both horizontal and vertical direction, for one line.
// width is width of destination line.
// Produces one destination line by averaging 2x2 pixel groups taken from two
// adjacent source lines.
//   dst          - destination line
//   line0, line1 - the two source lines
//   width        - number of pixels to write (the source lines are read for
//                  2*width pixels)
//   bpp          - bytes per pixel; 4 and 1 are supported, anything else
//                  triggers RR_BREAK()
// Every output pixel is written only after its four inputs have been read.
static void gdraw_Downsample_2x2_line(U8 *dst, U8 *line0, U8 *line1, U32 width, U32 bpp)
{
   U32 left;

   switch (bpp) {
   case 4: {
      U32 *top = (U32 *) line0;
      U32 *bottom = (U32 *) line1;
      U32 *out = (U32 *) dst;
      for (left = width; left != 0; --left) {
         *out++ = gdraw_Avg4_rgba8888(top[0], top[1], bottom[0], bottom[1]);
         top += 2;
         bottom += 2;
      }
      break;
   }
   case 1:
      for (left = width; left != 0; --left) {
         int sum = line0[0] + line0[1] + line1[0] + line1[1];
         *dst++ = (sum + 2) / 4; // +2 rounds to nearest
         line0 += 2;
         line1 += 2;
      }
      break;
   default:
      RR_BREAK();
   }
}
// 2:1 downsample in horizontal but not vertical direction.
// Produces one destination line by averaging horizontally adjacent pixel
// pairs of a single source line.
//   dst   - destination line
//   src   - source line, read for 2*width pixels
//   width - number of pixels to write
//   bpp   - bytes per pixel; 4 and 1 are supported, anything else triggers
//           RR_BREAK()
static void gdraw_Downsample_2x1_line(U8 *dst, U8 *src, U32 width, U32 bpp)
{
   U32 left;

   switch (bpp) {
   case 4: {
      U32 *in = (U32 *) src;
      U32 *out = (U32 *) dst;
      for (left = width; left != 0; --left) {
         *out++ = gdraw_Avg2_rgba8888(in[0], in[1]);
         in += 2;
      }
      break;
   }
   case 1:
      for (left = width; left != 0; --left) {
         int sum = src[0] + src[1];
         *dst++ = (sum + 1) / 2; // +1 rounds up
         src += 2;
      }
      break;
   default:
      RR_BREAK();
   }
}
// Halves the vertical resolution of a one-pixel-wide column; the horizontal
// resolution is left alone.
//   dst, dstpitch: destination column and the byte distance between its rows
//   src, srcpitch: source column and the byte distance between its rows
//   height:        number of *destination* rows (2*height source rows are read)
//   bpp:           bytes per pixel; 4 and 1 are handled, anything else hits RR_BREAK()
static void gdraw_Downsample_1x2(U8 *dst, S32 dstpitch, U8 *src, S32 srcpitch, U32 height, U32 bpp)
{
   U32 rows_left;

   switch (bpp) {
   case 4:
      for (rows_left = height; rows_left; --rows_left) {
         U8 *below = src + srcpitch;
         *((U32 *) dst) = gdraw_Avg2_rgba8888(*((U32 *) src), *((U32 *) below));
         dst += dstpitch;
         src += 2*srcpitch;
      }
      break;

   case 1:
      for (rows_left = height; rows_left; --rows_left) {
         // +1 rounds halves up
         *dst = (src[0] + src[srcpitch] + 1) / 2;
         dst += dstpitch;
         src += 2*srcpitch;
      }
      break;

   default:
      RR_BREAK();
      break;
   }
}
// 2:1 downsample of an image (used to build mipmaps).
//   dst:      Pointer to destination buffer
//   dstpitch: Pitch (bytes per row) of the destination buffer
//   width:    Width of the *destination* image (i.e. of the downsampled version)
//   height:   Height of the *destination* image (i.e. of the downsampled version)
//   src:      Pointer to source buffer
//   srcpitch: Pitch (bytes per row) of the source buffer
//   bpp:      Bytes per pixel of the image data (1 or 4)
//
// A destination height of 0 means the source is already a single row, so only
// the horizontal direction is reduced; a destination width of 0 means the
// source is already a single column, so only the vertical direction is reduced.
//
// Can be used for in-place resizing if src==dst and dstpitch <= srcpitch!
static GDRAW_MAYBE_UNUSED void gdraw_Downsample(U8 *dst, S32 dstpitch, U32 width, U32 height, U8 *src, S32 srcpitch, U32 bpp)
{
   U32 rows_left;

   assert(bpp == 1 || bpp == 4);
   // @TODO gamma?

   if (!height) {
      // non-square texture whose height already reached 1 in a previous step
      gdraw_Downsample_2x1_line(dst, src, width, bpp);
      return;
   }

   if (!width) {
      // non-square texture whose width already reached 1 in a previous step
      gdraw_Downsample_1x2(dst, dstpitch, src, srcpitch, height, bpp);
      return;
   }

   // regular case: every destination row is built from two source rows
   for (rows_left = height; rows_left; --rows_left) {
      gdraw_Downsample_2x2_line(dst, src, src + srcpitch, width, bpp);
      dst += dstpitch;
      src += 2*srcpitch;
   }
}
#ifndef GDRAW_NO_STREAMING_MIPGEN
#define GDRAW_MAXMIPS 16 // maximum number of mipmaps supported.
// State for streaming mipmap generation.
// gdraw_MipmapBegin fills in every field; gdraw_MipmapAddLines reads them and
// toggles bits in partial_row.
typedef struct GDrawMipmapContext {
U32 width; // width of the texture being mipmapped
U32 height; // height of the texture being mipmapped
U32 mipmaps; // number of mipmaps (at most GDRAW_MAXMIPS)
U32 bpp; // bytes per pixel
U32 partial_row; // bit N: is mipmap N currently storing a partial row?
U32 bheight; // height of the buffer at miplevel 0 (lines buffered at once; never more than the image height)
U8 *pixels[GDRAW_MAXMIPS]; // pixels[N]: start of miplevel N's line buffer inside the buffer passed to gdraw_MipmapBegin
U32 pitch[GDRAW_MAXMIPS]; // pitch[N]: bytes per buffered line of miplevel N (twice the level's row size once the level only buffers a single partial row)
} GDrawMipmapContext;
// Sets up a streaming mipmap generation context on top of a caller-provided
// scratch buffer.
//   c:           context to initialize
//   width:       width of the texture being mipmapped, in pixels
//   height:      height of the texture being mipmapped, in pixels
//   mipmaps:     number of mip levels (at most GDRAW_MAXMIPS)
//   bpp:         bytes per pixel
//   buffer:      scratch memory the per-level line buffers are carved out of
//   buffer_size: size of "buffer" in bytes
// Returns true on success. Returns false if the arguments are unusable (too
// many mip levels, zero width, zero bpp) or if not even one line of every
// level fits into the buffer. Nothing is written to *c when the arguments are
// rejected up front.
static rrbool gdraw_MipmapBegin(GDrawMipmapContext *c, U32 width, U32 height, U32 mipmaps, U32 bpp, U8 *buffer, U32 buffer_size)
{
   U32 i;
   U8 *p;

   if (mipmaps > GDRAW_MAXMIPS)
      return false;

   // a zero-sized row would make the line-count computation below divide by zero
   if (!width || !bpp)
      return false;

   c->width = width;
   c->height = height;
   c->mipmaps = mipmaps;
   c->bpp = bpp;
   c->partial_row = 0;

   // determine how many lines to buffer
   // we try to use roughly 2/3rds of the buffer for the first miplevel (less than 3/4 since with our
   // partial line buffers, we have extra buffer space for lower mip levels).
   c->bheight = (2 * buffer_size) / (3 * width * bpp);

   // round down to next-smaller power of 2 (in case we need to swizzle; swizzling works on pow2-sized blocks)
   while (c->bheight & (c->bheight-1)) // while not a power of 2...
      c->bheight &= c->bheight - 1; // clear least significant bit set

   // then keep lowering the number of buffered lines until they fit (or we reach zero, i.e. it doesn't fit)
   while (c->bheight) {
      p = buffer;
      for (i=0; i < c->mipmaps; i++) {
         U32 mw = c->width >> i;
         U32 bh = c->bheight >> i;
         if (!mw) mw++; // every level is at least one pixel wide
         if (!bh) mw *= 2, bh++; // need space for line of previous miplevel

         c->pixels[i] = p;
         c->pitch[i] = mw * bpp;
         p += c->pitch[i] * bh;
      }

      // if it fits, we're done
      if (p <= buffer + buffer_size) {
         if (c->bheight > height) // buffer doesn't need to be larger than the image!
            c->bheight = height;

         return true;
      }

      // need to try a smaller line buffer...
      c->bheight >>= 1;
   }

   // can't fit even one line into our buffer. ouch!
   return false;
}
// Generates data for mip level "level" from the lines currently buffered for
// level-1.
//   c:     context set up by gdraw_MipmapBegin
//   level: mip level to produce; must be > 0
// Returns true if data was generated for this miplevel, false otherwise (the
// level does not exist, or the incoming line was only stashed as the first half
// of a pair).
// NOTE(review): when (c->width >> level) is 0 while the level is still in the
// partial-row stage, the memcpy copies 0 bytes and the 2x2 pass writes nothing,
// yet true is returned - confirm that callers never reach this combination.
static rrbool gdraw_MipmapAddLines(GDrawMipmapContext *c, U32 level)
{
   U32 level_w, level_h;
   U8 *level_px, *parent_px;

   assert(level > 0); // doesn't make sense to call this on level 0
   if (level == 0 || level >= c->mipmaps)
      return false; // this level doesn't exist

   level_w = c->width >> level;    // buffer width at this level
   level_h = c->bheight >> level;  // buffer height at this level
   level_px = c->pixels[level];
   parent_px = c->pixels[level-1];

   // at least one full line is buffered here: regular block downsampling still works
   if (level_h) {
      gdraw_Downsample(level_px, c->pitch[level], level_w, level_h, parent_px, c->pitch[level-1], c->bpp);
      return true;
   }

   // the image has no rows left at this level: finish off with a chain of Nx1 miplevels
   if (!(c->height >> level)) {
      gdraw_Downsample_2x1_line(level_px, parent_px, level_w, c->bpp);
      return true;
   }

   // still halving vertically, but one parent line at a time: every other call
   // stashes the incoming line, the calls in between combine it with the next one.
   c->partial_row ^= (1 << level);
   if (c->partial_row & (1 << level)) {
      // nothing was stashed for this level yet - keep the parent line for later
      memcpy(level_px, parent_px, level_w * 2 * c->bpp);
      return false;
   }

   // stashed line + new parent line => one output line (written over the stash)
   gdraw_Downsample_2x2_line(level_px, level_px, parent_px, level_w, c->bpp);
   return true;
}
#endif // GDRAW_NO_STREAMING_MIPGEN
#ifdef GDRAW_CHECK_BLOCK
// Debug consistency check for the allocator's bookkeeping of "ptr".
//   alloc:     allocator to inspect
//   ptr:       block address to look for
//   allocated: expected state of the block
// Counts how often ptr shows up in the hash table chains (n) and in the list
// anchored at alloc->blocks[0] (m). An allocated block must appear exactly once
// in the hash and never in that list; otherwise it must be the other way round.
static void check_block_alloc(gfx_allocator *alloc, void *ptr, rrbool allocated)
{
   int bucket, n = 0, m = 0;
   gfx_block_info *blk;

   // walk every hash chain
   for (bucket = 0; bucket < GFXALLOC_HASH_SIZE; ++bucket)
      for (blk = alloc->hash[bucket]; blk; blk = blk->next)
         if (blk->ptr == ptr)
            ++n;

   // walk the circular list that uses blocks[0] as its sentinel
   for (blk = alloc->blocks[0].next; blk != &alloc->blocks[0]; blk = blk->next)
      if (blk->ptr == ptr)
         ++m;

   if (allocated)
      assert(n == 1 && m == 0);
   else
      assert(n == 0 && m == 1);
}
#else
#define check_block_alloc(a,p,f)
#endif
#ifdef GDRAW_BUFFER_RING
////////////////////////////////////////////////////////////////////////
//
// Buffer ring
//
// Implements a dynamic buffer backed by multiple physical buffers, with
// the usual append-only, DISCARD/NOOVERWRITE semantics.
//
// This can be used for dynamic vertex buffers, constant buffers, etc.
#define GDRAW_BUFRING_MAXSEGS 4 // max number of backing segments
// One segment of a buffer ring. gdraw_bufring_init links the segments into a
// circular list and gdraw_bufring_alloc appends allocations to the active one.
typedef struct gdraw_bufring_seg {
struct gdraw_bufring_seg *next; // next segment in ring (the last segment points back to the first)
U8 *data; // pointer to the allocation (start of this segment's memory)
GDrawFence fence; // fence for this segment: set from put_fence() when the segment is left behind, waited on before the segment is reused
U32 used; // number of bytes used
} gdraw_bufring_seg;
// A buffer ring: up to GDRAW_BUFRING_MAXSEGS equally sized segments that are
// cycled through by gdraw_bufring_alloc.
// seg_size == 0 marks a ring that has no usable memory (failed init or shut down).
typedef struct gdraw_bufring {
gdraw_bufring_seg *cur; // active ring segment (NULL after gdraw_bufring_shutdown)
U32 seg_size; // size of one segment in bytes
U32 align; // alignment of segment allocations
gdraw_bufring_seg all_segs[GDRAW_BUFRING_MAXSEGS]; // storage for the segments; only the first nsegs entries are linked into the ring
} gdraw_bufring;
// forwards
// Both functions are only declared here; their definitions are not part of this
// section (presumably supplied by the platform implementation - confirm).
// put_fence: its result is stored as the fence of the ring segment that was just filled up.
static GDrawFence put_fence();
// wait_on_fence: called on a segment's fence before that segment is reused, to
// wait until the GPU is done with it.
static void wait_on_fence(GDrawFence fence);
// Sets up "ring" to hand out memory from the block starting at ptr, split into
// equally sized segments.
//   ring:  ring to initialize
//   ptr:   backing memory; must be aligned to "align"
//   size:  size of the backing memory in bytes
//   nsegs: requested number of segments (clamped to GDRAW_BUFRING_MAXSEGS)
//   align: alignment of every segment; must be a positive power of two
// If ptr is NULL, nsegs is 0 or size < nsegs * align (checked with the
// requested, not yet clamped, nsegs), only seg_size is written (as 0) and the
// remaining fields are left untouched.
static void gdraw_bufring_init(gdraw_bufring * RADRESTRICT ring, void *ptr, U32 size, U32 nsegs, U32 align)
{
   U32 idx, bytes_per_seg;
   U8 *base = (U8 *) ptr;

   // a segment size of 0 marks the ring as unusable until setup has completed
   ring->seg_size = 0;
   if (!ptr || nsegs < 1 || size < nsegs * align) // bail if no ring buffer memory or too small
      return;

   if (nsegs > GDRAW_BUFRING_MAXSEGS)
      nsegs = GDRAW_BUFRING_MAXSEGS;

   // align needs to be a positive power of two
   assert(align >= 1 && (align & (align - 1)) == 0);

   // buffer really needs to be properly aligned
   assert(((UINTa)ptr & (align - 1)) == 0);

   // equal share per segment, rounded down so every segment start stays aligned
   bytes_per_seg = (size / nsegs) & ~(align - 1);

   for (idx = 0; idx < nsegs; ++idx) {
      gdraw_bufring_seg *seg = &ring->all_segs[idx];
      U32 successor = (idx + 1) % nsegs; // wraps around to close the ring

      seg->next = &ring->all_segs[successor];
      seg->data = base + idx * bytes_per_seg;
      seg->fence.value = 0;
      seg->used = 0;
   }

   ring->cur = &ring->all_segs[0];
   ring->seg_size = bytes_per_seg;
   ring->align = align;
}
// Marks the ring as unusable: the segment size drops to zero (so every
// non-empty allocation request is rejected) and there is no active segment
// anymore. The backing memory itself is not touched.
static void gdraw_bufring_shutdown(gdraw_bufring * RADRESTRICT ring)
{
   ring->seg_size = 0;
   ring->cur = NULL;
}
// Allocates "size" bytes with the given alignment from the ring.
//   ring:  ring to allocate from
//   size:  number of bytes requested
//   align: required alignment (power of two, at most the ring's own alignment)
// Returns a pointer into the active segment, or NULL if the ring has no usable
// memory (never initialized successfully, or shut down) or the request is
// larger than one segment.
// When the active segment is full, a fence is put for it and the ring advances
// to the next segment, waiting until the GPU is done with that one.
static void *gdraw_bufring_alloc(gdraw_bufring * RADRESTRICT ring, U32 size, U32 align)
{
   U32 align_up;
   gdraw_bufring_seg *seg;

   // A ring without memory has seg_size == 0 and no valid "cur" segment. The
   // size check alone would let a zero-sized request through and then
   // dereference that pointer, so test for the dead ring explicitly.
   if (!ring->seg_size || size > ring->seg_size)
      return NULL; // nope, won't fit

   assert(align <= ring->align);

   // check if it fits in the active segment first
   seg = ring->cur;
   align_up = (seg->used + align - 1) & ~(align - 1); // round the fill level up to the alignment

   if ((align_up + size) <= ring->seg_size) {
      void *ptr = seg->data + align_up;
      seg->used = align_up + size;
      return ptr;
   }

   // doesn't fit, we have to start a new ring segment.
   seg->fence = put_fence();

   // switch to the next segment, wait till GPU is done with it
   seg = ring->cur = seg->next;
   wait_on_fence(seg->fence);

   // allocate from the new segment. we assume that segment offsets
   // satisfy the highest alignment requirements we ever ask for!
   seg->used = size;
   return seg->data;
}
#endif
////////////////////////////////////////////////////////////////////////
//
// General resource manager
//
// GDRAW_FENCE_FLUSH() is invoked before fences are waited on or polled (see
// gdraw_res_free, gdraw_res_reap and gdraw_res_kill), so that fence values are
// current. If nothing defined it before this point, it expands to nothing.
#ifndef GDRAW_FENCE_FLUSH
#define GDRAW_FENCE_FLUSH()
#endif
#ifdef GDRAW_MANAGE_MEM
// functions the platform must implement
#ifndef GDRAW_BUFFER_RING // avoid "redundant redeclaration" warning
// called by gdraw_res_free before a resource's pool memory is handed back to the allocator
static void wait_on_fence(GDrawFence fence);
#endif
// polled by gdraw_res_reap: a dead resource is only freed once this returns false for its fence
static rrbool is_fence_pending(GDrawFence fence);
// called by gdraw_res_alloc_begin (when gdraw_CanDefragment allows it) if an
// allocation keeps failing although enough bytes are free in total
static void gdraw_defragment_cache(GDrawHandleCache *c, GDrawStats *stats);
// functions we implement
static void gdraw_res_reap(GDrawHandleCache *c, GDrawStats *stats);
#endif
// If GDRAW_MANAGE_MEM is not #defined, this needs to perform the
// actual free using whatever API we're targeting.
//
// If GDRAW_MANAGE_MEM is #defined, the shared code handles the
// memory management part, but you might still need to update
// your state caching.
//
// gdraw_res_free calls this after the fence wait and pool free (both only with
// GDRAW_MANAGE_MEM) and before the handle itself is released.
static void api_free_resource(GDrawHandle *r);
// Physically frees a resource and everything allocated for it.
//   r:     handle to free; must be live, locked, dead, pinned or user-owned
//   stats: receives the free counters
// With GDRAW_MANAGE_MEM this first waits until the GPU is done with the
// resource and returns its memory to the pool it came from. Afterwards the
// platform layer gets to free its side, the statistics are updated, and the
// handle goes back to the handle cache.
static void gdraw_res_free(GDrawHandle *r, GDrawStats *stats)
{
   assert(r->state == GDRAW_HANDLE_STATE_live || r->state == GDRAW_HANDLE_STATE_locked || r->state == GDRAW_HANDLE_STATE_dead ||
          r->state == GDRAW_HANDLE_STATE_pinned || r->state == GDRAW_HANDLE_STATE_user_owned);

#ifdef GDRAW_MANAGE_MEM
   GDRAW_FENCE_FLUSH();

   // the GPU may still be using this memory - wait before handing it back
   wait_on_fence(r->fence);

   if (r->raw_ptr) {
#ifdef GDRAW_MANAGE_MEM_TWOPOOL
      // two pools: return the block to whichever allocator owns its address
      GDrawHandleCache *c = r->cache;
      if (gfxalloc_mem_contains(c->alloc, r->raw_ptr))
         gfxalloc_free(c->alloc, r->raw_ptr);
      else {
         assert(gfxalloc_mem_contains(c->alloc_other, r->raw_ptr));
         gfxalloc_free(c->alloc_other, r->raw_ptr);
      }
#else
      gfxalloc_free(r->cache->alloc, r->raw_ptr);
#endif
   }
#endif

   api_free_resource(r);

   // book-keeping happens before the handle is returned to the cache
   stats->freed_bytes += r->bytes;
   stats->freed_objects += 1;
   stats->nonzero_flags |= GDRAW_STATS_frees;

   gdraw_HandleCacheFree(r);
}
// Evicts the least recently used resource of the given cache.
//   c:     cache to evict from
//   stats: receives the free counters
// Returns false if the cache has no LRU resource to free, true after one was freed.
static rrbool gdraw_res_free_lru(GDrawHandleCache *c, GDrawStats *stats)
{
   GDrawHandle *victim = gdraw_HandleCacheGetLRU(c);
   if (victim == NULL)
      return false;

   // vertex buffers: tell the owner its buffer is going away. the owner may
   // already be gone (if the player was destroyed first), hence the check.
   if (c->is_vertex && victim->owner)
      IggyDiscardVertexBufferCallback(victim->owner, victim);

   // was the victim referenced since the end of the previous frame (= in this frame)?
   // if so, we're thrashing; report it to the user, but only once per frame.
   if (!c->is_thrashing && c->prev_frame_end.value < victim->fence.value) {
      if (c->is_vertex)
         IggyGDrawSendWarning(NULL, "GDraw Thrashing vertex memory");
      else
         IggyGDrawSendWarning(NULL, "GDraw Thrashing texture memory");
      c->is_thrashing = true;
   }

   gdraw_res_free(victim, stats);
   return true;
}
// Throws out every resource of the cache that can be evicted: unlocks all
// handles, then frees LRU resources until there is none left to free.
//   c:     cache to flush
//   stats: receives the free counters
static void gdraw_res_flush(GDrawHandleCache *c, GDrawStats *stats)
{
   rrbool freed_one;

   // pretend we already reported thrashing, so that gdraw_res_free_lru stays quiet
   c->is_thrashing = true;
   gdraw_HandleCacheUnlockAll(c);

   do {
      freed_one = gdraw_res_free_lru(c, stats);
   } while (freed_one);
}
// Common failure path of resource allocation.
//   c:           cache the allocation was attempted in (selects the warning text)
//   t:           handle that was already reserved for the allocation, or NULL;
//                a non-NULL handle is given back via gdraw_HandleCacheAllocateFail
//   failed_type: what ran out; inserted into the warning message
// Always returns NULL, so callers can write "return gdraw_res_alloc_outofmem(...)".
static GDrawHandle *gdraw_res_alloc_outofmem(GDrawHandleCache *c, GDrawHandle *t, char const *failed_type)
{
   if (t != NULL)
      gdraw_HandleCacheAllocateFail(t);

   if (c->is_vertex)
      IggyGDrawSendWarning(NULL, "GDraw Out of static vertex buffer %s", failed_type);
   else
      IggyGDrawSendWarning(NULL, "GDraw Out of texture %s", failed_type);

   return NULL;
}
#ifndef GDRAW_MANAGE_MEM
// Begins allocating a resource of "size" bytes when GDraw does not manage the
// memory itself (GDRAW_MANAGE_MEM not defined).
//   c:     cache to allocate from
//   size:  size of the new resource in bytes
//   stats: receives the counters of any resources evicted on the way
// Returns a handle in the "Alloc" state, or NULL if no handle could be obtained
// (a warning is sent in that case).
// The byte budget is only a soft limit here: exceeding it produces a warning,
// but a handle is still handed out.
static GDrawHandle *gdraw_res_alloc_begin(GDrawHandleCache *c, S32 size, GDrawStats *stats)
{
   GDrawHandle *t;

   if (size > c->total_bytes)
      gdraw_res_alloc_outofmem(c, NULL, "memory (single resource larger than entire pool)");
   else {
      // given how much data we're going to allocate, throw out
      // data until there's "room" (this basically lets us use
      // managed memory and just bound our usage, without actually
      // packing it and being exact)
      while (c->bytes_free < size) {
         if (!gdraw_res_free_lru(c, stats)) {
            gdraw_res_alloc_outofmem(c, NULL, "memory");
            break;
         }
      }
   }

   // now try to allocate a handle
   t = gdraw_HandleCacheAllocateBegin(c);
   if (!t) {
      // it's possible we have no free handles, because all handles
      // are in use without exceeding the max storage above--in that
      // case, just free one texture to give us a free handle (ideally
      // we'd trade off cost of regenerating)
      if (gdraw_res_free_lru(c, stats))
         t = gdraw_HandleCacheAllocateBegin(c);

      // report the failure no matter whether the eviction or the retry failed,
      // so that a NULL result is never silent
      if (!t)
         gdraw_res_alloc_outofmem(c, NULL, "handles");
   }
   return t;
}
#else
// Tells whether a resource holds pointers into one of the GDraw-managed pools.
// That is the case for handles in the live, locked, dead and pinned states.
static rrbool gdraw_res_is_managed(GDrawHandle *r)
{
   if (r->state == GDRAW_HANDLE_STATE_live)
      return true;
   if (r->state == GDRAW_HANDLE_STATE_locked)
      return true;
   if (r->state == GDRAW_HANDLE_STATE_dead)
      return true;
   if (r->state == GDRAW_HANDLE_STATE_pinned)
      return true;
   return false;
}
// "Reaps" dead resources.
// A resource the user asked to free may still be referenced by a pending
// command buffer, so its memory cannot be released right away. Such resources
// are parked on the "dead" list instead, and this function is called
// periodically to free the ones the GPU has finished with.
//   c:     cache whose dead list is processed
//   stats: receives the free counters
static void gdraw_res_reap(GDrawHandleCache *c, GDrawStats *stats)
{
   GDrawHandle *dead_list = &c->state[GDRAW_HANDLE_STATE_dead];

   GDRAW_FENCE_FLUSH();

   // free from the front of the list; stop at the first entry that is still in use
   for (;;) {
      GDrawHandle *front = dead_list->next;
      if (front == dead_list)
         break; // list is empty
      if (is_fence_pending(front->fence))
         break; // still referenced by pending GPU work
      gdraw_res_free(front, stats);
   }
}
// "Kills" a resource: GDraw stops using it (it's dead), but a pending command
// buffer may still reference it, so the memory behind it is only freed once
// that work has been processed.
//   r:     resource to kill; its owner link is cleared
//   stats: receives the counters of whatever the following reap pass frees
static void gdraw_res_kill(GDrawHandle *r, GDrawStats *stats)
{
   GDrawHandleCache *cache;

   // dead list is sorted by fence index - make sure all fence values are current.
   GDRAW_FENCE_FLUSH();

   r->owner = NULL;
   gdraw_HandleCacheInsertDead(r);

   // use the opportunity to free whatever dead resources are no longer in use
   cache = r->cache;
   gdraw_res_reap(cache, stats);
}
// Begins allocating a resource of "size" bytes from the GDraw-managed pool
// (GDRAW_MANAGE_MEM variant).
//   c:     cache to allocate from
//   size:  size of the new resource in bytes; 0 yields a handle without pool memory
//   stats: receives the counters of any resources freed on the way
// Returns a handle with fence.value == 0 and raw_ptr pointing at the new
// memory (NULL if size is 0). Returns NULL after sending a warning if the
// request is larger than the whole pool, no handle is available, or the memory
// cannot be obtained even after evicting LRU resources and defragmenting. On
// the memory failure paths the already reserved handle is passed to
// gdraw_res_alloc_outofmem so it is given back.
static GDrawHandle *gdraw_res_alloc_begin(GDrawHandleCache *c, S32 size, GDrawStats *stats)
{
GDrawHandle *t;
void *ptr = NULL;
// free whatever dead resources the GPU is done with before looking for space
gdraw_res_reap(c, stats); // NB this also does GDRAW_FENCE_FLUSH();
if (size > c->total_bytes)
return gdraw_res_alloc_outofmem(c, NULL, "memory (single resource larger than entire pool)");
// now try to allocate a handle
t = gdraw_HandleCacheAllocateBegin(c);
if (!t) {
// it's possible we have no free handles, because all handles
// are in use without exceeding the max storage above--in that
// case, just free one texture to give us a free handle (ideally
// we'd trade off cost of regenerating)
// (the result of the eviction is not checked; the retry below decides)
gdraw_res_free_lru(c, stats);
t = gdraw_HandleCacheAllocateBegin(c);
if (!t)
return gdraw_res_alloc_outofmem(c, NULL, "handles");
}
// try to allocate first
if (size) {
ptr = gfxalloc_alloc(c->alloc, size);
if (!ptr) {
// doesn't currently fit. try to free some allocations to get space to breathe.
// target: 1.5x the request, but at least GDRAW_MIN_FREE_AMOUNT
S32 want_free = RR_MAX(size + (size / 2), GDRAW_MIN_FREE_AMOUNT);
if (want_free > c->total_bytes)
want_free = size; // okay, *really* big resource, just try to allocate its real size
// always keep freeing textures until want_free bytes are free.
while (c->alloc->actual_bytes_free < want_free) {
if (!gdraw_res_free_lru(c, stats))
return gdraw_res_alloc_outofmem(c, t, "memory");
}
// now, keep trying to allocate and free some more memory when it still doesn't fit
while (!(ptr = gfxalloc_alloc(c->alloc, size))) {
if (c->alloc->actual_bytes_free >= 3 * size || // if we should have enough free bytes to satisfy the request by now
(c->alloc->actual_bytes_free >= size && size * 2 >= c->total_bytes)) // or the resource is very big and the alloc doesn't fit
{
// before we actually consider defragmenting, we want to free all stale resources (not
// referenced in the previous 2 frames). and if that frees up enough memory so we don't have
// to defragment, all the better!
// also, never defragment twice in a frame, just assume we're thrashing when we get in that
// situation and free up as much as possible.
// NOTE(review): the staleness test reads c->handle->fence; c->handle is not
// declared in this section - confirm which handle that refers to.
if (!c->did_defragment &&
c->prev_frame_start.value <= c->handle->fence.value) {
// defragment.
// (also entered via "goto defrag" below when nothing is left to evict)
defrag:
if (gdraw_CanDefragment(c)) { // only try defrag if it has a chance of helping.
gdraw_defragment_cache(c, stats);
c->did_defragment = true;
}
// one final attempt, whether or not a defragmentation pass ran
ptr = gfxalloc_alloc(c->alloc, size);
if (!ptr)
return gdraw_res_alloc_outofmem(c, t, "memory (fragmentation)");
break;
}
}
// keep trying to free some more
if (!gdraw_res_free_lru(c, stats)) {
if (c->alloc->actual_bytes_free >= size) // nothing left to free but we should be good - defrag again, even if it's the second time in a frame
goto defrag;
return gdraw_res_alloc_outofmem(c, t, "memory");
}
}
}
}
t->fence.value = 0; // hasn't been used yet
t->raw_ptr = ptr;
return t;
}
#endif