Quick ECS and IPC

An Entity Component System (ECS) written in C23 can compile down to highly optimized SIMD instructions over Struct of Arrays (SOA) entities while staying maintainable with this X-Macro approach:

// X-macro listing every entity field as a (type, name) pair.
// Pass a two-argument macro; it is expanded once per field, in order.
#define ent_d(apply)  \
    apply(float, x)   \
    apply(float, y)   \
    apply(float, z)   \
    apply(float, vx)  \
    apply(float, vy)  \
    apply(float, vz)

constexpr auto g_ents_cap = 8192;

struct
{
    #define exec(type, field) type field[g_ents_cap];
    ent_d(exec)
    #undef exec
}
g_ents = {};

auto g_ents_size = 0;

// Plain record for a single entity: one scalar member per field in ent_d.
typedef struct
{
    #define record_member(type, field) type field;
    ent_d(record_member)
    #undef record_member
} ent_t;

void push(const ent_t ent)
{
    #define exec(type, field) g_ents.field[g_ents_size] = ent.field;
    ent_d(exec)
    #undef exec
    g_ents_size++;
}

// Advances every slot by one step: adds vx/vy/vz into x/y/z.
// Walks all g_ents_cap slots rather than only the first g_ents_size, so the
// trip count is a compile-time constant.
void move()
{
    for(auto slot = 0; slot < g_ents_cap; ++slot)
    {
        g_ents.x[slot] = g_ents.x[slot] + g_ents.vx[slot];
        g_ents.y[slot] = g_ents.y[slot] + g_ents.vy[slot];
        g_ents.z[slot] = g_ents.z[slot] + g_ents.vz[slot];
    }
}

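The move() function iterates over the compile-time constant capacity g_ents_cap rather than g_ents_size, so the trip count is a known multiple of the vector width and the compiler can drop the scalar tail loop after the SIMD loop: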

00000000000011a0 <move>:
    11a0:   lea     0x2eb9(%rip),%rax
    11a7:   lea     0x8000(%rax),%rdx
    11ae:   xchg    %ax,%ax
    11b0:   vmovaps (%rax),%ymm0
    11b4:   add     $0x20,%rax
    11b8:   vaddps  0x17fe0(%rax),%ymm0,%ymm0
    11c0:   vmovaps %ymm0,-0x20(%rax)
    11c5:   vmovaps 0x7fe0(%rax),%ymm0
    11cd:   vaddps  0x1ffe0(%rax),%ymm0,%ymm0
    11d5:   vmovaps %ymm0,0x7fe0(%rax)
    11dd:   vmovaps 0xffe0(%rax),%ymm0
    11e5:   vaddps  0x27fe0(%rax),%ymm0,%ymm0
    11ed:   vmovaps %ymm0,0xffe0(%rax)
    11f5:   cmp     %rdx,%rax
    11f8:   jne     11b0 <move+0x10>
    11fa:   vzeroupper
    11fd:   ret

In practice we’d use g_ents_size, but the generated assembly is nicer for the format of this post.

A Tale of Instructions Per Cycle (IPC):

The SOA move has an IPC of 1.5:

 Performance counter stats for './a.out 10000':

                 0      context-switches:u #    0.0 cs/sec  cs_per_second
                 0      cpu-migrations:u   #    0.0 migrations/sec  migrations_per_second
               633      page-faults:u      # 5215.8 faults/sec  page_faults_per_second
            121.36 msec task-clock:u       #    0.9 CPUs  CPUs_utilized
             8,793      branch-misses:u    #    0.0 %  branch_miss_rate         (49.63%)
        41,243,880      branches:u         #  339.8 M/sec  branch_frequency     (50.45%)
       401,636,074      cpu-cycles:u       #    3.3 GHz  cycles_frequency       (50.61%)
       607,399,083      instructions:u     #    1.5 instructions  insn_per_cycle  (50.37%)

    0.122078331 seconds time elapsed

Swapping the layout of g_ents from SOA to Array of Structs (AOS) raises IPC to 2.7:

 Performance counter stats for './a.out 10000':

                 0      context-switches:u #    0.0 cs/sec  cs_per_second
                 0      cpu-migrations:u   #    0.0 migrations/sec  migrations_per_second
               825      page-faults:u      # 1313.2 faults/sec  page_faults_per_second
            628.23 msec task-clock:u       #    1.0 CPUs  CPUs_utilized
            10,081      branch-misses:u    #    0.0 %  branch_miss_rate         (50.23%)
       163,270,204      branches:u         #  259.9 M/sec  branch_frequency     (50.13%)
     1,732,942,939      cpu-cycles:u       #    2.8 GHz  cycles_frequency       (49.99%)
     4,617,009,107      instructions:u     #    2.7 instructions  insn_per_cycle  (49.77%)

    0.631062773 seconds time elapsed

The hardware is certainly saturated with more work, _but_ the cycle count and wall-clock time are roughly 4–5x higher.

Inspecting the AOS move:

00000000000012e0 <move>:
    12e0:   lea       0x2d59(%rip),%rax
    12e7:   lea       0x180000(%rax),%rdx
    12ee:   xchg      %ax,%ax
    12f0:   vmovq     0x20(%rax),%xmm4
    12f5:   vmovq     0x8(%rax),%xmm2
    12fa:   add       $0x30,%rax
    12fe:   vmovq     -0x18(%rax),%xmm0
    1303:   vmovq     -0x20(%rax),%xmm3
    1308:   vinsertps $0x10,%xmm0,%xmm2,%xmm1
    130e:   vinsertps $0x40,%xmm3,%xmm4,%xmm5
    1314:   vmovlhps  %xmm4,%xmm0,%xmm0
    1318:   vmovq     -0x8(%rax),%xmm4
    131d:   vpermilps $0x99,%xmm0,%xmm0
    1323:   vmovlhps  %xmm3,%xmm2,%xmm2
    1327:   vmovq     -0x30(%rax),%xmm3
    132c:   vaddps    %xmm5,%xmm1,%xmm1
    1330:   vaddps    %xmm4,%xmm0,%xmm0
    1334:   vpermilps $0x99,%xmm2,%xmm2
    133a:   vaddps    %xmm3,%xmm2,%xmm2
    133e:   vmovss    %xmm1,-0x28(%rax)
    1343:   vmovshdup %xmm0,%xmm6
    1347:   vmovlhps  %xmm0,%xmm1,%xmm1
    134b:   vpermilps $0x99,%xmm1,%xmm1
    1351:   vmovlps   %xmm2,-0x30(%rax)
    1356:   vmovlps   %xmm1,-0x18(%rax)
    135b:   vmovss    %xmm6,-0x10(%rax)
    1360:   cmp       %rax,%rdx
    1363:   jne       12f0 <move+0x10>
    1365:   ret

We see more partial loads, more scalar ops, more register shuffles, and more partial writes. More instructions might be in flight per cycle, but IPC does not equal performance when the same data can be streamed and processed with far fewer instructions overall.