The Cost of Branching in C

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static void tik()
{
    static clock_t start, end;
    static int flag = 0;
    static double elapsed_time;
    if (flag == 0) {
        start = clock();
    } else {
        end = clock();
        elapsed_time = (double)(end - start) / (double)CLOCKS_PER_SEC;
        printf("%f,", elapsed_time);
    }
    flag = 1 - flag;
}

#define A_SIZE 2048

static const int   ITERS     = 500 * 1000;

unsigned int test_branch(int const* A, int const n, int const iters)
{
    unsigned int res = 0;

    for (int i = 0; i < iters; ++i) {
        for (int j = 0; j < n; ++j) {
            if(A[j]) {
                res = j;
            }
        }
    }

    return res;
}

unsigned int test_branch_0(int const* A, int const n, int const iters)
{
    unsigned int res = 0;

    for (int i = 0; i < iters; ++i) {
        for (int j = 0; j < n; ++j) {
            if(__builtin_expect(A[j], 0)) {
                res = j;
            }
        }
    }

    return res;
}

unsigned int test_branch_1(int const* A, int const n, int const iters)
{
    unsigned int res = 0;

    for (int i = 0; i < iters; ++i) {
        for (int j = 0; j < n; ++j) {
            if(__builtin_expect(A[j], 1)) {
                res = j;
            }
        }
    }

    return res;
}

// stdout format
// "targeting probability, real percentage, plain branch, expect 0, expect 1"
int main(int argc, char *argv[])
{
    float prob = (float)atoi(argv[1]) * 10 / 100;
    static int A[A_SIZE];
    srand(time(NULL));

    printf("%.2f,", prob);
    for (int j = 0; j < A_SIZE; ++j) {
        float const rand1 = ((float)rand()) * (float)RAND_MAX;
        float const rand2 = (float)rand();
        float const rand_val = rand1 + rand2;
        float const rand_01 = rand_val / (float)RAND_MAX / (float)RAND_MAX;
        A[j] = rand_01 < prob;
    }
    int t = 0;
    for (int j = 0; j < A_SIZE; ++j) {
        if (A[j]) {
            t++;
        }
    }
    printf("%.2f,", (float)t/A_SIZE);

    // so that it actually computes
    volatile int sum = 0;

    // warm up cache
    sum = test_branch(A, A_SIZE, ITERS);
    sum = test_branch_0(A, A_SIZE, ITERS);
    sum = test_branch_1(A, A_SIZE, ITERS);

    tik();
    sum = test_branch(A, A_SIZE, ITERS);
    tik();

    tik();
    sum = test_branch_0(A, A_SIZE, ITERS);
    tik();

    tik();
    sum = test_branch_1(A, A_SIZE, ITERS);
    tik();

    puts("");

    return 0;
}

# using clang
$ clang -O branch.c && for i in {0..10}; do ; ./a.out $i ; done;
0.00,0.00,0.562807,0.470665,0.425100,
0.10,0.11,0.555139,0.519742,0.726889,
0.20,0.21,0.557386,0.562136,0.730022,
0.30,0.31,0.552457,0.656972,0.736262,
0.40,0.40,0.565915,0.756922,0.728128,
0.50,0.50,0.555038,0.828692,0.732193,
0.60,0.60,0.551107,0.951515,0.726332,
0.70,0.70,0.553245,1.042037,0.714125,
0.80,0.81,0.550671,1.175527,0.722460,
0.90,0.91,0.551631,1.365827,0.713159,
1.00,1.00,0.552825,1.518310,0.489986,
$ clang --version
clang version 6.0.0-svn321328-1~exp1 (trunk)
$ # using gcc
$ gcc -O branch.c && for i in {0..10}; do ; ./a.out $i ; done;
0.00,0.00,0.550638,0.552257,0.551098,
0.10,0.11,0.540312,0.559934,0.553849,
0.20,0.20,0.541764,0.557476,0.550883,
0.30,0.29,0.539910,0.552430,0.560583,
0.40,0.40,0.545194,0.553256,0.548485,
0.50,0.50,0.542356,0.589932,0.612541,
0.60,0.61,0.540623,0.554823,0.549013,
0.70,0.71,0.539795,0.562244,0.547882,
0.80,0.79,0.547068,0.566020,0.556831,
0.90,0.90,0.553231,0.558796,0.554989,
1.00,1.00,0.543937,0.563535,0.559513,
$ gcc --version
gcc-6 (Ubuntu/Linaro 6.3.0-18ubuntu2~16.04) 6.3.0 20170519

I find the result from clang quite puzzling, while the one from gcc much understandable. Possibly, the results would be more insightful in the future. Anyway, this code snippet is out so that you can test it on your box using the compiler you prefer.