1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static void tik()
{
static clock_t start, end;
static int flag = 0;
static double elapsed_time;
if (flag == 0) {
start = clock();
} else {
end = clock();
elapsed_time = (double)(end - start) / (double)CLOCKS_PER_SEC;
printf("%f,", elapsed_time);
}
flag = 1 - flag;
}

#define A_SIZE 2048

static const int ITERS = 500 * 1000;

unsigned int test_branch(int const* A, int const n, int const iters)
{
unsigned int res = 0;

for (int i = 0; i < iters; ++i) {
for (int j = 0; j < n; ++j) {
if(A[j]) {
res = j;
}
}
}

return res;
}

unsigned int test_branch_0(int const* A, int const n, int const iters)
{
unsigned int res = 0;

for (int i = 0; i < iters; ++i) {
for (int j = 0; j < n; ++j) {
if(__builtin_expect(A[j], 0)) {
res = j;
}
}
}

return res;
}

unsigned int test_branch_1(int const* A, int const n, int const iters)
{
unsigned int res = 0;

for (int i = 0; i < iters; ++i) {
for (int j = 0; j < n; ++j) {
if(__builtin_expect(A[j], 1)) {
res = j;
}
}
}

return res;
}

// stdout format
// "targeting probability, real percentage, plain branch, expect 0, expect 1"
int main(int argc, char *argv[])
{
float prob = (float)atoi(argv[1]) * 10 / 100;
static int A[A_SIZE];
srand(time(NULL));

printf("%.2f,", prob);
for (int j = 0; j < A_SIZE; ++j) {
float const rand1 = ((float)rand()) * (float)RAND_MAX;
float const rand2 = (float)rand();
float const rand_val = rand1 + rand2;
float const rand_01 = rand_val / (float)RAND_MAX / (float)RAND_MAX;
A[j] = rand_01 < prob;
}
int t = 0;
for (int j = 0; j < A_SIZE; ++j) {
if (A[j]) {
t++;
}
}
printf("%.2f,", (float)t/A_SIZE);

// so that it actually computes
volatile int sum = 0;

// warm up cache
sum = test_branch(A, A_SIZE, ITERS);
sum = test_branch_0(A, A_SIZE, ITERS);
sum = test_branch_1(A, A_SIZE, ITERS);

tik();
sum = test_branch(A, A_SIZE, ITERS);
tik();

tik();
sum = test_branch_0(A, A_SIZE, ITERS);
tik();

tik();
sum = test_branch_1(A, A_SIZE, ITERS);
tik();

puts("");

return 0;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# using clang
$ clang -O branch.c && for i in {0..10}; do ; ./a.out $i ; done;
0.00,0.00,0.562807,0.470665,0.425100,
0.10,0.11,0.555139,0.519742,0.726889,
0.20,0.21,0.557386,0.562136,0.730022,
0.30,0.31,0.552457,0.656972,0.736262,
0.40,0.40,0.565915,0.756922,0.728128,
0.50,0.50,0.555038,0.828692,0.732193,
0.60,0.60,0.551107,0.951515,0.726332,
0.70,0.70,0.553245,1.042037,0.714125,
0.80,0.81,0.550671,1.175527,0.722460,
0.90,0.91,0.551631,1.365827,0.713159,
1.00,1.00,0.552825,1.518310,0.489986,
$ clang --version
clang version 6.0.0-svn321328-1~exp1 (trunk)
$ # using gcc
$ gcc -O branch.c && for i in {0..10}; do ; ./a.out $i ; done;
0.00,0.00,0.550638,0.552257,0.551098,
0.10,0.11,0.540312,0.559934,0.553849,
0.20,0.20,0.541764,0.557476,0.550883,
0.30,0.29,0.539910,0.552430,0.560583,
0.40,0.40,0.545194,0.553256,0.548485,
0.50,0.50,0.542356,0.589932,0.612541,
0.60,0.61,0.540623,0.554823,0.549013,
0.70,0.71,0.539795,0.562244,0.547882,
0.80,0.79,0.547068,0.566020,0.556831,
0.90,0.90,0.553231,0.558796,0.554989,
1.00,1.00,0.543937,0.563535,0.559513,
$ gcc --version
gcc-6 (Ubuntu/Linaro 6.3.0-18ubuntu2~16.04) 6.3.0 20170519

I find the result from clang quite puzzling, while the one from gcc much understandable. Possibly, the results would be more insightful in the future. Anyway, this code snippet is out so that you can test it on your box using the compiler you prefer.