This post is pretty good read on cache effect. Even though C# is used in that article, the author claims that language choice has little effect on the conclusion. Since I don’t have access to C# compilers on my Linux box, I converted the examples to C to reproduce it. Surprisingly, the result is not 100% consistent with his observation.

There are seven examples in the original post, but ex1 is just a special case of ex2, so I would only discuss ex[2..7].

§ex2

1
2
3
4
5
6
7
8
9
10
11
12
$ clang -O test.c -lpthread ; ./a.out
== ex2 ==
step is 1: Elapsed time: 0.044202 seconds
step is 2: Elapsed time: 0.041431 seconds
step is 4: Elapsed time: 0.040507 seconds
step is 8: Elapsed time: 0.040617 seconds
step is 16: Elapsed time: 0.041350 seconds
step is 32: Elapsed time: 0.026930 seconds
step is 64: Elapsed time: 0.016239 seconds
step is 128: Elapsed time: 0.006211 seconds
step is 256: Elapsed time: 0.003147 seconds
step is 512: Elapsed time: 0.001748 seconds

For step in [1..16], the access is within the cache line, so exec time is not much different. Starting from 32, the number of cache lines used drops by half compared with next step size, which accounts for the diminishing of exec time.

§ex3

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
$ clang -O test.c -lpthread ; ./a.out
== ex3 ==
size is 1K: Elapsed time: 0.225452 seconds
size is 2K: Elapsed time: 0.224542 seconds
size is 4K: Elapsed time: 0.231378 seconds
size is 8K: Elapsed time: 0.219132 seconds
size is 16K: Elapsed time: 0.222465 seconds
size is 32K: Elapsed time: 0.219828 seconds
size is 64K: Elapsed time: 0.219956 seconds
size is 128K: Elapsed time: 0.218648 seconds
size is 256K: Elapsed time: 0.252090 seconds
size is 512K: Elapsed time: 0.228325 seconds
size is 1024K: Elapsed time: 0.272825 seconds
size is 2048K: Elapsed time: 0.473781 seconds
size is 4096K: Elapsed time: 0.547629 seconds
size is 8192K: Elapsed time: 0.605537 seconds
size is 16384K: Elapsed time: 0.625167 seconds
size is 32768K: Elapsed time: 0.663107 seconds

The caches on my box have the following size, and I have to say the pattern is not very obvious to me.

1
2
3
L1d: 32K
L2: 256K
L3: 4096K

§ex4

EDIT: Need to use (*3) instead of ++ due to heavy optimization of clang.

1
2
3
4
5
6
$ clang test.c -lpthread; ./a.out
== ex4 ==
Elapsed time: 0.360638 seconds
Elapsed time: 0.176043 seconds
Elapsed time: 0.175649 seconds
Elapsed time: 0.351160 seconds

I could see the data dependency effect only if I don’t turn on optimization, as shown above.

Once the optimization is turned on, the result is totally different:

1
2
3
4
5
6
$ clang -O test.c -lpthread; ./a.out
== ex4 ==
Elapsed time: 0.003489 seconds
Elapsed time: 0.011261 seconds
Elapsed time: 0.008456 seconds
Elapsed time: 0.006193 seconds

The second case is ~3x slower, which I believe is a [bug]https://llvm.org/bugs/show_bug.cgi?id=28128() in auto-vectorizatoin of clang. The third and forth case are ~2x slower, because the consecutive multiplication in the first case could be collapsed into one, while infeasible for latter cases.

§ex5

Similar result on my box, even though only using 4M array.

Cache associativity could be obtained using techniques shown at https://ferdinand-muetsch.de/ubuntu-cache-information-bash-script/ on Linux systems.

§ex6

Similar results:

1
2
3
4
$ clang -O test.c -lpthread; ./a.out
== ex6 ==
Elapsed time: 0.000223 seconds
Elapsed time: 0.000050 seconds

§ex7

No weird result any more.

1
2
3
4
5
$ clang -O test.c -lpthread; ./a.out
== ex7 ==
Elapsed time: 0.026791 seconds
Elapsed time: 0.025850 seconds
Elapsed time: 0.011034 seconds

§Code

The complete source code:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <stdio.h>
#include <time.h>
#include <pthread.h>

void tik()
{
static clock_t start, end;
static int flag = 0;
static double elapsed_time;
if (flag == 0) {
start = clock();
} else {
end = clock();
elapsed_time = (double)(end - start) / (double)CLOCKS_PER_SEC;
printf("Elapsed time: %f seconds\n", elapsed_time);
}
flag = 1 - flag;
}

typedef unsigned int uint;

static const uint arr_len = 64 * 1024 * 1024;
static uint i_max = 64 * 1024 * 1024;
static uint arr[arr_len];

static uint s_counter[1024];

static uint A, B, C, D, E, F, G;

void ex2()
{
puts("== ex2 ==");
// warm up cache
for (uint i = 0; i < arr_len; i += 1) {
arr[i] *= 3;
}
uint step_max = 1024;
for (uint step = 1; step < step_max; step *= 2) {
printf("step is %u: ", step);
tik();
for (uint i = 0; i < arr_len; i += step) {
arr[i] *= 3;
}
tik();
}
}

void ex3()
{
puts("== ex3 ==");
uint llc_size = 64 * 1024 * 1024;
for (uint size = 1024; size < llc_size; size *= 2) {
printf("size is %uK: ", size/1024);
tik();
for (uint i = 0; i < i_max; ++i) {
arr[ (i*16) % size ] *= 3;
}
tik();
}
}

void ex4()
{
puts("== ex4 ==");
static uint acc[2] = {1, 1};
tik();
for (uint i = 0; i < i_max; ++i) {
acc[0] *= 3;
acc[0] *= 3;
}
tik();
acc[0] = acc[1] = 1;
tik();
for (uint i = 0; i < i_max; ++i) {
acc[0] *= 3;
acc[1] *= 3;
}
tik();
acc[0] = acc[1] = 1;
tik();
#pragma clang loop vectorize(disable)
for (uint i = 0; i < i_max; ++i) {
acc[0] *= 3;
acc[1] *= 3;
}
tik();
acc[0] = acc[1] = 1;
tik();
for (uint i = 0; i < i_max; ++i) {
acc[0] *= 3;
}
for (uint i = 0; i < i_max; ++i) {
acc[1] *= 3;
}
tik();
}

void ex5()
{
static const uint arr_len = 4*1024*1024;
static unsigned char arr[arr_len];
for (uint K = 1; K <= 512; ++K) {
tik();
uint p = 0;
for (uint i = 0; i < i_max; ++i) {
arr[p]++;
p += K;
if (p >= arr_len) { p = 0; }
}
tik();
}
}

static void inc_counter(uint p)
{
for (uint i = 0; i < i_max; ++i) {
s_counter[p] += 3;
}
}

void ex6()
{
puts("== ex6 ==");
static const uintptr_t n_threads = 4;
pthread_t tasks[n_threads];
tik();
for (uintptr_t i = 0; i < n_threads; ++i) {
pthread_create(&tasks[i], NULL, (void*)&inc_counter, (void*)i);
}
for (uintptr_t i = 0; i < n_threads; ++i) {
pthread_join(tasks[i], NULL);
}
tik();
tik();
for (uintptr_t i = 0; i < n_threads; ++i) {
pthread_create(&tasks[i], NULL, (void*)&inc_counter, (void*)(16+i*16));
}
for (uintptr_t i = 0; i < n_threads; ++i) {
pthread_join(tasks[i], NULL);
}
tik();
}

void ex7()
{
puts("== ex7 ==");
A = B = C = D = E = F = G = 1;

tik();
for (uint i = 0; i < i_max; ++i) {
A *= 3;
B *= 3;
C *= 3;
D *= 3;
}
tik();
tik();
for (uint i = 0; i < i_max; ++i) {
A *= 3;
C *= 3;
E *= 3;
G *= 3;
}
tik();
tik();
for (uint i = 0; i < i_max; ++i) {
A *= 3;
C *= 3;
}
tik();
}

int main (int argc, char** argv) {
ex2();
ex3();
ex4();
ex5();
ex6();
ex7();

return 0;
}