In this note I confirm, on my Linux box, the finding from http://david-grs.github.io/tls_performance_overhead_cost_linux/.

main.c

#include "tik.h"

/* Defined in fg.c (built as libfg.so); neither function takes arguments. */
void f(void);
void g(void);

/*
 * Benchmark driver: calls f() and then g() one billion times each, with
 * tik() calls placed immediately before and after each loop.
 *
 * NOTE(review): tik() comes from tik.h; presumably each pair of calls
 * brackets a timed region and reports the elapsed time -- confirm against tik.
 *
 * Returns 0 unconditionally.
 */
int main(int argc, char *argv[])
{
    /* The command line is not used by this benchmark. */
    (void)argc;
    (void)argv;

    /* 10^9 iterations; the product still fits in int before widening. */
    size_t size = 1000*1000*1000;

    /* Loop 1: counter with static storage duration. */
    tik();
    for (size_t i = 0; i < size; ++i) {
        f();
    }
    tik();

    /* Loop 2: thread-local counter. */
    tik();
    for (size_t i = 0; i < size; ++i) {
        g();
    }
    tik();

    return 0;
}

fg.c

/*
 * Baseline for the benchmark: increments a counter that has static storage
 * duration. The counter is visible only inside f() and keeps its value
 * between calls.
 */
void f(void)
{
    static int x;   /* implicitly zero-initialized */
    x++;
}

/*
 * Thread-local counterpart of f(): increments a counter declared
 * _Thread_local, so every thread that calls g() gets its own instance.
 * The counter is visible only inside g() and keeps its value between
 * calls made on the same thread.
 */
void g(void)
{
    static _Thread_local int y;   /* implicitly zero-initialized, per thread */
    y++;
}

On my box, accessing thread-local storage is ~2x as expensive as accessing an ordinary static variable. (Compiling requires tik.)

$ clang -shared -O -fPIC -o libfg.so fg.c
$ clang -O main.c -lfg -L .
$ LD_LIBRARY_PATH=. ./a.out
1.9
3.9

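Inspecting the generated assembly shows why: `f` increments its variable with a single instruction, whereas `g` first has to call `__tls_get_addr` to locate the thread-local variable:
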
# f: increments the static counter f.x in place; no address computation
# beyond RIP-relative addressing and no function call.
f:                                      # @f
incl f.x(%rip)                          # x++ performed directly on memory
retq

# g: increments the thread-local counter g.y. Its address is obtained at
# run time by calling __tls_get_addr before the increment can be done.
g: # @g
pushq %rax                              # NOTE(review): presumably only realigns the stack for the call -- confirm
leaq g.y@TLSLD(%rip), %rdi              # argument for __tls_get_addr (TLSLD relocation)
callq __tls_get_addr@PLT                # call through the PLT; the result in %rax is used as the base below
incl g.y@DTPOFF(%rax)                   # y++ at g.y's DTPOFF offset from that base
popq %rax                               # undoes the pushq above
retq

Reference

http://cs-fundamentals.com/c-programming/static-and-dynamic-linking-in-c.php#static-and-dynamic-linking

https://docs.oracle.com/cd/E19683-01/817-3677/chapter8-1/index.html