I came across this commit while working with OpenJDK. The whole thing was beyond me at first sight, so here’s a simplified version that captures the idiom.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#include <cstdio>
#include <iostream>
#include <type_traits>

using namespace std;

// Abstract root of the hierarchy: declares the virtual operation f().
// NOTE(review): f() is declared before the destructor; the slow-path listing
// jumps through the first vtable entry, so reordering these virtuals would
// presumably change that listing. Keep the order.
class A {
public:
    virtual void f() = 0;

    virtual ~A() = default;
};

// Concrete implementation of A::f.
class B : public A {
public:
    // Prints "B" followed by a newline. Marked `override` so that any drift
    // from A::f's signature is a compile error rather than a new virtual.
    void f() override {
        puts("B");
    }
};

// Adds nothing of its own: f() is inherited from B, not redeclared here,
// so the expression &C::f has type void (B::*)() rather than void (C::*)().
class C : public B {
};

// Calls f() on an object, using the type of the member pointer &CL::f to
// decide at compile time whether virtual dispatch can be skipped.
class O {
#if 0
    // Reference variant: the member-pointer arguments are ignored and f() is
    // always invoked through the object pointer, i.e. as a virtual call.
    template<typename Declarer, typename Root, typename Obj>
    void
    iterate(void (Declarer::*)(), void (Root::*)(), Obj* obj) {
        obj->f();
    }
#else
    // Selected when the two member pointers belong to different classes,
    // i.e. the class declaring CL's f() is not the root interface. The
    // qualified call names Declarer::f explicitly, which suppresses virtual
    // dispatch and gives the optimizer a direct call target.
    template<typename Declarer, typename Root, typename Obj>
    typename std::enable_if<!std::is_same<Declarer, Root>::value, void>::type
    iterate(void (Declarer::*)(), void (Root::*)(), Obj* obj) {
        obj->Declarer::f();
    }

    // Selected when both member pointers belong to the same class: nothing
    // more specific than the root declaration is known, so the ordinary
    // virtual call is kept.
    template<typename Declarer, typename Root, typename Obj>
    typename std::enable_if<std::is_same<Declarer, Root>::value, void>::type
    iterate(void (Declarer::*)(), void (Root::*)(), Obj* obj) {
        obj->f();
    }
#endif
public:
    // Entry point. The type of &CL::f is "pointer to member of the class
    // that declares f", which need not be CL itself; comparing it with the
    // type of &A::f selects one of the private overloads above.
    template<typename CL>
    void iterate(CL* t) {
        iterate(&CL::f, &A::f, t);
    }
};

// Forward declaration of the function whose generated code is inspected.
void myfunc(C* c);

// Routes c through O::iterate; the body of this function is what the
// slow.s / fast.s listings compare.
void myfunc(C* c) {
    O dispatcher;
    dispatcher.iterate(c);
}

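Using the #if switch, we can control the devirtualization: with #if 0 the enable_if overloads are compiled and the call is devirtualized, while with #if 1 the plain virtual call is kept.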

1
2
$ clang++ -O -S test.cc -o slow.s # with #if 1
$ clang++ -O -S test.cc -o fast.s # with #if 0

Comparing slow.s with fast.s, we can see the key difference lies in the body of myfunc. In the fast case, B::f is inlined into myfunc, leaving a direct tail call to puts, in contrast to the indirect call through the vtable in the slow case.

1
2
3
# slow.s
movq (%rdi), %rax
jmpq *(%rax) # TAILCALL
1
2
3
# fast.s
movl $.L.str, %edi
jmp puts # TAILCALL