Implementing the ping-pong program in Erlang, C++ and Java.

§Erlang

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
-module(pingpong).
-mode(compile).
-compile(export_all).
-compile(nowarn_export_all).

%% Notify process ID by sending it the one-element tuple {self()}, i.e. the
%% pid of the caller. Msg itself is never transmitted: it is only bound to `_`
%% so the variable counts as used, and it is what the commented-out trace line
%% prints when re-enabled. Always returns ok.
send_msg(ID, Msg) ->
    % io:format("~p~n", [Msg]),
    _ = Msg,
    ID ! {self()},
    ok.

%% Responder loop. Waits for a one-element tuple carrying the sender's pid,
%% answers it through send_msg/2, and repeats until the counter reaches 0,
%% at which point it returns ok.
pong(0) ->
    ok;
pong(N) ->
    receive
        {Sender} ->
            send_msg(Sender, "Pong"),
            pong(N - 1)
    end.

%% Initiator loop. Sends one notification to PongID, then blocks until any
%% message arrives (its content is ignored) before starting the next round.
%% Returns ok once N rounds have been completed.
ping(_, 0) ->
    ok;
ping(PongID, N) ->
    send_msg(PongID, "Ping"),
    receive
        _Reply ->
            ping(PongID, N - 1)
    end.

%% Benchmark entry point (the argument is ignored). Spawns the pong process,
%% runs the ping loop in the calling process, and prints how long the ping
%% loop took. The spawn happens before the first timestamp, so it is not
%% part of the measured interval.
main(_) ->
    Rounds = 20000,
    Responder = spawn(?MODULE, pong, [Rounds]),
    Started = erlang:monotonic_time(),
    ping(Responder, Rounds),
    Elapsed = erlang:monotonic_time() - Started,
    % Convert to whole microseconds first, then report as fractional milliseconds.
    Micros = erlang:convert_time_unit(Elapsed, native, microsecond),
    io:format("Elapsed time: ~p ms~n", [Micros / 1000]),
    ok.

Run on a single CPU, or on two logical CPUs belonging to the same core. (`+S` has to be specified explicitly; otherwise the Erlang VM creates too many scheduler threads.)

1
2
3
4
$ erlc pingpong.erl && taskset -c 0   erl +S 1 -eval "pingpong:main(1), halt()." -noshell
Elapsed time: 21.66 ms
$ erlc pingpong.erl && taskset -c 0,6 erl +S 2 -eval "pingpong:main(1), halt()." -noshell
Elapsed time: 21.071 ms

§C++

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#include <iostream>
#include <thread>
#include <semaphore>
#include <chrono>

#include <sys/resource.h>

using namespace std;

class MBox {
binary_semaphore s{0};
public:
void put(string msg) {
// cout << msg << endl;
s.release();
}

void get() {
s.acquire();
}
};

// Mailbox the main (ping) thread waits on; the pong thread posts to it.
MBox ping_mbox;
// Mailbox the pong thread waits on; the main (ping) thread posts to it.
MBox pong_mbox;

// Number of ping/pong round trips; used as the loop bound on both sides.
constexpr int max_count = 20000;

// Body of the responder thread: for each of the max_count rounds, wait for a
// "ping" on pong_mbox and answer by posting to ping_mbox.
void pong_runnable() {
    int remaining = max_count;
    while (remaining-- > 0) {
        pong_mbox.get();
        ping_mbox.put("Pong");
    }
}

// Benchmark driver: starts the pong thread, plays the ping side on the main
// thread for max_count rounds, and prints the elapsed wall-clock time of the
// ping loop in milliseconds.
int main()
{
    // The worker is constructed before the first timestamp is taken.
    thread pong_worker(pong_runnable);

    const auto t_begin = chrono::steady_clock::now();

    int round = 0;
    while (round < max_count) {
        pong_mbox.put("Ping");
        ping_mbox.get();
        ++round;
    }

    const auto t_end = chrono::steady_clock::now();
    // The last get() above blocks until the worker's last put(), so the
    // join is done after the clock is stopped, outside the measured window.
    pong_worker.join();

    // Truncate to whole microseconds first, then report fractional milliseconds.
    const auto elapsed_us = chrono::duration_cast<chrono::microseconds>(t_end - t_begin);
    const auto time_ms = elapsed_us.count() / 1000.0;

    cout << "Elapsed time: " << time_ms << " ms" << endl;

    // Optional diagnostics: uncomment to print the context-switch counters.
    // struct rusage ru;
    // if (getrusage(RUSAGE_SELF, &ru)) {
    // perror("getrusage");
    // } else {
    // printf(" voluntary switches = %ld\n", ru.ru_nvcsw);
    // printf(" involuntary switches = %ld\n", ru.ru_nivcsw);
    // }

    return 0;
}
1
2
3
4
$ clang++ -O -std=c++20 pingpong.cpp && taskset -c 0   ./a.out
Elapsed time: 69.05 ms
$ clang++ -O -std=c++20 pingpong.cpp && taskset -c 0,6 ./a.out
Elapsed time: 6.558 ms

§Java

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import java.util.concurrent.Semaphore;

/**
 * Ping-pong micro-benchmark: a "ping" side and a "pong" side hand control
 * back and forth {@code max_count} times through two semaphore-backed
 * mailboxes, and the time taken by the ping side is printed.
 */
class pingpong {
    // Selects virtual threads (true) or platform threads (false) for the run.
    final static boolean use_vthread = false;
    // final static boolean use_vthread = true;

    // Number of round trips performed by each call to run_ping()/launch_pong().
    final static int max_count = 20000;

    /**
     * Payload-free mailbox: put() releases one permit, get() waits for one.
     * The semaphore starts with zero permits, so get() blocks until a put().
     */
    static class MBox {
        private final Semaphore permits = new Semaphore(0);

        /** Signals the receiver; {@code msg} is only used by the commented-out trace line. */
        void put(String msg) {
            // System.out.println(msg);
            permits.release();
        }

        /**
         * Waits for a permit. If the wait is interrupted, the stack trace is
         * printed and the method returns without having acquired a permit.
         */
        void get() {
            try {
                permits.acquire();
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }

    // The ping side waits on ping_mbox; the pong side waits on pong_mbox.
    static final MBox ping_mbox = new MBox();
    static final MBox pong_mbox = new MBox();

    /** Responder loop: wait for a ping, answer with a pong, max_count times. */
    private static void pongLoop() {
        int remaining = max_count;
        while (remaining-- > 0) {
            pong_mbox.get();
            ping_mbox.put("Pong");
        }
    }

    /** Initiator loop: send a ping, wait for the pong, max_count times. */
    private static void pingLoop() {
        int remaining = max_count;
        while (remaining-- > 0) {
            pong_mbox.put("Ping");
            ping_mbox.get();
        }
    }

    /** Starts the responder on a new thread and returns without waiting for it. */
    static void launch_pong() {
        Runnable responder = pingpong::pongLoop;

        if (use_vthread) {
            Thread.startVirtualThread(responder);
        } else {
            new Thread(responder).start();
        }
    }

    /**
     * Runs the initiator loop to completion: on a joined virtual thread when
     * use_vthread is set, otherwise directly on the calling thread.
     */
    static void run_ping() throws InterruptedException {
        Runnable initiator = pingpong::pingLoop;

        if (use_vthread) {
            Thread.startVirtualThread(initiator).join();
        } else {
            initiator.run();
        }
    }

    /**
     * Performs ten complete, untimed ping-pong runs before the measured one
     * (presumably to let the JIT compile the hot paths).
     */
    static void warmup() throws InterruptedException {
        for (int round = 0; round < 10; round++) {
            launch_pong();
            run_ping();
        }
    }

    public static void main(String[] args) throws InterruptedException {
        warmup();
        // The responder is launched before the clock starts.
        launch_pong();

        final long startNanos = System.nanoTime();
        run_ping();
        final long elapsedNanos = System.nanoTime() - startNanos;

        System.out.printf("Elapsed time: %.3f ms\n", elapsedNanos / 1_000_000.0);
    }
}

Using OS threads:

1
2
3
4
$ taskset -c 0   java --enable-preview --source 19 pingpong.java
Elapsed time: 64.085 ms
$ taskset -c 0,6 java --enable-preview --source 19 pingpong.java
Elapsed time: 114.891 ms

Using virtual (green) threads:

1
2
3
4
$ taskset -c 0   java --enable-preview --source 19 pingpong.java
Elapsed time: 18.058 ms
$ taskset -c 0,6 java --enable-preview --source 19 pingpong.java
Elapsed time: 66.210 ms

§Conclusion

Grouping all results in a table:

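| Lang/Runtime   | Elapsed time on single CPU (ms) | Elapsed time on 2 CPUs, same core (ms) |
|----------------|---------------------------------|----------------------------------------|
| Erlang         | 21.66                           | 21.071                                 |
| C++            | 69.05                           | 6.558                                  |
| Java           | 64.085                          | 114.891                                |
| Java (vthread) | 18.058                          | 66.210                                 |
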
  • C++ and Java (both using OS threads) have on-par peak performance on a single CPU: 69ms vs 64ms
  • The JVM has poor support for SMP: using more CPUs has a detrimental effect, 64ms -> 114ms (OS threads) and 18ms -> 66ms (virtual threads)
  • Java virtual threads and Erlang show that user-mode context switching is much cheaper: 18ms/21ms vs 64ms/69ms
  • C++ thread context switching has better scalability: 69ms -> 6ms; using getrusage, one can see that the number of context switches drops significantly.

§ENV

1
2
3
4
5
6
7
8
Erlang      : 25
Clang : 14
Java : 19
Linux : 5.10
OS : Debian 11.6
#CPU : 12 (6 cores)
CPU : Intel(R) Core(TM) i7-9850H CPU @ 2.60GHz
Turbo boost : off