Implementing a ping-pong benchmark in Go, C++, and Java.

The earlier 2022 ping-pong post compared Erlang, C++, and Java.

All programs below perform 20,000 ping-pong handshakes. Measurements are medians of five runs.

§Go

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
package main

import (
"fmt"
"sync"
"time"
)

func t1(loop int, ch chan int, wg *sync.WaitGroup) {
defer wg.Done()

ch <- 1

for {
x := <- ch
if x == loop {
break
}
ch <- (x+1)
}
}

func t2(loop int, ch chan int, wg *sync.WaitGroup) {
defer wg.Done()

for {
x := <- ch
ch <- x
if x == loop {
break
}
}

}

// main runs the benchmark: it starts t1 and t2 as goroutines sharing one
// unbuffered channel, waits for both to finish, and prints the elapsed
// wall-clock time in whole milliseconds.
//
// The timed region starts before the goroutines are launched, so goroutine
// start-up is included in the measurement.
func main() {
	// Number of round trips exchanged between t1 and t2.
	const loop = 20_000

	var wg sync.WaitGroup
	ch := make(chan int)

	start := time.Now()

	wg.Add(2)
	go t1(loop, ch, &wg)
	go t2(loop, ch, &wg)
	wg.Wait()

	elapsed := time.Since(start)

	// Println already puts a space between operands, so the label carries no
	// trailing space; otherwise the output would contain a double space.
	fmt.Println("Time (ms):", elapsed.Milliseconds())
}

Running on one CPU and on two logical CPUs from the same physical core:

1
2
3
4
$ taskset -c 0   go run pingpong.go
Time (ms): 12
$ taskset -c 0,6 go run pingpong.go
Time (ms): 10

§C++

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#include <iostream>
#include <thread>
#include <semaphore>
#include <chrono>

#include <sys/resource.h>

using namespace std;

class MBox {
binary_semaphore s{0};
public:
void put(string msg) {
// cout << msg << endl;
s.release();
}

void get() {
s.acquire();
}
};

// Mailbox the main (ping) thread waits on; signalled by the pong worker.
MBox ping_mbox;
// Mailbox the pong worker waits on; signalled by the main (ping) thread.
MBox pong_mbox;

// Number of ping-pong round trips performed by each side.
constexpr int max_count = 20000;

// Body of the pong worker thread: for each of the max_count rounds, wait for
// a signal on pong_mbox and answer by signalling ping_mbox.
void pong_runnable() {
    int remaining = max_count;
    while (remaining-- > 0) {
        pong_mbox.get();
        ping_mbox.put("Pong");
    }
}

int main()
{
thread pong_worker(pong_runnable);

auto start = chrono::steady_clock::now();

for (auto i = 0; i < max_count; ++i) {
pong_mbox.put("Ping");
ping_mbox.get();
}

auto end = chrono::steady_clock::now();
pong_worker.join();

auto time_ms = chrono::duration_cast<chrono::microseconds>(end - start).count() / 1000.0;

cout << "Elapsed time: " << time_ms << " ms" << endl;

// struct rusage ru;
// if (getrusage(RUSAGE_SELF, &ru)) {
// perror("getrusage");
// } else {
// printf(" voluntary switches = %ld\n", ru.ru_nvcsw);
// printf(" involuntary switches = %ld\n", ru.ru_nivcsw);
// }

return 0;
}
1
2
3
4
5
$ clang++ -O -std=c++20 pingpong.cpp
$ taskset -c 0 ./a.out
Elapsed time: 79.802 ms
$ taskset -c 0,6 ./a.out
Elapsed time: 10.722 ms

§Java

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import java.util.concurrent.Exchanger;
import java.util.concurrent.locks.*;
import java.util.function.*;

class pingpong {
// final static boolean use_vthread = false;
final static boolean use_vthread = true;
final static int max_count = 20000;

final static Exchanger<String> exchanger = new Exchanger<>();

static void send_msg(String msg) {
// System.out.println(msg);
}

static Thread launch_pong() {
Runnable r = () -> {
try {
for (int i = 0; i < max_count; i++) {
exchanger.exchange("Pong");
send_msg("Pong");
}
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
};

Thread t;

if (use_vthread) {
t = Thread.startVirtualThread(r);
} else {
t = new Thread(r);
t.start();
}

return t;
}

static void run_ping() {
Runnable r = () -> {
try {
for (int i = 0; i < max_count; i++) {
send_msg("Ping");
exchanger.exchange("Ping");
}
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
};

if (use_vthread) {
try {
Thread.startVirtualThread(r).join();
} catch (InterruptedException e) {
e.printStackTrace();
}
} else {
// use current thread
r.run();
}
}

static void warmup() throws InterruptedException {
for (var i = 0; i < 200; ++i) {
var t = launch_pong();
run_ping();
t.join();
}
}

public static void main(String[] args) throws InterruptedException {
warmup();

var t = launch_pong();

long start = System.nanoTime();
run_ping();
t.join();
long end = System.nanoTime();

long timeElapsed = end - start;
System.out.printf("Elapsed time: %.3f ms\n", timeElapsed / 1_000_000.0);
}
}

Using virtual threads:

1
2
3
4
$ taskset -c 0   java pingpong.java
Elapsed time: 7.615 ms
$ taskset -c 0,6 java pingpong.java
Elapsed time: 1.261 ms

Using OS threads (use_vthread = false):

1
2
3
4
$ taskset -c 0   java pingpong.java
Elapsed time: 62.011 ms
$ taskset -c 0,6 java pingpong.java
Elapsed time: 1.191 ms

§Conclusion

Median of five runs:

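| Lang/Runtime   | Elapsed time on one CPU (ms) | Elapsed time on 2 logical CPUs (ms) |
|----------------|------------------------------|-------------------------------------|
| Go             | 12                           | 10                                  |
| C++            | 79.802                       | 10.722                              |
| Java           | 62.011                       | 1.191                               |
| Java (vthread) | 7.615                        | 1.261                               |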
  • On one CPU, Java virtual threads are fastest; OS-thread C++ and Java are much slower.
  • On two logical CPUs, both Java variants are fastest, while C++ closes the gap with Go.

§ENV

1
2
3
4
5
6
7
Go          : 1.26
Clang : 19
Java : 26
Linux : 6.12
#CPU : 12 (6 cores)
CPU : Intel(R) Core(TM) i7-9850H CPU @ 2.60GHz
Turbo boost : off