KEMBAR78
Performance Profiling in Rust | PDF
Performance Analysis
The Rust edition
Performance Analysis
Questions:
Performance Analysis
Questions:
Who ate my CPU?
Performance Analysis
Questions:
Who ate my CPU? Who ate my RAM?
CPU
CPU profiling
$ cargo install flamegraph
$ cargo flamegraph --dev
CPU profiling
$ cargo install flamegraph
$ cargo flamegraph --dev
dtrace: system integrity protection is on, some features will not be
available
dtrace: failed to initialize dtrace: DTrace requires additional
privileges
failed to sample program
Just use linux
$ docker run --rm rust:1.51
# apt-get install linux-perf
...
# cargo flamegraph
Finished dev [unoptimized + debuginfo] target(s) in 0.03s
/usr/bin/perf: line 13: exec: perf_5.10: not found
E: linux-perf-5.10 is not installed.
failed to sample program
Match the kernel version
$ docker run --rm rust:1.51-bullseye
# apt-get install linux-perf
...
# cargo flamegraph --dev
Finished dev [unoptimized + debuginfo] target(s) in 0.03s
….
Profiling a running program
$ cargo install inferno
$ perf record -p "$(pgrep profexample)" -F 997 -g
…^C
[ perf record: Captured and wrote 5.535 MB perf.data (81144 samples) ]
$ perf script | inferno-collapse-perf > stacks.folded
$ inferno-flamegraph < stacks.folded > flamegraph.svg
$ open flamegraph.svg
...
Small digression
● Frame pointers:
push rbp
mov rbp, rsp
Small digression (2)
● On x86-64 the default is to omit frame pointers
● X86-64 ABI says:
● The conventional use of %rbp as a frame pointer for the stack frame may be avoided by using %rsp (the stack
pointer) to index into the stack frame. This technique saves two instructions in the prologue and epilogue and
makes one additional general-purpose register (%rbp) available.
● GCC since 4.6 omits frame pointers by default on x86-64
● Rust omits frame pointers also on dev builds
● DWARF info is used to figure out the layout of the stack frame for
each function. You don’t need full debug info for backtraces:
[profile.release]
debug = 1
Profiling a running program
$ cargo install inferno
$ perf record -p "$(pgrep profexample)" -F 997 -g --call-graph dwarf
…^C
[ perf record: Captured and wrote 461.199 MB perf.data (57251 samples) ]
$ perf script | inferno-collapse-perf > stacks.folded
$ inferno-flamegraph < stacks.folded > flamegraph.svg
$ open flamegraph.svg
...
Questions:
What are those unknown functions called by “factorial”?
Questions:
● What if I want to run this on k8s where I don’t control my kernel version?
● What if I want to run this on mac without pulling my hair out?
● What if I don’t have a shell on prod?
Questions:
● What if I want to run this on k8s where I don’t control my kernel version?
● What if I want to run this on mac without pulling my hair out?
● What if I don’t have a shell on prod?
Go
package main
import (
"net/http"
_ "net/http/pprof"
)
func main() {
// ...
http.ListenAndServe("localhost:6060", nil)
}
$ go tool pprof --http localhost:4080
'http://localhost:6060/debug/pprof/profile'
Fetching profile over HTTP from http://localhost:6060/debug/pprof/profile
Saved profile in /Users/mkm/pprof/pprof.samples.cpu.001.pb.gz
Serving web UI on http://localhost:4080
● https://github.com/google/pprof
○ Multiple languages, not only Go
● https://github.com/google/perf_data_converter
Rust?
Enter: tikv/pprof-rs !!
[dependencies]
+pprof = { version = "0.4", features = ["flamegraph"] }
tikv/pprof-rs
fn main() {
+ let guard = pprof::ProfilerGuard::new(997).unwrap();
+
for _ in 0..6 {
thread::spawn(worker);
}
thread::sleep(time::Duration::from_secs(10));
+
+ if let Ok(report) = guard.report().build() {
+ let file = File::create("flamegraph.svg").unwrap();
+ report.flamegraph(file).unwrap();
+ };
}
$ kubectl -n iox port-forward deploy/iox-router-1000 8080:8080
$ firefox 'http://localhost:8080/debug/pprof/profile?seconds=30'
$ kubectl -n iox port-forward deploy/iox-router-1000 8080:8080
$ go tool pprof --http localhost:4080
'http://localhost:8080/debug/pprof/profile?seconds=30'
Compatible with https://speedscope.app
How does that work
https://github.com/influxdata/influxdb_iox/blob/main/src/influxdb_ioxd/http.rs#L364
How does that work
$ curl 'http://localhost:8080/debug/pprof/profile?seconds=1' | pq --msgtype perftools.profiles.Profile | jq
{
"sample_type": [
{
"type": 1164,
"unit": 1165
}
],
"sample": [
{
"location_id": [
1,
2,
...
"value": [
1
],
...
"string_table": [
"",
"flatbuffers::builder::FlatBufferBuilder::create_string",
"_ZN81_$LT$core..str..iter..Chars$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17hfd848827f3ee829eE",
"_ZN4core3ptr19swap_nonoverlapping17h364e10b3426efe72E",
"_ZN6server15Server$LT$M$GT$19write_sharded_entry28_$u7b$$u7b$closure$u7d$$u7d$17h4746f0c3132c7a41E",
"_ZN5alloc4sync12Arc$LT$T$GT$9drop_slow17h20a22a0cdaab7f27E",
"_ZN22influxdb_line_protocol7tag_key17hf795357f1c28005dE",
How does that work
$ curl -s -H "accept: text/html" 'http://localhost:8080/debug/pprof/profile?seconds=1' | head
<?xml version="1.0" standalone="no"?><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><svg version="1.1" width="1200" height="2262" onload="init(evt)" viewBox="0 0
1200 2262" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:fg="http://github.com/jonhoo/inferno"><!--Flame graph stack visualization. See https://github.com/brendangregg/FlameGraph for
latest version, and http://www.brendangregg.com/flamegraphs.html for examples.--><!--NOTES: --><defs><linearGradient id="background"
y1="0" y2="1" x1="0" x2="0"><stop stop-color="#eeeeee" offset="5%"/><stop stop-color="#eeeeb0"
offset="95%"/></linearGradient></defs><style type="text/css">
text { font-family:"Verdana"; font-size:12px; fill:rgb(0,0,0); }
#title { text-anchor:middle; font-size:17px; }
#search { opacity:0.1; cursor:pointer; }
#search:hover, #search.show { opacity:1; }
#subtitle { text-anchor:middle; font-color:rgb(160,160,160); }
#unzoom { cursor:pointer; }
#frames > *:hover { stroke:black; stroke-width:0.5; cursor:pointer; }
.hide { display:none; }
.parent { opacity:0.5; }
...
Backtraces and async
https://github.com/rust-lang/rust/issues/73524
HEAP
Attempt 1
[dependencies]
tikv-jemallocator = {version = "0.4.0", features = ["profiling"] }
#[global_allocator]
static GLOBAL: Jemalloc = Jemalloc;
$ cargo build &&
_RJEM_MALLOC_CONF="prof:true,prof_final:true,prof_prefix:jeprof.out"
./target/debug/profexample
Attempt 2: implement it like pprof-rs
use super::Profiler;
use libc::{c_int, c_void, size_t};
#[link(name = "jemalloc")]
extern "C" {
#[link_name = "_rjem_malloc"]
pub fn sys_malloc(size: size_t) -> *mut c_void;
#[link_name = "_rjem_free"]
pub fn sys_free(ptr: *mut c_void);
#[link_name = "_rjem_realloc"]
pub fn sys_realloc(ptr: *mut c_void, size: size_t) -> *mut c_void;
}
#[no_mangle]
pub unsafe extern "C" fn malloc(size: size_t) -> *mut c_void {
Profiler::track_allocated(size as isize);
sys_malloc(size)
}
//! The standard API includes: the [`malloc`], [`calloc`], [`realloc`], and
//! [`free`], which conform to ISO/IEC 9899:1990 (“ISO C90”),
//! [`posix_memalign`] which conforms to POSIX.1-2016, and
//! [`aligned_alloc`].
Questions
Performance Profiling in Rust
Performance Profiling in Rust
Performance Profiling in Rust
Performance Profiling in Rust

Performance Profiling in Rust

  • 1.
  • 5.
  • 6.
  • 7.
  • 8.
  • 10.
    CPU profiling $ cargo install flamegraph $ cargo flamegraph --dev
  • 11.
    CPU profiling $ cargo install flamegraph $ cargo flamegraph --dev dtrace: system integrity protection is on, some features will not be available dtrace: failed to initialize dtrace: DTrace requires additional privileges failed to sample program
  • 13.
    Just use linux $ docker run --rm rust:1.51 # apt-get install linux-perf ... # cargo flamegraph Finished dev [unoptimized + debuginfo] target(s) in 0.03s /usr/bin/perf: line 13: exec: perf_5.10: not found E: linux-perf-5.10 is not installed. failed to sample program
  • 15.
    Match the kernel version $ docker run --rm rust:1.51-bullseye # apt-get install linux-perf ... # cargo flamegraph --dev Finished dev [unoptimized + debuginfo] target(s) in 0.03s ….
  • 17.
    Profiling a running program $ cargo install inferno $ perf record -p "$(pgrep profexample)" -F 997 -g …^C [ perf record: Captured and wrote 5.535 MB perf.data (81144 samples) ] $ perf script | inferno-collapse-perf > stacks.folded $ inferno-flamegraph < stacks.folded > flamegraph.svg $ open flamegraph.svg ...
  • 19.
    Small digression ● Frame pointers: push rbp mov rbp, rsp
  • 20.
    Small digression (2) ● On x86-64 the default is to omit frame pointers ● X86-64 ABI says: ● The conventional use of %rbp as a frame pointer for the stack frame may be avoided by using %rsp (the stack pointer) to index into the stack frame. This technique saves two instructions in the prologue and epilogue and makes one additional general-purpose register (%rbp) available. ● GCC since 4.6 omits frame pointers by default on x86-64 ● Rust omits frame pointers also on dev builds ● DWARF info is used to figure out the layout of the stack frame for each function. You don’t need full debug info for backtraces: [profile.release] debug = 1
  • 21.
    Profiling a running program $ cargo install inferno $ perf record -p "$(pgrep profexample)" -F 997 -g --call-graph dwarf …^C [ perf record: Captured and wrote 461.199 MB perf.data (57251 samples) ] $ perf script | inferno-collapse-perf > stacks.folded $ inferno-flamegraph < stacks.folded > flamegraph.svg $ open flamegraph.svg ...
  • 23.
    Questions: What are those unknown functions called by “factorial”?
  • 24.
    Questions: ● What if I want to run this on k8s where I don’t control my kernel version? ● What if I want to run this on mac without pulling my hair out? ● What if I don’t have a shell on prod?
  • 25.
    Questions: ● What if I want to run this on k8s where I don’t control my kernel version? ● What if I want to run this on mac without pulling my hair out? ● What if I don’t have a shell on prod?
  • 27.
    Go package main import ( "net/http" _ "net/http/pprof" ) func main() { // ... http.ListenAndServe("localhost:6060", nil) }
  • 28.
    $ go tool pprof --http localhost:4080 'http://localhost:6060/debug/pprof/profile' Fetching profile over HTTP from http://localhost:6060/debug/pprof/profile Saved profile in /Users/mkm/pprof/pprof.samples.cpu.001.pb.gz Serving web UI on http://localhost:4080
  • 32.
    ● https://github.com/google/pprof ○ Multiple languages, not only Go ● https://github.com/google/perf_data_converter
  • 33.
    Rust? Enter: tikv/pprof-rs !! [dependencies] +pprof = { version = "0.4", features = ["flamegraph"] }
  • 34.
    tikv/pprof-rs fn main() { + let guard = pprof::ProfilerGuard::new(997).unwrap(); + for _ in 0..6 { thread::spawn(worker); } thread::sleep(time::Duration::from_secs(10)); + + if let Ok(report) = guard.report().build() { + let file = File::create("flamegraph.svg").unwrap(); + report.flamegraph(file).unwrap(); + }; }
  • 38.
    $ kubectl -n iox port-forward deploy/iox-router-1000 8080:8080 $ firefox 'http://localhost:8080/debug/pprof/profile?seconds=30'
  • 40.
    $ kubectl -n iox port-forward deploy/iox-router-1000 8080:8080 $ go tool pprof --http localhost:4080 'http://localhost:8080/debug/pprof/profile?seconds=30'
  • 44.
  • 45.
    How does that work https://github.com/influxdata/influxdb_iox/blob/main/src/influxdb_ioxd/http.rs#L364
  • 46.
    How does that work $ curl 'http://localhost:8080/debug/pprof/profile?seconds=1' | pq --msgtype perftools.profiles.Profile | jq { "sample_type": [ { "type": 1164, "unit": 1165 } ], "sample": [ { "location_id": [ 1, 2, ... "value": [ 1 ], ... "string_table": [ "", "flatbuffers::builder::FlatBufferBuilder::create_string", "_ZN81_$LT$core..str..iter..Chars$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17hfd848827f3ee829eE", "_ZN4core3ptr19swap_nonoverlapping17h364e10b3426efe72E", "_ZN6server15Server$LT$M$GT$19write_sharded_entry28_$u7b$$u7b$closure$u7d$$u7d$17h4746f0c3132c7a41E", "_ZN5alloc4sync12Arc$LT$T$GT$9drop_slow17h20a22a0cdaab7f27E", "_ZN22influxdb_line_protocol7tag_key17hf795357f1c28005dE",
  • 47.
    How does thatwork $ curl -s -H "accept: text/html" 'http://localhost:8080/debug/pprof/profile?seconds=1' | head <?xml version="1.0" standalone="no"?><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><svg version="1.1" width="1200" height="2262" onload="init(evt)" viewBox="0 0 1200 2262" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:fg="http://github.com/jonhoo/inferno"><!--Flame graph stack visualization. See https://github.com/brendangregg/FlameGraph for latest version, and http://www.brendangregg.com/flamegraphs.html for examples.--><!--NOTES: --><defs><linearGradient id="background" y1="0" y2="1" x1="0" x2="0"><stop stop-color="#eeeeee" offset="5%"/><stop stop-color="#eeeeb0" offset="95%"/></linearGradient></defs><style type="text/css"> text { font-family:"Verdana"; font-size:12px; fill:rgb(0,0,0); } #title { text-anchor:middle; font-size:17px; } #search { opacity:0.1; cursor:pointer; } #search:hover, #search.show { opacity:1; } #subtitle { text-anchor:middle; font-color:rgb(160,160,160); } #unzoom { cursor:pointer; } #frames > *:hover { stroke:black; stroke-width:0.5; cursor:pointer; } .hide { display:none; } .parent { opacity:0.5; } ...
  • 48.
  • 49.
  • 50.
    Attempt 1 [dependencies] tikv-jemallocator = {version = "0.4.0", features = ["profiling"] } #[global_allocator] static GLOBAL: Jemalloc = Jemalloc; $ cargo build && _RJEM_MALLOC_CONF="prof:true,prof_final:true,prof_prefix:jeprof.out" ./target/debug/profexample
  • 52.
    Attempt 2: implement it like pprof-rs use super::Profiler; use libc::{c_int, c_void, size_t}; #[link(name = "jemalloc")] extern "C" { #[link_name = "_rjem_malloc"] pub fn sys_malloc(size: size_t) -> *mut c_void; #[link_name = "_rjem_free"] pub fn sys_free(ptr: *mut c_void); #[link_name = "_rjem_realloc"] pub fn sys_realloc(ptr: *mut c_void, size: size_t) -> *mut c_void; } #[no_mangle] pub unsafe extern "C" fn malloc(size: size_t) -> *mut c_void { Profiler::track_allocated(size as isize); sys_malloc(size) }
  • 53.
    //! The standard API includes: the [`malloc`], [`calloc`], [`realloc`], and //! [`free`], which conform to ISO/IEC 9899:1990 (“ISO C90”), //! [`posix_memalign`] which conforms to POSIX.1-2016, and //! [`aligned_alloc`].
  • 57.