diff options
21 files changed, 438 insertions, 5 deletions
diff --git a/profcollectd/README.md b/profcollectd/README.md index dba75a2c..e9066000 100644 --- a/profcollectd/README.md +++ b/profcollectd/README.md @@ -37,6 +37,62 @@ are controlled by the following configurations: Setting the frequency value to `0` disables collection for the corresponding event. +#### Custom configuration + +In adb root: + +``` +# Record every 60s (By default, record every 10m). The actual interval will be longer than the +# set value if the device goes to hibernation. +oriole:/ # setprop persist.device_config.profcollect_native_boot.collection_interval 60 + +# Each time recording, record ETM data for 1s (By default, it's 0.5s). +oriole:/ # setprop persist.device_config.profcollect_native_boot.sampling_period 1000 + +# Set ETM data storage limit to 50G (By default, it is 512M). +oriole:/ # setprop persist.device_config.profcollect_native_boot.max_trace_limit 53687091200 + +# Enable ETM data collection (By default, it's decided by the server). +oriole:/ # setprop persist.device_config.profcollect_native_boot.enabled true + +# After adjusting configuration, need to restart profcollectd +oriole:/ # setprop ctl.stop profcollectd +# Wait for a few seconds. +oriole:/ # setprop ctl.start profcollectd + +# Check if profcollectd is running +oriole:/ # ps -e | grep profcollectd +root 918 1 10945660 47040 binder_wait_for_work 0 S profcollectd + +# Check if the new configuration takes effect. +oriole:/ # cat /data/misc/profcollectd/output/config.json +{"version":1,"node_id":[189,15,145,225,97,167],"build_fingerprint":"google/oriole/oriole:Tiramisu/TP1A.220223.002/8211650:userdebug/dev-keys","collection_interval":{"secs":60,"nanos":0},"sampling_period":{"secs":1,"nanos":0},"binary_filter":"^/(system|apex/.+)/(bin|lib|lib64)/.+","max_trace_limit":53687091200} +``` + +To check existing collected ETM data: +``` +oriole:/ # cd data/misc/profcollectd/trace/ +oriole:/data/misc/profcollectd/trace # ls +``` + +To check if ETM data can be collected successfully: +``` +# Trigger one collection manually. +oriole:/ # profcollectctl once +Trace once + +# Check trace directory to see if there is a recent manual trace file. +oriole:/ # ls /data/misc/profcollectd/trace/ +20220224-222946_manual.etmtrace +``` + +If there are too many trace files, we need to processing them to avoid reaching storage limit. +It may take a long time. +``` +oriole:/ # profcollectctl process +Processing traces +``` + ### Processing The raw tracing data needs to be combined with the original binary to create the AutoFDO branch diff --git a/profcollectd/libprofcollectd/lib.rs b/profcollectd/libprofcollectd/lib.rs index f95132f4..da178f27 100644 --- a/profcollectd/libprofcollectd/lib.rs +++ b/profcollectd/libprofcollectd/lib.rs @@ -127,6 +127,9 @@ pub fn reset() -> Result<()> { pub fn init_logging() { let min_log_level = if cfg!(feature = "test") { log::Level::Info } else { log::Level::Error }; android_logger::init_once( - android_logger::Config::default().with_tag("profcollectd").with_min_level(min_log_level), + android_logger::Config::default() + .with_tag("profcollectd") + .with_min_level(min_log_level) + .with_log_id(android_logger::LogId::System), ); } diff --git a/simpleperf/Android.bp b/simpleperf/Android.bp index ba4ef198..b5edda80 100644 --- a/simpleperf/Android.bp +++ b/simpleperf/Android.bp @@ -328,7 +328,10 @@ cc_library { srcs: ["profcollect.cpp"], host_supported: false, static_libs: ["libsimpleperf"], - shared_libs: ["libLLVM_android"], + shared_libs: [ + "libLLVM_android", + "libpower", + ], } rust_bindgen { @@ -530,6 +533,11 @@ cc_library_shared { cc_defaults { name: "simpleperf_test_srcs", + tidy_timeout_srcs: [ + "record_test.cpp", + "cmd_report_sample_test.cpp", + "cmd_report_test.cpp", + ], srcs: [ "cmd_inject_test.cpp", "cmd_kmem_test.cpp", @@ -561,6 +569,10 @@ cc_defaults { ], }, linux: { + tidy_timeout_srcs: [ + "cmd_stat_test.cpp", + "cmd_record_test.cpp", + ], srcs: [ "CallChainJoiner_test.cpp", "cmd_api_test.cpp", diff --git a/simpleperf/ETMDecoder.cpp b/simpleperf/ETMDecoder.cpp index 41a7d130..7c1cca05 100644 --- a/simpleperf/ETMDecoder.cpp +++ b/simpleperf/ETMDecoder.cpp @@ -735,7 +735,7 @@ class ETMDecoderImpl : public ETMDecoder { auto resp = decoder.TraceDataIn(OCSD_OP_DATA, data_index_, left_size, data, &processed); if (IsRespError(resp)) { // A decoding error shouldn't ruin all data. Reset decoders to recover from it. - LOG(INFO) << "reset etm decoders for seeing a decode failure, resp " << resp; + LOG(DEBUG) << "reset etm decoders for seeing a decode failure, resp " << resp; decoder.TraceDataIn(OCSD_OP_RESET, data_index_ + processed, 0, nullptr, nullptr); } data += processed; diff --git a/simpleperf/cmd_record.cpp b/simpleperf/cmd_record.cpp index b84bcfab..f206b808 100644 --- a/simpleperf/cmd_record.cpp +++ b/simpleperf/cmd_record.cpp @@ -114,7 +114,9 @@ constexpr size_t DEFAULT_CALL_CHAIN_JOINER_CACHE_SIZE = 8 * 1024 * 1024; static constexpr size_t kRecordBufferSize = 64 * 1024 * 1024; static constexpr size_t kSystemWideRecordBufferSize = 256 * 1024 * 1024; -static constexpr size_t kDefaultAuxBufferSize = 4 * 1024 * 1024; +// If the kernel needs to allocate continuous DMA memory for ETR (like when IOMMU for ETR isn't +// available), requesting 4M ETR buffer may fail and cause warning. So use 1M buffer here. +static constexpr size_t kDefaultAuxBufferSize = 1 * 1024 * 1024; // On Pixel 3, it takes about 1ms to enable ETM, and 16-40ms to disable ETM and copy 4M ETM data. // So make default period to 100ms. @@ -221,7 +223,7 @@ class RecordCommand : public Command { "--aux-buffer-size <buffer_size> Set aux buffer size, only used in cs-etm event type.\n" " Need to be power of 2 and page size aligned.\n" " Used memory size is (buffer_size * (cpu_count + 1).\n" -" Default is 4M.\n" +" Default is 1M.\n" "--no-inherit Don't record created child threads/processes.\n" "--cpu-percent <percent> Set the max percent of cpu time used for recording.\n" " percent is in range [1-100], default is 25.\n" diff --git a/simpleperf/doc/README.md b/simpleperf/doc/README.md index 473a5c11..51ffc839 100644 --- a/simpleperf/doc/README.md +++ b/simpleperf/doc/README.md @@ -95,6 +95,9 @@ See [executable_commands_reference.md](./executable_commands_reference.md). See [scripts_reference.md](./scripts_reference.md). +## View the profile + +See [view_the_profile.md](./view_the_profile.md). ## Answers to common issues diff --git a/simpleperf/doc/pictures/android_studio_profiler_flame_chart.png b/simpleperf/doc/pictures/android_studio_profiler_flame_chart.png Binary files differnew file mode 100644 index 00000000..6fea4f77 --- /dev/null +++ b/simpleperf/doc/pictures/android_studio_profiler_flame_chart.png diff --git a/simpleperf/doc/pictures/android_studio_profiler_open_perf_trace.png b/simpleperf/doc/pictures/android_studio_profiler_open_perf_trace.png Binary files differnew file mode 100644 index 00000000..faa9b70d --- /dev/null +++ b/simpleperf/doc/pictures/android_studio_profiler_open_perf_trace.png diff --git a/simpleperf/doc/pictures/android_studio_profiler_select_process.png b/simpleperf/doc/pictures/android_studio_profiler_select_process.png Binary files differnew file mode 100644 index 00000000..4004e171 --- /dev/null +++ b/simpleperf/doc/pictures/android_studio_profiler_select_process.png diff --git a/simpleperf/doc/pictures/android_studio_profiler_select_recording_method.png b/simpleperf/doc/pictures/android_studio_profiler_select_recording_method.png Binary files differnew file mode 100644 index 00000000..3813533a --- /dev/null +++ b/simpleperf/doc/pictures/android_studio_profiler_select_recording_method.png diff --git a/simpleperf/doc/pictures/continuous_pprof.png b/simpleperf/doc/pictures/continuous_pprof.png Binary files differnew file mode 100644 index 00000000..f92f7b65 --- /dev/null +++ b/simpleperf/doc/pictures/continuous_pprof.png diff --git a/simpleperf/doc/pictures/firefox_profiler.png b/simpleperf/doc/pictures/firefox_profiler.png Binary files differnew file mode 100644 index 00000000..4019289b --- /dev/null +++ b/simpleperf/doc/pictures/firefox_profiler.png diff --git a/simpleperf/doc/pictures/flamescope.png b/simpleperf/doc/pictures/flamescope.png Binary files differnew file mode 100644 index 00000000..2a19dad5 --- /dev/null +++ b/simpleperf/doc/pictures/flamescope.png diff --git a/simpleperf/doc/pictures/flamescope_click.png b/simpleperf/doc/pictures/flamescope_click.png Binary files differnew file mode 100644 index 00000000..a5a8a979 --- /dev/null +++ b/simpleperf/doc/pictures/flamescope_click.png diff --git a/simpleperf/doc/pictures/flamescope_flamegraph.png b/simpleperf/doc/pictures/flamescope_flamegraph.png Binary files differnew file mode 100644 index 00000000..8a0b4ef9 --- /dev/null +++ b/simpleperf/doc/pictures/flamescope_flamegraph.png diff --git a/simpleperf/doc/pictures/report_command.png b/simpleperf/doc/pictures/report_command.png Binary files differnew file mode 100644 index 00000000..c04419a8 --- /dev/null +++ b/simpleperf/doc/pictures/report_command.png diff --git a/simpleperf/doc/pictures/report_html.png b/simpleperf/doc/pictures/report_html.png Binary files differnew file mode 100644 index 00000000..358ef7b4 --- /dev/null +++ b/simpleperf/doc/pictures/report_html.png diff --git a/simpleperf/doc/scripts_reference.md b/simpleperf/doc/scripts_reference.md index 1a97ebe5..31dee02d 100644 --- a/simpleperf/doc/scripts_reference.md +++ b/simpleperf/doc/scripts_reference.md @@ -114,6 +114,9 @@ device. It is more convenient than running adb commands manually. ## Viewing the profile +Scripts in this section are for viewing the profile or converting profile data into formats used by +external UIs. For recommended UIs, see [view_the_profile.md](view_the_profile.md). + ### report.py report.py is a wrapper of the `report` command on the host. It accepts all options of the `report` diff --git a/simpleperf/doc/view_the_profile.md b/simpleperf/doc/view_the_profile.md new file mode 100644 index 00000000..8a0e07e7 --- /dev/null +++ b/simpleperf/doc/view_the_profile.md @@ -0,0 +1,342 @@ +# View the profile + +[TOC] + +## Introduction + +After using `simpleperf record` or `app_profiler.py`, we get a profile data file. The file contains +a list of samples. Each sample has a timestamp, a thread id, a callstack, events (like cpu-cycles +or cpu-clock) used in this sample, etc. We have many choices for viewing the profile. We can show +samples in chronological order, or show aggregated flamegraphs. We can show reports in text format, +or in some interactive UIs. + +Below shows some recommended UIs to view the profile. Google developers can find more examples in +[go/gmm-profiling](go/gmm-profiling?polyglot=linux-workstation#viewing-the-profile). + + +## Continuous PProf UI (great flamegraph UI, but only available internally) + +[PProf](https://github.com/google/pprof) is a mature profiling technology used extensively on +Google servers, with a powerful flamegraph UI, with strong drilldown, search, pivot, profile diff, +and graph visualisation. + +![Example](./pictures/continuous_pprof.png) + +We can use `pprof_proto_generator.py` to convert profiles into pprof.profile protobufs for use in +pprof. + +``` +# Output all threads, broken down by threadpool. +./pprof_proto_generator.py + +# Use proguard mapping. +./pprof_proto_generator.py --proguard-mapping-file proguard.map + +# Just the main (UI) thread (query by thread name): +./pprof_proto_generator.py --comm com.example.android.displayingbitmaps +``` + +This will print some debug logs about Failed to read symbols: this is usually OK, unless those +symbols are hotspots. + +Upload pprof.profile to http://pprof/ UI: + +``` +# Upload all threads in profile, grouped by threadpool. +# This is usually a good default, combining threads with similar names. +pprof --flame --tagroot threadpool pprof.profile + +# Upload all threads in profile, grouped by individual thread name. +pprof --flame --tagroot thread pprof.profile + +# Upload all threads in profile, without grouping by thread. +pprof --flame pprof.profile +This will output a URL, example: https://pprof.corp.google.com/?id=589a60852306144c880e36429e10b166 +``` + +## Firefox Profiler (great chronological UI) + +We can view Android profiles using Firefox Profiler: https://profiler.firefox.com/. This does not +require Firefox installation -- Firefox Profiler is just a website, you can open it in any browser. + +![Example](./pictures/firefox_profiler.png) + +Firefox Profiler has a great chronological view, as it doesn't pre-aggregate similar stack traces +like pprof does. + +We can use `gecko_profile_generator.py` to convert raw perf.data files into a Firefox Profile, with +Proguard deobfuscation. + +``` +# Create Gecko Profile +./gecko_profile_generator.py | gzip > gecko_profile.json.gz + +# Create Gecko Profile using Proguard map +./gecko_profile_generator.py --proguard-mapping-file proguard.map | gzip > gecko_profile.json.gz +``` + +Then drag-and-drop gecko_profile.json.gz into https://profiler.firefox.com/. + +Firefox Profiler supports: + +1. Aggregated Flamegraphs +2. Chronological Stackcharts + +And allows filtering by: + +1. Individual threads +2. Multiple threads (Ctrl+Click thread names to select many) +3. Timeline period +4. Stack frame text search + +## FlameScope (great jank-finding UI) + +[Netflix's FlameScope](https://github.com/Netflix/flamescope) is a rough, proof-of-concept UI that +lets you spot repeating patterns of work by laying out the profile as a subsecond heatmap. + +Below, each vertical stripe is one second, and each cell is 10ms. Redder cells have more samples. +See https://www.brendangregg.com/blog/2018-11-08/flamescope-pattern-recognition.html for how to +spot patterns. + +This is an example of a 60s DisplayBitmaps app Startup Profile. + +![Example](./pictures/flamescope.png) + +You can see: + + The thick red vertical line on the left is startup. + The long white vertical sections on the left shows the app is mostly idle, waiting for commands + from instrumented tests. + Then we see periodically red blocks, which shows the app is periodically busy handling commands + from instrumented tests. + +Click the start and end cells of a duration: + +![Example](./pictures/flamescope_click.png) + +To see a flamegraph for that duration: + +![Example](./pictures/flamescope_flamegraph.png) + +Install and run Flamescope: + +``` +git clone https://github.com/Netflix/flamescope ~/flamescope +cd ~/flamescope +pip install -r requirements.txt +npm install +npm run webpack +python3 run.py +``` + +Then open FlameScope in-browser: http://localhost:5000/. + +FlameScope can read gzipped perf script format profiles. Convert simpleperf perf.data to this +format with `report_sample.py`, and place it in Flamescope's examples directory: + +``` +# Create `Linux perf script` format profile. +report_sample.py | gzip > ~/flamescope/examples/my_simpleperf_profile.gz + +# Create `Linux perf script` format profile using Proguard map. +report_sample.py \ + --proguard-mapping-file proguard.map \ + | gzip > ~/flamescope/examples/my_simpleperf_profile.gz +``` + +Open the profile "as Linux Perf", and click start and end sections to get a flamegraph of that +timespan. + +To investigate UI Thread Jank, filter to UI thread samples only: + +``` +report_sample.py \ + --comm com.example.android.displayingbitmaps \ # UI Thread + | gzip > ~/flamescope/examples/uithread.gz +``` + +Once you've identified the timespan of interest, consider also zooming into that section with +Firefox Profiler, which has a more powerful flamegraph viewer. + +## Differential FlameGraph + +See Brendan Gregg's [Differential Flame Graphs](https://www.brendangregg.com/blog/2014-11-09/differential-flame-graphs.html) blog. + +Use Simpleperf's `stackcollapse.py` to convert perf.data to Folded Stacks format for the FlameGraph +toolkit. + +Consider diffing both directions: After minus Before, and Before minus After. + +If you've recorded before and after your optimisation as perf_before.data and perf_after.data, and +you're only interested in the UI thread: + +``` +# Generate before and after folded stacks from perf.data files +./stackcollapse.py --kernel --jit -i perf_before.data \ + --proguard-mapping-file proguard_before.map \ + --comm com.example.android.displayingbitmaps \ + > perf_before.folded +./stackcollapse.py --kernel --jit -i perf_after.data \ + --proguard-mapping-file proguard_after.map \ + --comm com.example.android.displayingbitmaps \ + > perf_after.folded + +# Generate diff reports +FlameGraph/difffolded.pl -n perf_before.folded perf_after.folded \ + | FlameGraph/flamegraph.pl > diff1.svg +FlameGraph/difffolded.pl -n --negate perf_after.folded perf_before.folded \ + | FlameGraph/flamegraph.pl > diff2.svg +``` + +## Android Studio Profiler + +Android Studio Profiler supports recording and reporting profiles of app processes. It supports +several recording methods, including one using simpleperf as backend. You can use Android Studio +Profiler for both recording and reporting. + +In Android Studio: +Open View -> Tool Windows -> Profiler +Click + -> Your Device -> Profileable Processes -> Your App + +![Example](./pictures/android_studio_profiler_select_process.png) + +Click into "CPU" Chart + +Choose Callstack Sample Recording. Even if you're using Java, this provides better observability, +into ART, malloc, and the kernel. + +![Example](./pictures/android_studio_profiler_select_recording_method.png) + +Click Record, run your test on the device, then Stop when you're done. + +Click on a thread track, and "Flame Chart" to see a chronological chart on the left, and an +aggregated flamechart on the right: + +![Example](./pictures/android_studio_profiler_flame_chart.png) + +If you want more flexibility in recording options, or want to add proguard mapping file, you can +record using simpleperf, and report using Android Studio Profiler. + +We can use `simpleperf report-sample` to convert perf.data to trace files for Android Studio +Profiler. + +``` +# Convert perf.data to perf.trace for Android Studio Profiler. +# If on Mac/Windows, use simpleperf host executable for those platforms instead. +bin/linux/x86_64/simpleperf report-sample --show-callchain --protobuf -i perf.data -o perf.trace + +# Convert perf.data to perf.trace using proguard mapping file. +bin/linux/x86_64/simpleperf report-sample --show-callchain --protobuf -i perf.data -o perf.trace \ + --proguard-mapping-file proguard.map +``` + +In Android Studio: Open File -> Open -> Select perf.trace + +![Example](./pictures/android_studio_profiler_open_perf_trace.png) + + +## Simpleperf HTML Report + +Simpleperf can generate its own HTML Profile, which is able to show Android-specific information +and separate flamegraphs for all threads, with a much rougher flamegraph UI. + +![Example](./pictures/report_html.png) + +This UI is fairly rough; we recommend using the Continuous PProf UI or Firefox Profiler instead. But +it's useful for a quick look at your data. + +Each of the following commands take as input ./perf.data and output ./report.html. + +``` +# Make an HTML report. +./report_html.py + +# Make an HTML report with Proguard mapping. +./report_html.py --proguard-mapping-file proguard.map +``` + +This will print some debug logs about Failed to read symbols: this is usually OK, unless those +symbols are hotspots. + +See also [report_html.py's README](scripts_reference.md#report_htmlpy) and `report_html.py -h`. + + +## PProf Interactive Command Line + +Unlike Continuous PProf UI, [PProf](https://github.com/google/pprof) command line is publicly +available, and allows drilldown, pivoting and filtering. + +The below session demonstrates filtering to stack frames containing processBitmap. + +``` +$ pprof pprof.profile +(pprof) show=processBitmap +(pprof) top +Active filters: + show=processBitmap +Showing nodes accounting for 2.45s, 11.44% of 21.46s total + flat flat% sum% cum cum% + 2.45s 11.44% 11.44% 2.45s 11.44% com.example.android.displayingbitmaps.util.ImageFetcher.processBitmap +``` + +And then showing the tags of those frames, to tell what threads they are running on: + +``` +(pprof) tags + pid: Total 2.5s + 2.5s ( 100%): 31112 + + thread: Total 2.5s + 1.4s (57.21%): AsyncTask #3 + 1.1s (42.79%): AsyncTask #4 + + threadpool: Total 2.5s + 2.5s ( 100%): AsyncTask #%d + + tid: Total 2.5s + 1.4s (57.21%): 31174 + 1.1s (42.79%): 31175 +``` + +Contrast with another method: + +``` +(pprof) show=addBitmapToCache +(pprof) top +Active filters: + show=addBitmapToCache +Showing nodes accounting for 1.05s, 4.88% of 21.46s total + flat flat% sum% cum cum% + 1.05s 4.88% 4.88% 1.05s 4.88% com.example.android.displayingbitmaps.util.ImageCache.addBitmapToCache +``` + +For more information, see the [pprof README](https://github.com/google/pprof/blob/master/doc/README.md#interactive-terminal-use). + + +## Simpleperf Report Command Line + +The simpleperf report command reports profiles in text format. + +![Example](./pictures/report_command.png) + +You can call `simpleperf report` directly or call it via `report.py`. + +``` +# Report symbols in table format. +$ ./report.py --children + +# Report call graph. +$ bin/linux/x86_64/simpleperf report -g -i perf.data +``` + +See also [report command's README](executable_commands_reference.md#The-report-command) and +`report.py -h`. + + +## Custom Report Interface + +If the above View UIs can't fulfill your need, you can use `simpleperf_report_lib.py` to parse +perf.data, extract sample information, and feed it to any views you like. + +See [simpleperf_report_lib.py's README](scripts_reference.md#simpleperf_report_libpy) for more +details. diff --git a/simpleperf/profcollect.cpp b/simpleperf/profcollect.cpp index 39c04ab7..724111e8 100644 --- a/simpleperf/profcollect.cpp +++ b/simpleperf/profcollect.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ +#include <wakelock/wakelock.h> #include <include/simpleperf_profcollect.hpp> #include "ETMRecorder.h" @@ -42,6 +43,13 @@ bool HasDeviceSupport() { } bool Record(const char* event_name, const char* output, float duration) { + // The kernel may panic when trying to hibernate or hotplug CPUs while collecting + // ETM data. So get wakelock to keep the CPUs on. + auto wakelock = android::wakelock::WakeLock::tryGet("profcollectd"); + if (!wakelock) { + LOG(ERROR) << "Failed to request wakelock."; + return false; + } auto recordCmd = CreateCommandInstance("record"); std::vector<std::string> args; args.push_back("-a"); diff --git a/simpleperf/scripts/run_simpleperf_without_usb_connection.py b/simpleperf/scripts/run_simpleperf_without_usb_connection.py index 8beddcd9..405deb4f 100755 --- a/simpleperf/scripts/run_simpleperf_without_usb_connection.py +++ b/simpleperf/scripts/run_simpleperf_without_usb_connection.py @@ -45,6 +45,8 @@ def start_recording(args): shell_cmd = 'cd /data/local/tmp && nohup ./simpleperf record ' + args.record_options if args.app: shell_cmd += ' --app ' + args.app + if args.pid: + shell_cmd += ' -p ' + args.pid if args.size_limit: shell_cmd += ' --size-limit ' + args.size_limit shell_cmd += ' >/data/local/tmp/simpleperf_output 2>&1' @@ -85,6 +87,8 @@ def main(): Default is `-e task-clock:u -g`.""") start_parser.add_argument('-p', '--app', help="""Profile an Android app, given the package name. Like `-p com.example.android.myapp`.""") + start_parser.add_argument('--pid', help="""Profile an Android app, given the process id. + Like `--pid 918`.""") start_parser.add_argument('--size_limit', type=str, help="""Stop profiling when recording data reaches [size_limit][K|M|G] bytes. Like `--size_limit 1M`.""") |