From 3ad0a8350bec1da0745f3c3c234b7070ddb4a2dd Mon Sep 17 00:00:00 2001 From: Adenilson Cavalcanti Date: Sat, 29 Mar 2025 16:13:44 -0700 Subject: [PATCH] [zstd][cli] Add performance counters support to bench mode Add an extra parameter for benchmark mode that collects processor performance counters, which as a next step will make it possible to report performance stats per operation (i.e. compression vs decompression). We can collect the following performance counters using the Linux perf API: CPU cycles, instructions, branch misses, cache hits and cache misses. One advantage of leveraging the Linux perf API is that it should work on any processor that runs Linux, and therefore should work fine on x86-64 (Intel and AMD), Arm (arm32/aarch64) and RISC-V. The counters make it possible to generate interesting stats like cycles/byte, a measure that is helpful for comparing different CPU microarchitectures, with the benefit of being independent of clock speed. Plus, cycles wasted on I/O operations (e.g. reading files from the disk), which show up in a regular 'perf stat', will *not* be counted, since we only capture counters during the main benchmark loop. This patch is still in its early stages, as the idea is to listen to feedback and properly address its current shortcomings to progress towards a contribution that can be landed in zstd. 
--- programs/benchzstd.c | 19 +++++++++ programs/benchzstd.h | 1 + programs/counters.h | 97 ++++++++++++++++++++++++++++++++++++++++++++ programs/zstdcli.c | 12 ++++++ 4 files changed, 129 insertions(+) create mode 100644 programs/counters.h diff --git a/programs/benchzstd.c b/programs/benchzstd.c index f55c8697504..e28aab593de 100644 --- a/programs/benchzstd.c +++ b/programs/benchzstd.c @@ -27,10 +27,12 @@ #include /* fprintf, fopen */ #include /* malloc, free */ #include /* memset, strerror */ +#include "counters.h" #include "util.h" /* UTIL_getFileSize, UTIL_sleep */ #include "../lib/common/mem.h" #include "benchfn.h" #include "timefn.h" /* UTIL_time_t */ + #ifndef ZSTD_STATIC_LINKING_ONLY # define ZSTD_STATIC_LINKING_ONLY #endif @@ -541,6 +543,9 @@ static BMK_benchOutcome_t BMK_benchMemAdvancedNoAlloc( "Warning : time measurements may be incorrect in multithreading mode... \n") } + /* FIXME(cavalcanti): only include this for Linux (!Android)@x86 */ + BMK_linuxPerfCounters_t counters; + /* Bench */ { U64 const crcOrig = (adv->mode == BMK_decodeOnly) @@ -599,6 +604,12 @@ static BMK_benchOutcome_t BMK_benchMemAdvancedNoAlloc( displayName, (unsigned)srcSize); + /* FIXME(cavalcanti): only include this for Linux (!Android)@x86 */ + if (adv->cpuCounters) { + BMK_countersInit(&counters); + BMK_eventStart(&counters); + } + while (!(compressionCompleted && decompressionCompleted)) { if (!compressionCompleted) { BMK_runOutcome_t const cOutcome = @@ -680,6 +691,13 @@ static BMK_benchOutcome_t BMK_benchMemAdvancedNoAlloc( markNb = (markNb + 1) % NB_MARKS; } /* while (!(compressionCompleted && decompressionCompleted)) */ + /* FIXME(cavalcanti): only include this for Linux (!Android)@x86 */ + if (adv->cpuCounters) { + BMK_eventStop(&counters); + BMK_countersClose(&counters); + fprintf(stdout, "###### Perf cycles: %llu\n", counters.cycles); + } + /* CRC Checking */ { const BYTE* resultBuffer = (const BYTE*)(*resultBufferPtr); U64 const crcCheck = XXH64(resultBuffer, srcSize, 
0); @@ -763,6 +781,7 @@ static BMK_benchOutcome_t BMK_benchMemAdvancedNoAlloc( benchResult.cMem = (1ULL << (comprParams->windowLog)) + ZSTD_sizeof_CCtx(cctx); + return BMK_benchOutcome_setValidResult(benchResult); } diff --git a/programs/benchzstd.h b/programs/benchzstd.h index d62a33c0aec..897604ff9fe 100644 --- a/programs/benchzstd.h +++ b/programs/benchzstd.h @@ -106,6 +106,7 @@ typedef struct { int ldmHashRateLog; ZSTD_ParamSwitch_e literalCompressionMode; int useRowMatchFinder; /* use row-based matchfinder if possible */ + int cpuCounters; } BMK_advancedParams_t; /* returns default parameters used by nonAdvanced functions */ diff --git a/programs/counters.h b/programs/counters.h new file mode 100644 index 00000000000..e92f237b576 --- /dev/null +++ b/programs/counters.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +/**************************************************************************** + * Performance counters + * + ****************************************************************************/ +#ifndef BENCH_ZSTD_COUNTERS +#define BENCH_ZSTD_COUNTERS +/* FIXME(cavalcanti): only include this for Linux (!Android)@x86 */ +#include +#include +#include +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +typedef struct { + struct perf_event_attr events; + int fd; + long long cycles; +} BMK_linuxPerfCounters_t; + +static int BMK_countersOpen(BMK_linuxPerfCounters_t* counters) +{ + pid_t pid = 0; + int cpu = -1; + int group_fd = -1; + unsigned long flags = 0; + + counters->fd = syscall(__NR_perf_event_open, &counters->events, pid, cpu, + group_fd, flags); + + if (counters->fd != -1) return 0; + + return -1; +} + +static int BMK_countersInit(BMK_linuxPerfCounters_t* counters) +{ + memset(counters, 0, sizeof(struct perf_event_attr)); + counters->events.type = PERF_TYPE_HARDWARE; + counters->events.size = sizeof(struct perf_event_attr); + /* TODO(cavalcanti): Add more performance counters: + * PERF_COUNT_HW_INSTRUCTIONS, PERF_COUNT_HW_BRANCH_MISSES, + * PERF_COUNT_HW_CACHE_REFERENCES, PERF_COUNT_HW_CACHE_MISSES. 
+ */ + counters->events.config = PERF_COUNT_HW_CPU_CYCLES; + counters->events.disabled = 1; + counters->events.exclude_kernel = 1; + counters->events.exclude_hv = 1; + + counters->cycles = 0; + + return BMK_countersOpen(counters); +} + +static int BMK_eventStart(BMK_linuxPerfCounters_t* counters) +{ + int res = 0; + if (counters->fd != -1) { + res = ioctl(counters->fd, PERF_EVENT_IOC_RESET, 0); + if (res != -1) res = ioctl(counters->fd, PERF_EVENT_IOC_ENABLE, 0); + } + + return res; +} + +static int BMK_eventStop(BMK_linuxPerfCounters_t* counters) +{ + long long count = 0; + ioctl(counters->fd, PERF_EVENT_IOC_DISABLE, 0); + if (read(counters->fd, &count, sizeof(long long)) == -1) return -1; + counters->cycles += count; +} + +static int BMK_countersClose(BMK_linuxPerfCounters_t* counters) +{ + close(counters->fd); +} + +#endif diff --git a/programs/zstdcli.c b/programs/zstdcli.c index fa7ea37b3f0..770cb55339a 100644 --- a/programs/zstdcli.c +++ b/programs/zstdcli.c @@ -309,6 +309,7 @@ static void usageAdvanced(const char* programName) DISPLAYOUT(" -b# Perform benchmarking with compression level #. [Default: %d]\n", ZSTDCLI_CLEVEL_DEFAULT); DISPLAYOUT(" -e# Test all compression levels up to #; starting level is `-b#`. [Default: 1]\n"); DISPLAYOUT(" -i# Set the minimum evaluation to time # seconds. [Default: 3]\n"); + DISPLAYOUT(" -y# Collect CPU counters.\n"); DISPLAYOUT(" --split=# Split input into independent chunks of size #. [Default: No chunking]\n"); DISPLAYOUT(" -S Output one benchmark result per input file. 
[Default: Consolidated result]\n"); DISPLAYOUT(" -D dictionary Benchmark using dictionary \n"); @@ -882,6 +883,7 @@ int main(int argCount, const char* argv[]) cLevelLast = MINCLEVEL - 1, /* for benchmark range */ setThreads_non1 = 0; unsigned nbWorkers = init_nbWorkers(); + unsigned cpuCounters = 0; /* wether we want to harvest CPU counters during benchmark */ ZSTD_ParamSwitch_e mmapDict = ZSTD_ps_auto; ZSTD_ParamSwitch_e useRowMatchFinder = ZSTD_ps_auto; FIO_compressionType_t cType = FIO_zstdCompression; @@ -1316,6 +1318,15 @@ int main(int argCount, const char* argv[]) compressibility = (double)readU32FromChar(&argument) / 100; break; + /* Harvest performance counters */ + case 'y': + argument++; + cpuCounters = 1; + /* Collecting performance counters requires single threaded mode for now */ + nbWorkers = 0; + singleThread = 1; + break; + /* unknown command */ default : { char shortArgument[3] = {'-', 0, 0}; @@ -1423,6 +1434,7 @@ int main(int argCount, const char* argv[]) benchParams.ldmMinMatch = (int)g_ldmMinMatch; benchParams.ldmHashLog = (int)g_ldmHashLog; benchParams.useRowMatchFinder = (int)useRowMatchFinder; + benchParams.cpuCounters = (int)cpuCounters; if (g_ldmBucketSizeLog != LDM_PARAM_DEFAULT) { benchParams.ldmBucketSizeLog = (int)g_ldmBucketSizeLog; }