From 749e4aa8b107d1bdac2ddb904009a561c22fb4fe Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 19 Mar 2024 02:35:08 +0000 Subject: [PATCH 1/6] Fix: Initialize calls to zeros --- libsee.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/libsee.c b/libsee.c index 3303dda..f40c7e0 100644 --- a/libsee.c +++ b/libsee.c @@ -1148,11 +1148,12 @@ typedef struct libsee_name_stats { void libsee_initialize(void) { - // Initialize all the counters to zeros, without using `memset` - size_t *counters = libsee_thread_cycles[0].indexed; + // Initialize all the cycle and call counters to zeros, without using `memset` + size_t *cycles = libsee_thread_cycles[0].indexed; + size_t *calls = libsee_thread_calls[0].indexed; size_t total_counters_per_thread = sizeof(thread_local_counters) / sizeof(size_t); size_t total_counters_across_threads = LIBSEE_MAX_THREADS * total_counters_per_thread; - for (size_t i = 0; i < total_counters_across_threads; i++) { counters[i] = 0; } + for (size_t i = 0; i < total_counters_across_threads; i++) cycles[i] = calls[i] = 0; // Load the symbols from the underlying implementation real_apis *apis = &libsee_apis; From 3712cfba247e8c6e82e281b098a3ddaf36ad5dcc Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 18 Mar 2024 20:40:46 -0600 Subject: [PATCH 2/6] Docs: Match any extension --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5c72614..21ab18b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -14,7 +14,7 @@ To compile the library, run: ```bash cmake -B build_release cmake --build build_release --config Release -test -e build_release/libsee.so && echo "Success" || echo "Failure" +test -e build_release/libsee.* && echo "Success" || echo "Failure" ``` Want to try it out? 
Here's how to use it: From a8e9d885c2c6916dfd1d177112ad487946921d5a Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 20 Mar 2024 21:19:48 +0000 Subject: [PATCH 3/6] Docs: Extend plans --- .vscode/settings.json | 3 ++- README.md | 9 ++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 48e701e..b167472 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,6 +1,7 @@ { "files.associations": { "string.h": "c", - "cstdio": "c" + "cstdio": "c", + "wchar.h": "c" } } \ No newline at end of file diff --git a/README.md b/README.md index bf809c8..e356afb 100644 --- a/README.md +++ b/README.md @@ -24,9 +24,10 @@ There are several things worth knowing, that came handy implementing this. - One way to implement this library would be to override the `_start` symbols, but implementing correct loading sequence for a binary is tricky, so I use conventional `dlsym` to lookup the symbols on first function invocation. - On `x86_64` architecture, the `rdtscp` instruction yields both the CPU cycle and also the unique identifier of the core. Very handy if you are profiling a multi-threaded application. - Once the unloading sequence reaches `libsee.so`, the `STDOUT` is already closed. So if you want to print to the console, you may want to reopen the `/dev/tty` device before printing usage stats. +- Calling convention for system calls on Aarch64 and x86 differs significantly. On Aarch64 I use the [generalized `openat`](https://github.com/torvalds/linux/blob/bf3a69c6861ff4dc7892d895c87074af7bc1c400/include/uapi/asm-generic/unistd.h#L158-L159) with opcode 56. On [x86 it's opcode 2](https://github.com/torvalds/linux/blob/0dd3ee31125508cd67f7e7172247f05b7fd1753a/arch/x86/entry/syscalls/syscall_64.tbl#L13). - On MacOS the `sprintf`, `vsprintf`, `snprintf`, `vsnprintf` are macros. You have to `#undef` them. 
- On `Release` builds compilers love replacing your code with `memset` and `memcpy` calls. As the symbol can't be found from inside LibSee, it will `SEGFAULT` so don't forget to disable such optimizations for built-ins `-fno-builtin`. -- Aarch64 doesn't seem to have an `open` system call, but it [has the generalized `openat`](https://github.com/torvalds/linux/blob/bf3a69c6861ff4dc7892d895c87074af7bc1c400/include/uapi/asm-generic/unistd.h#L158-L159) number 56. +- No symbol versioning is implemented: vanilla `dlsym` is used instead of `dlvsym`. ## Coverage @@ -47,5 +48,11 @@ Feel free to suggest PRs covering the rest: - [ ] [localization](https://en.cppreference.com/w/c/locale) - [ ] anything newer than C 11 +There are a few other C libraries that most of the world reuses, rather than implementing from scratch in other languages: + +- [ ] BLAS and LAPACK +- [ ] PCRE RegEx +- [ ] `hsearch`, `tsearch`, and pattern matching [extensions](https://ftp.gnu.org/old-gnu/Manuals/glibc-2.2.3/html_node/libc_toc.html) + [Program support](https://en.cppreference.com/w/c/program) utilities aren't intended. From 32cd03f9212394e3743347419858e743948676c0 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 20 Mar 2024 21:20:16 +0000 Subject: [PATCH 4/6] Add: Minimal wide-string support --- libsee.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/libsee.c b/libsee.c index e6da45f..2e0edb4 100644 --- a/libsee.c +++ b/libsee.c @@ -69,6 +69,8 @@ typedef int errno_t; typedef size_t rsize_t; #endif +#define LIBSEE_MAX_SYMBOLS 97 + /** * @brief Contains the number of times each function was called. 
* @@ -113,6 +115,10 @@ typedef union thread_local_counters { size_t memmem; size_t memrchr; + size_t wcstombs; + size_t wcswidth; + size_t wcwidth; + size_t malloc; size_t calloc; size_t realloc; @@ -179,7 +185,7 @@ typedef union thread_local_counters { size_t mktime; } named; - size_t indexed[94]; + size_t indexed[LIBSEE_MAX_SYMBOLS]; } thread_local_counters; #pragma region Function Pointers @@ -219,6 +225,10 @@ typedef errno_t (*api_strerror_s_t)(char *buf, rsize_t bufsz, errno_t errnum); typedef void *(*api_memmem_t)(void const *haystack, size_t haystacklen, void const *needle, size_t needlelen); typedef void *(*api_memrchr_t)(void const *s, int c, size_t n); +typedef size_t (*api_wcstombs_t)(char *dest, wchar_t const *src, size_t max); +typedef int (*api_wcswidth_t)(wchar_t const *wcs, size_t n); +typedef int (*api_wcwidth_t)(wchar_t wc); + typedef void *(*api_malloc_t)(size_t); typedef void *(*api_calloc_t)(size_t, size_t); typedef void *(*api_realloc_t)(void *, size_t); @@ -345,6 +355,10 @@ typedef struct real_apis { api_memmem_t memmem; api_memrchr_t memrchr; + api_wcstombs_t wcstombs; + api_wcswidth_t wcswidth; + api_wcwidth_t wcwidth; + api_malloc_t malloc; api_calloc_t calloc; api_realloc_t realloc; @@ -779,6 +793,20 @@ libsee_export void *memrchr(void const *s, int c, size_t n) { libsee_return(memr #pragma endregion +#pragma region Wide Characters // Contents of `wchar.h` + +#include <wchar.h> + +libsee_export size_t wcstombs(char *dst, wchar_t const *src, size_t len) { + libsee_return(wcstombs, size_t, dst, src, len); +} + +libsee_export int wcwidth(wchar_t c) { libsee_return(wcwidth, int, c); } + +libsee_export int wcswidth(wchar_t const *s, size_t n) { libsee_return(wcswidth, int, s, n); } + +#pragma endregion + #pragma region Numerics // Contents of `stdlib.h` libsee_export void srand(unsigned seed) { libsee_noreturn(srand, seed); } @@ -1185,6 +1213,10 @@ void libsee_initialize(void) { apis->memmem = (api_memmem_t)dlsym(RTLD_NEXT, "memmem"); apis->memrchr = 
(api_memrchr_t)dlsym(RTLD_NEXT, "memrchr"); + apis->wcstombs = (api_wcstombs_t)dlsym(RTLD_NEXT, "wcstombs"); + apis->wcswidth = (api_wcswidth_t)dlsym(RTLD_NEXT, "wcswidth"); + apis->wcwidth = (api_wcwidth_t)dlsym(RTLD_NEXT, "wcwidth"); + apis->malloc = (api_malloc_t)dlsym(RTLD_NEXT, "malloc"); apis->calloc = (api_calloc_t)dlsym(RTLD_NEXT, "calloc"); apis->realloc = (api_realloc_t)dlsym(RTLD_NEXT, "realloc"); @@ -1359,6 +1391,8 @@ void libsee_finalize(void) { {"strspn"}, {"strcspn"}, {"strpbrk"}, {"strstr"}, {"strtok"}, {"strtok_s"}, {"memchr"}, {"memcmp"}, {"memset"}, {"memset_s"}, {"memcpy"}, {"memcpy_s"}, {"memmove"}, {"memmove_s"}, {"strerror"}, {"strerror_s"}, {"memmem"}, {"memrchr"}, + // Wide strings + {"wcstombs"}, {"mbstowcs"}, {"mbrtowc"}, // Heap {"malloc"}, {"calloc"}, {"realloc"}, {"free"}, {"aligned_alloc"}, // Algorithms From c20e93539e9fd55edc8a0b05e851e2f8c0408479 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 4 Jan 2025 17:08:35 +0000 Subject: [PATCH 5/6] Docs: Formatting --- .vscode/settings.json | 14 +++++++++++- README.md | 10 ++++----- libsee.c | 52 +++++++++++++++++++++++-------------------- 3 files changed, 46 insertions(+), 30 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index b167472..fbca6bd 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -3,5 +3,17 @@ "string.h": "c", "cstdio": "c", "wchar.h": "c" - } + }, + "cSpell.words": [ + "BLAS", + "dlsym", + "dlvsym", + "hsearch", + "LAPACK", + "libsee", + "memcpy", + "memset", + "PCRE", + "tsearch" + ] } \ No newline at end of file diff --git a/README.md b/README.md index e356afb..7b41e74 100644 --- a/README.md +++ b/README.md @@ -3,18 +3,18 @@ > _See where you use LibC the most._
> _Trace calls failing tests. Then - roast!_ -One-liner, to download and compile the script and run your vavorite query: +One-liner, to download and compile the script and run your favorite query: ```bash gcc -g -O2 -fno-builtin -fPIC -nostdlib -nostartfiles -shared -o libsee.so libsee.c ``` -LibSee overrides LibC symbols using `LD_PRELOAD`, profiling the most commonly used functions, and, optionally, fuzzing their behaviour for testing. +LibSee overrides LibC symbols using `LD_PRELOAD`, profiling the most commonly used functions, and, optionally, fuzzing their behavior for testing. The library yields a few binaries when compiled: ```bash libsee.so # Profiles LibC calls -libsee_and_knee.so # Correct LibC behaviour, but fuzzed! +libsee_and_knee.so # Correct LibC behavior, but fuzzed! ``` ## Tricks Used @@ -39,11 +39,11 @@ Feel free to suggest PRs covering the rest: - [x] [algorithms](https://en.cppreference.com/w/c/algorithm) - [x] [date and time](https://en.cppreference.com/w/c/chrono) - [x] [input/output](https://en.cppreference.com/w/c/io) +- [x] [wide-character strings](https://en.cppreference.com/w/c/string/wide) - [ ] [concurrency and atomics](https://en.cppreference.com/w/c/thread) - [ ] retrieving error numbers - [ ] [numerics](https://en.cppreference.com/w/c/numeric) -- [ ] [wide-character strings](https://en.cppreference.com/w/c/string/wide) -- [ ] [multibyte strings](https://en.cppreference.com/w/c/string/multibyte) +- [ ] [multi-byte strings](https://en.cppreference.com/w/c/string/multibyte) - [ ] [wide-character IO](https://en.cppreference.com/w/c/io) - [ ] [localization](https://en.cppreference.com/w/c/locale) - [ ] anything newer than C 11 diff --git a/libsee.c b/libsee.c index 2e0edb4..cae23b9 100644 --- a/libsee.c +++ b/libsee.c @@ -480,25 +480,27 @@ void syscall_print(char const *buf, size_t count) { // The system call number is passed in x8, and the arguments are in x0, x1, and x2. 
long syscall_write = (long)64; // System call number for write in AArch64 Linux long file_descriptor = (long)1; - asm volatile("mov x0, %1\n" // First argument: file descriptor - "mov x1, %2\n" // Second argument: buffer address - "mov x2, %3\n" // Third argument: buffer size - "mov x8, %4\n" // System call number: SYS_write (64) - "svc #0\n" // Make the system call - "mov %0, x0" // Store the return value - : "=r"(ret) - : "r"(file_descriptor), "r"(buf), "r"((long)count), "r"(syscall_write) - : "x0", "x1", "x2", "x8", "memory"); + asm volatile( // + "mov x0, %1\n" // First argument: file descriptor + "mov x1, %2\n" // Second argument: buffer address + "mov x2, %3\n" // Third argument: buffer size + "mov x8, %4\n" // System call number: SYS_write (64) + "svc #0\n" // Make the system call + "mov %0, x0" // Store the return value + : "=r"(ret) + : "r"(file_descriptor), "r"(buf), "r"((long)count), "r"(syscall_write) + : "x0", "x1", "x2", "x8", "memory"); #elif defined(__x86_64__) || defined(__i386__) // Inline assembly syntax for making a system call in x86-64 Linux. // Uses the syscall instruction, passing the system call number in rax, // and the call arguments in rdi, rsi, and rdx, respectively. 
long syscall_write = (long)1; // System call number for write in x86-64 Linux unsigned int file_descriptor = (unsigned int)1; - asm volatile("syscall" - : "=a"(ret) - : "a"(syscall_write), "D"(file_descriptor), "S"(buf), "d"(count) - : "rcx", "r11", "memory"); + asm volatile( // + "syscall" + : "=a"(ret) + : "a"(syscall_write), "D"(file_descriptor), "S"(buf), "d"(count) + : "rcx", "r11", "memory"); (void)ret; #endif (void)buf; @@ -539,18 +541,20 @@ void reopen_stdout(void) { void close_stdout(void) { long ret; #ifdef __aarch64__ - asm volatile("mov x0, 1\n" // File descriptor for stdout - "mov x8, 57\n" // Syscall number for 'close' in AArch64 - "svc #0\n" - "mov %0, x0" - : "=r"(ret) - : // No inputs besides the syscall number and FD - : "x0", "x8", "memory"); + asm volatile( // + "mov x0, 1\n" // File descriptor for stdout + "mov x8, 57\n" // Syscall number for 'close' in AArch64 + "svc #0\n" + "mov %0, x0" + : "=r"(ret) + : // No inputs besides the syscall number and FD + : "x0", "x8", "memory"); #elif defined(__x86_64__) - asm volatile("syscall" - : "=a"(ret) - : "a"(3), "D"(1) // Inputs: syscall number for 'close', FD for stdout - : "rcx", "r11", "memory"); + asm volatile( // + "syscall" + : "=a"(ret) + : "a"(3), "D"(1) // Inputs: syscall number for 'close', FD for stdout + : "rcx", "r11", "memory"); #endif (void)ret; } From 01ee6160f4f3192f5a4e0a2d854dba0eb64d5856 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 4 Jan 2025 17:12:34 +0000 Subject: [PATCH 6/6] Make: Upgrade CI with TinySemVer --- .github/workflows/prerelease.yml | 24 ++++++++++++++++++++++++ .github/workflows/release.yml | 28 +++++++++++++++++++++------- .github/workflows/update_version.sh | 7 ------- CMakeLists.txt | 2 +- package-ci.json | 10 ---------- 5 files changed, 46 insertions(+), 25 deletions(-) delete mode 100644 .github/workflows/update_version.sh delete mode 100644 package-ci.json diff --git a/.github/workflows/prerelease.yml 
b/.github/workflows/prerelease.yml index b465b48..94df306 100644 --- a/.github/workflows/prerelease.yml +++ b/.github/workflows/prerelease.yml @@ -18,6 +18,30 @@ permissions: contents: read jobs: + versioning: + name: Update Version + runs-on: ubuntu-24.04 + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + persist-credentials: false + - name: Run TinySemVer + uses: ashvardanian/tinysemver@v2.1.1 + with: + verbose: "true" + version-file: "VERSION" + update-version-in: | + CMakeLists.txt:VERSION (\d+\.\d+\.\d+) + update-major-version-in: | + libsee.c:^#define LIBSEE_VERSION_MAJOR (\d+) + update-minor-version-in: | + libsee.c:^#define LIBSEE_VERSION_MINOR (\d+) + update-patch-version-in: | + libsee.c:^#define LIBSEE_VERSION_PATCH (\d+) + dry-run: "true" + test_ubuntu_gcc: name: Ubuntu (GCC) runs-on: ubuntu-22.04 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ed7704b..7b1e311 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -18,17 +18,31 @@ permissions: jobs: versioning: - name: Semantic Release - runs-on: ubuntu-22.04 + name: Update Version + runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@v4 + - name: Checkout + uses: actions/checkout@v4 with: + fetch-depth: 0 persist-credentials: false - - name: Set up Node.js - uses: actions/setup-node@v4 + - name: Run TinySemVer + uses: ashvardanian/tinysemver@v2.1.1 with: - node-version: 20 - - run: npm install --ignore-scripts --save-dev --prefix ./package-ci @semantic-release/exec @semantic-release/git conventional-changelog-eslint semantic-release && npx --prefix ./package-ci semantic-release + verbose: "true" + version-file: "VERSION" + update-version-in: | + CMakeLists.txt:VERSION (\d+\.\d+\.\d+) + update-major-version-in: | + libsee.c:^#define LIBSEE_VERSION_MAJOR (\d+) + update-minor-version-in: | + libsee.c:^#define LIBSEE_VERSION_MINOR (\d+) + update-patch-version-in: | + libsee.c:^#define LIBSEE_VERSION_PATCH (\d+) 
+ dry-run: "false" + push: "true" + create-release: "true" + github-token: ${{ secrets.SEMANTIC_RELEASE_TOKEN }} rebase: name: Rebase Dev. Branch diff --git a/.github/workflows/update_version.sh b/.github/workflows/update_version.sh deleted file mode 100644 index e5dd511..0000000 --- a/.github/workflows/update_version.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/sh - -echo $1 > VERSION && - sed -i "s/^\(#define LIBSEE_VERSION_MAJOR \).*/\1$(echo "$1" | cut -d. -f1)/" libsee.c && - sed -i "s/^\(#define LIBSEE_VERSION_MINOR \).*/\1$(echo "$1" | cut -d. -f2)/" libsee.c && - sed -i "s/^\(#define LIBSEE_VERSION_PATCH \).*/\1$(echo "$1" | cut -d. -f3)/" libsee.c && - sed -i "s/VERSION [0-9]\+\.[0-9]\+\.[0-9]\+/VERSION $1/" CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index ca3d502..b56fb0d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.14) -project(libsee +project(libsee VERSION 1.0.2 DESCRIPTION "See where you use LibC the most. Trace calls failing tests. Then - roast!") diff --git a/package-ci.json b/package-ci.json deleted file mode 100644 index 6288162..0000000 --- a/package-ci.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "name": "stringzilla-ci", - "version": "1.0.0", - "devDependencies": { - "@semantic-release/exec": "^6.0.3", - "@semantic-release/git": "^10.0.1", - "conventional-changelog-eslint": "^3.0.9", - "semantic-release": "^21.1.2" - } -} \ No newline at end of file