From f46ad8e2d304336e120a2fa7749a2b5e2e026731 Mon Sep 17 00:00:00 2001 From: kyon4ik Date: Thu, 31 Jul 2025 19:03:55 +0500 Subject: [PATCH 1/8] Add hashtable --- Makefile | 1 + src/b.rs | 10 +- src/codegen/gas_aarch64.rs | 11 +- src/crust.rs | 47 +++----- src/hashtable.rs | 220 +++++++++++++++++++++++++++++++++++++ src/ir.rs | 4 +- 6 files changed, 252 insertions(+), 41 deletions(-) create mode 100644 src/hashtable.rs diff --git a/Makefile b/Makefile index a2431457..e20cc26d 100644 --- a/Makefile +++ b/Makefile @@ -16,6 +16,7 @@ CRUST_FLAGS=-g --edition 2021 -C opt-level=0 -C panic="abort" RSS=\ $(SRC)/arena.rs \ + $(SRC)/hashtable.rs \ $(SRC)/b.rs \ $(SRC)/ir.rs \ $(SRC)/crust.rs \ diff --git a/src/b.rs b/src/b.rs index b046c89b..20cb5e09 100644 --- a/src/b.rs +++ b/src/b.rs @@ -37,16 +37,18 @@ pub mod targets; pub mod ir; pub mod time; pub mod shlex; +pub mod hashtable; use core::ffi::*; use core::mem::zeroed; use core::ptr; use core::slice; use core::cmp; +use crust::Str; +use hashtable::HashTable; use nob::*; use flag::*; use crust::libc::*; -use crust::assoc_lookup_cstr; use arena::Arena; use targets::*; use lexer::{Lexer, Loc, Token}; @@ -947,7 +949,7 @@ pub unsafe fn compile_program(l: *mut Lexer, c: *mut Compiler) -> Option<()> { get_and_expect_token_but_continue(l, c, Token::ID)?; let func = arena::strdup(&mut (*c).arena, (*l).string); let func_loc = (*l).loc; - if let Some(existing_variadic) = assoc_lookup_cstr(da_slice((*c).program.variadics), func) { + if let Some(existing_variadic) = HashTable::get(&(*c).program.variadics, Str(func)) { // TODO: report all the duplicate variadics maybe? diagf!(func_loc, c!("ERROR: duplicate variadic declaration `%s`\n"), func); diagf!((*existing_variadic).loc, c!("NOTE: the first declaration is located here\n")); @@ -959,10 +961,10 @@ pub unsafe fn compile_program(l: *mut Lexer, c: *mut Compiler) -> Option<()> { diagf!((*l).loc, c!("ERROR: variadic function `%s` cannot have 0 arguments\n"), func); bump_error_count(c)?; } - da_append(&mut (*c).program.variadics, (func, Variadic { + HashTable::insert(&mut (*c).program.variadics, Str(func), Variadic { loc: func_loc, fixed_args: (*l).int_number as usize, - })); + }); get_and_expect_token_but_continue(l, c, Token::CParen)?; get_and_expect_token_but_continue(l, c, Token::SemiColon)?; } diff --git a/src/codegen/gas_aarch64.rs b/src/codegen/gas_aarch64.rs index dec99eae..3fb02757 100644 --- a/src/codegen/gas_aarch64.rs +++ b/src/codegen/gas_aarch64.rs @@ -1,8 +1,9 @@ use core::ffi::*; use core::mem::zeroed; +use crate::crust::Str; +use crate::hashtable::HashTable; use crate::nob::*; use crate::crust::libc::*; -use crate::crust::assoc_lookup_cstr; use crate::ir::*; use crate::lexer::*; use crate::missingf; @@ -127,7 +128,7 @@ pub unsafe fn load_arg_to_reg(arg: Arg, reg: *const c_char, output: *mut String_ }; } -pub unsafe fn generate_function(name: *const c_char, _name_loc: Loc, params_count: usize, auto_vars_count: usize, os: Os, variadics: *const [(*const c_char, Variadic)], body: *const [OpWithLocation], output: *mut String_Builder) { +pub unsafe fn generate_function(name: *const c_char, _name_loc: Loc, params_count: usize, auto_vars_count: usize, os: Os, variadics: *const HashTable, body: *const [OpWithLocation], output: *mut String_Builder) { let stack_size = align_bytes(auto_vars_count*8, 16); match os { Os::Linux => { @@ -316,7 +317,7 @@ pub unsafe fn generate_function(name: *const c_char, _name_loc: Loc, params_coun let mut fixed_args = 0; match fun { Arg::External(name) | Arg::RefExternal(name) => { - if let Some(variadic) = assoc_lookup_cstr(variadics, name) { + if let Some(variadic) = HashTable::get(variadics, Str(name)) { fixed_args = (*variadic).fixed_args; } } @@ -395,7 +396,7 @@ pub unsafe fn generate_function(name: *const c_char, _name_loc: Loc, params_coun sb_appendf(output, c!(" ret\n")); } -pub unsafe fn generate_funcs(output: *mut String_Builder, funcs: *const [Func], variadics: *const [(*const c_char, Variadic)], os: Os) { +pub unsafe fn generate_funcs(output: *mut String_Builder, funcs: *const [Func], variadics: *const HashTable, os: Os) { sb_appendf(output, c!(".text\n")); for i in 0..funcs.len() { generate_function((*funcs)[i].name, (*funcs)[i].name_loc, (*funcs)[i].params_count, (*funcs)[i].auto_vars_count, os, variadics, da_slice((*funcs)[i].body), output); @@ -544,7 +545,7 @@ pub unsafe fn generate_program( if debug { todo!("Debug information for aarch64") } - generate_funcs(output, da_slice((*program).funcs), da_slice((*program).variadics), os); + generate_funcs(output, da_slice((*program).funcs), &(*program).variadics, os); generate_asm_funcs(output, da_slice((*program).asm_funcs), os); generate_globals(output, da_slice((*program). globals), os); generate_data_section(output, da_slice((*program).data)); diff --git a/src/crust.rs b/src/crust.rs index 1ea13edb..34ba18fb 100644 --- a/src/crust.rs +++ b/src/crust.rs @@ -1,5 +1,6 @@ // This is a module that facilitates Crust-style programming - https://github.com/tsoding/crust use crate::crust::libc::*; +use core::hash::{Hash, Hasher}; use core::panic::PanicInfo; use core::ffi::*; @@ -38,44 +39,28 @@ pub unsafe fn slice_contains(slice: *const [Value], needle: *c false } -pub unsafe fn assoc_lookup_cstr_mut(assoc: *mut [(*const c_char, Value)], needle: *const c_char) -> Option<*mut Value> { - for i in 0..assoc.len() { - if strcmp((*assoc)[i].0, needle) == 0 { - return Some(&mut (*assoc)[i].1); - } - } - None -} +#[repr(transparent)] +#[derive(Clone, Copy, Debug)] +pub struct Str(pub *const c_char); -pub unsafe fn assoc_lookup_cstr(assoc: *const [(*const c_char, Value)], needle: *const c_char) -> Option<*const Value> { - for i in 0..assoc.len() { - if strcmp((*assoc)[i].0, needle) == 0 { - return Some(&(*assoc)[i].1); - } +impl PartialEq for Str { + fn eq(&self, other: &Self) -> bool { + unsafe { strcmp(self.0, other.0) == 0 } } - None } -pub unsafe fn assoc_lookup_mut(assoc: *mut [(Key, Value)], needle: *const Key) -> Option<*mut Value> -where Key: PartialEq -{ - for i in 0..assoc.len() { - if (*assoc)[i].0 == *needle { - return Some(&mut (*assoc)[i].1); - } - } - None -} +impl Eq for Str {} -pub unsafe fn assoc_lookup(assoc: *const [(Key, Value)], needle: *const Key) -> Option<*const Value> -where Key: PartialEq -{ - for i in 0..assoc.len() { - if (*assoc)[i].0 == *needle { - return Some(&(*assoc)[i].1); +impl Hash for Str { + fn hash(&self, state: &mut H) { + unsafe { + let mut ptr = self.0; + while *ptr != 0 { + state.write_i8(*ptr); + ptr = ptr.add(1); + } } } - None } #[macro_use] diff --git a/src/hashtable.rs b/src/hashtable.rs new file mode 100644 index 00000000..9b334a98 --- /dev/null +++ b/src/hashtable.rs @@ -0,0 +1,220 @@ +use crate::crust::libc; +use core::hash::{BuildHasher, Hash, Hasher}; +use core::{cmp, mem, ptr}; + +#[derive(Clone, Copy)] +pub struct HashTable { + pub entries: *mut Entry, + pub capacity: usize, + pub count: usize, + pub occupied: *mut u8, + pub hasher_builder: S, +} + +#[derive(Clone, Copy)] +pub struct Entry { + pub key: K, + pub value: V, +} + +#[derive(Clone, Copy)] +pub enum HtEntry { + Occupied(*mut Entry), + Vacant(*mut Entry), +} + +impl HashTable +where + K: Clone + Copy + Hash + Eq, + V: Clone + Copy, + S: BuildHasher, + H: Hasher, +{ + // Must be power of 2 and greater than or equal to 8 + pub const MIN_CAPACITY: usize = 8; + + /// Returns previous value stored by this `key` or `None` + pub unsafe fn insert(ht: *mut Self, key: K, value: V) -> Option { + match Self::find(ht, key) { + HtEntry::Occupied(entry) => Some(mem::replace(&mut (*entry).value, value)), + HtEntry::Vacant(entry) => { + Self::insert_new_key(ht, entry, key, value); + None + } + } + } + + pub unsafe fn get(ht: *const Self, key: K) -> Option<*const V> { + match Self::find(ht, key) { + HtEntry::Occupied(entry) => Some(&(*entry).value), + HtEntry::Vacant(_) => None, + } + } + + pub unsafe fn get_mut(ht: *mut Self, key: K) -> Option<*mut V> { + match Self::find(ht, key) { + HtEntry::Occupied(entry) => Some(&mut (*entry).value), + HtEntry::Vacant(_) => None, + } + } + + pub unsafe fn find(ht: *const Self, key: K) -> HtEntry { + if (*ht).capacity == 0 { + return HtEntry::Vacant(ptr::null_mut()); + } + + let hash = Self::hash_key(ht, key); + let mut index = Self::index_from_hash(hash, (*ht).capacity); + + let mut step = 1; + loop { + let entry = (*ht).entries.add(index); + if Self::is_occupied(ht, index) { + if (*entry).key == key { + return HtEntry::Occupied(entry); + } + } else { + return HtEntry::Vacant(entry); + } + + index = (index + step) & ((*ht).capacity - 1); + step += 1; + } + } + + pub unsafe fn insert_new_key(ht: *mut Self, entry: *mut Entry, key: K, value: V) { + if entry.is_null() { + Self::realloc_rehash(ht); + + // Executes only when capacity was 0 + let hash = Self::hash_key(ht, key); + let index = Self::index_from_hash(hash, (*ht).capacity); + *(*ht).entries.add(index) = Entry { key, value }; + (*ht).count += 1; + } else { + (*entry).key = key; + (*entry).value = value; + (*ht).count += 1; + + // When load factor > 0.75 + if (3 * (*ht).capacity) / 4 < (*ht).count { + Self::realloc_rehash(ht); + } + } + } + + pub unsafe fn realloc_rehash(ht: *mut Self) { + let new_capacity = cmp::max((*ht).capacity << 1, Self::MIN_CAPACITY); + debug_assert!(new_capacity.is_power_of_two()); + + // We need new allocations here, to properly copy entries + let new_entries: *mut Entry = libc::realloc_items(ptr::null_mut(), new_capacity); + let new_occupied: *mut u8 = libc::realloc_items(ptr::null_mut(), new_capacity >> 3); + debug_assert!(!new_entries.is_null()); + debug_assert!(!new_occupied.is_null()); + + // Fill occupied with zeros + ptr::write_bytes(new_occupied, 0, new_capacity >> 3); + + // Rehash all occupoed entries + let buckets_count = (*ht).capacity >> 3; + for i in 0..buckets_count { + let bucket = *(*ht).occupied.add(i); + for j in 0..8 { + if (bucket >> j) & 1 == 1 { + let index = (i << 3) + (7 - j); + let entry = *(*ht).entries.add(index); + let hash = Self::hash_key(ht, entry.key); + let new_index = Self::index_from_hash(hash, new_capacity); + let new_bucket = new_index >> 3; + + *new_entries.add(new_index) = entry; + *new_occupied.add(new_bucket) |= 1 << (7 - (new_index & 7)); + } + } + } + + // free old allocations + libc::free((*ht).entries); + libc::free((*ht).occupied); + + (*ht).capacity = new_capacity; + (*ht).entries = new_entries; + (*ht).occupied = new_occupied; + } + + pub unsafe fn is_occupied(ht: *const Self, index: usize) -> bool { + let bucket = *(*ht).occupied.add(index >> 3); + let sub_index = 7 - (index & 7); + (bucket >> sub_index) & 1 == 1 + } + + pub unsafe fn index_from_hash(hash: u64, capacity: usize) -> usize { + (hash & (capacity as u64 - 1)) as usize + } + + pub unsafe fn hash_key(ht: *const Self, key: K) -> u64 { + let mut hasher = (*ht).hasher_builder.build_hasher(); + key.hash(&mut hasher); + hasher.finish() + } +} + +#[derive(Clone, Copy)] +pub struct DefaultHasher; + +impl BuildHasher for DefaultHasher { + type Hasher = Fnv1aHasher; + + fn build_hasher(&self) -> Self::Hasher { + Fnv1aHasher { hash: Fnv1aHasher::OFFSET } + } +} + +#[derive(Clone, Copy)] +pub struct Fnv1aHasher { + pub hash: u64, +} + +impl Fnv1aHasher { + const OFFSET: u64 = 14695981039346656037; + const PRIME: u64 = 1099511628211; +} + +impl Hasher for Fnv1aHasher { + fn finish(&self) -> u64 { + self.hash + } + + fn write(&mut self, bytes: &[u8]) { + for byte in bytes { + self.hash ^= *byte as u64; + self.hash = self.hash.wrapping_mul(Self::PRIME); + } + } + + fn write_u8(&mut self, i: u8) { + self.hash ^= i as u64; + self.hash = self.hash.wrapping_mul(Self::PRIME); + } + + fn write_u16(&mut self, i: u16) { + self.hash ^= i as u64; + self.hash = self.hash.wrapping_mul(Self::PRIME); + } + + fn write_u32(&mut self, i: u32) { + self.hash ^= i as u64; + self.hash = self.hash.wrapping_mul(Self::PRIME); + } + + fn write_u64(&mut self, i: u64) { + self.hash ^= i; + self.hash = self.hash.wrapping_mul(Self::PRIME); + } + + fn write_usize(&mut self, i: usize) { + self.hash ^= i as u64; + self.hash = self.hash.wrapping_mul(Self::PRIME); + } +} diff --git a/src/ir.rs b/src/ir.rs index 72bb0a22..9ee1defb 100644 --- a/src/ir.rs +++ b/src/ir.rs @@ -1,4 +1,6 @@ use core::ffi::*; +use crate::crust::Str; +use crate::hashtable::HashTable; use crate::lexer::*; use crate::nob::*; @@ -120,7 +122,7 @@ pub struct Program { pub funcs: Array, pub data: Array, pub extrns: Array<*const c_char>, - pub variadics: Array<(*const c_char, Variadic)>, + pub variadics: HashTable, pub globals: Array, pub asm_funcs: Array, } From 85842fcde425c9225b02207ce000804827fc0ccd Mon Sep 17 00:00:00 2001 From: kyon4ik Date: Thu, 31 Jul 2025 19:03:55 +0500 Subject: [PATCH 2/8] Add hashtable --- Makefile | 1 + src/b.rs | 10 +- src/codegen/gas_aarch64.rs | 11 +- src/crust.rs | 47 +++----- src/hashtable.rs | 220 +++++++++++++++++++++++++++++++++++++ src/ir.rs | 4 +- 6 files changed, 252 insertions(+), 41 deletions(-) create mode 100644 src/hashtable.rs diff --git a/Makefile b/Makefile index a2431457..e20cc26d 100644 --- a/Makefile +++ b/Makefile @@ -16,6 +16,7 @@ CRUST_FLAGS=-g --edition 2021 -C opt-level=0 -C panic="abort" RSS=\ $(SRC)/arena.rs \ + $(SRC)/hashtable.rs \ $(SRC)/b.rs \ $(SRC)/ir.rs \ $(SRC)/crust.rs \ diff --git a/src/b.rs b/src/b.rs index 9cdddb16..70e7151d 100644 --- a/src/b.rs +++ b/src/b.rs @@ -37,16 +37,18 @@ pub mod targets; pub mod ir; pub mod time; pub mod shlex; +pub mod hashtable; use core::ffi::*; use core::mem::zeroed; use core::ptr; use core::slice; use core::cmp; +use crust::Str; +use hashtable::HashTable; use nob::*; use flag::*; use crust::libc::*; -use crust::assoc_lookup_cstr; use arena::Arena; use targets::*; use lexer::{Lexer, Loc, Token}; @@ -947,7 +949,7 @@ pub unsafe fn compile_program(l: *mut Lexer, c: *mut Compiler) -> Option<()> { get_and_expect_token_but_continue(l, c, Token::ID)?; let func = arena::strdup(&mut (*c).arena, (*l).string); let func_loc = (*l).loc; - if let Some(existing_variadic) = assoc_lookup_cstr(da_slice((*c).program.variadics), func) { + if let Some(existing_variadic) = HashTable::get(&(*c).program.variadics, Str(func)) { // TODO: report all the duplicate variadics maybe? diagf!(func_loc, c!("ERROR: duplicate variadic declaration `%s`\n"), func); diagf!((*existing_variadic).loc, c!("NOTE: the first declaration is located here\n")); @@ -959,10 +961,10 @@ pub unsafe fn compile_program(l: *mut Lexer, c: *mut Compiler) -> Option<()> { diagf!((*l).loc, c!("ERROR: variadic function `%s` cannot have 0 arguments\n"), func); bump_error_count(c)?; } - da_append(&mut (*c).program.variadics, (func, Variadic { + HashTable::insert(&mut (*c).program.variadics, Str(func), Variadic { loc: func_loc, fixed_args: (*l).int_number as usize, - })); + }); get_and_expect_token_but_continue(l, c, Token::CParen)?; get_and_expect_token_but_continue(l, c, Token::SemiColon)?; } diff --git a/src/codegen/gas_aarch64.rs b/src/codegen/gas_aarch64.rs index dec99eae..3fb02757 100644 --- a/src/codegen/gas_aarch64.rs +++ b/src/codegen/gas_aarch64.rs @@ -1,8 +1,9 @@ use core::ffi::*; use core::mem::zeroed; +use crate::crust::Str; +use crate::hashtable::HashTable; use crate::nob::*; use crate::crust::libc::*; -use crate::crust::assoc_lookup_cstr; use crate::ir::*; use crate::lexer::*; use crate::missingf; @@ -127,7 +128,7 @@ pub unsafe fn load_arg_to_reg(arg: Arg, reg: *const c_char, output: *mut String_ }; } -pub unsafe fn generate_function(name: *const c_char, _name_loc: Loc, params_count: usize, auto_vars_count: usize, os: Os, variadics: *const [(*const c_char, Variadic)], body: *const [OpWithLocation], output: *mut String_Builder) { +pub unsafe fn generate_function(name: *const c_char, _name_loc: Loc, params_count: usize, auto_vars_count: usize, os: Os, variadics: *const HashTable, body: *const [OpWithLocation], output: *mut String_Builder) { let stack_size = align_bytes(auto_vars_count*8, 16); match os { Os::Linux => { @@ -316,7 +317,7 @@ pub unsafe fn generate_function(name: *const c_char, _name_loc: Loc, params_coun let mut fixed_args = 0; match fun { Arg::External(name) | Arg::RefExternal(name) => { - if let Some(variadic) = assoc_lookup_cstr(variadics, name) { + if let Some(variadic) = HashTable::get(variadics, Str(name)) { fixed_args = (*variadic).fixed_args; } } @@ -395,7 +396,7 @@ pub unsafe fn generate_function(name: *const c_char, _name_loc: Loc, params_coun sb_appendf(output, c!(" ret\n")); } -pub unsafe fn generate_funcs(output: *mut String_Builder, funcs: *const [Func], variadics: *const [(*const c_char, Variadic)], os: Os) { +pub unsafe fn generate_funcs(output: *mut String_Builder, funcs: *const [Func], variadics: *const HashTable, os: Os) { sb_appendf(output, c!(".text\n")); for i in 0..funcs.len() { generate_function((*funcs)[i].name, (*funcs)[i].name_loc, (*funcs)[i].params_count, (*funcs)[i].auto_vars_count, os, variadics, da_slice((*funcs)[i].body), output); @@ -544,7 +545,7 @@ pub unsafe fn generate_program( if debug { todo!("Debug information for aarch64") } - generate_funcs(output, da_slice((*program).funcs), da_slice((*program).variadics), os); + generate_funcs(output, da_slice((*program).funcs), &(*program).variadics, os); generate_asm_funcs(output, da_slice((*program).asm_funcs), os); generate_globals(output, da_slice((*program). globals), os); generate_data_section(output, da_slice((*program).data)); diff --git a/src/crust.rs b/src/crust.rs index 1ea13edb..34ba18fb 100644 --- a/src/crust.rs +++ b/src/crust.rs @@ -1,5 +1,6 @@ // This is a module that facilitates Crust-style programming - https://github.com/tsoding/crust use crate::crust::libc::*; +use core::hash::{Hash, Hasher}; use core::panic::PanicInfo; use core::ffi::*; @@ -38,44 +39,28 @@ pub unsafe fn slice_contains(slice: *const [Value], needle: *c false } -pub unsafe fn assoc_lookup_cstr_mut(assoc: *mut [(*const c_char, Value)], needle: *const c_char) -> Option<*mut Value> { - for i in 0..assoc.len() { - if strcmp((*assoc)[i].0, needle) == 0 { - return Some(&mut (*assoc)[i].1); - } - } - None -} +#[repr(transparent)] +#[derive(Clone, Copy, Debug)] +pub struct Str(pub *const c_char); -pub unsafe fn assoc_lookup_cstr(assoc: *const [(*const c_char, Value)], needle: *const c_char) -> Option<*const Value> { - for i in 0..assoc.len() { - if strcmp((*assoc)[i].0, needle) == 0 { - return Some(&(*assoc)[i].1); - } +impl PartialEq for Str { + fn eq(&self, other: &Self) -> bool { + unsafe { strcmp(self.0, other.0) == 0 } } - None } -pub unsafe fn assoc_lookup_mut(assoc: *mut [(Key, Value)], needle: *const Key) -> Option<*mut Value> -where Key: PartialEq -{ - for i in 0..assoc.len() { - if (*assoc)[i].0 == *needle { - return Some(&mut (*assoc)[i].1); - } - } - None -} +impl Eq for Str {} -pub unsafe fn assoc_lookup(assoc: *const [(Key, Value)], needle: *const Key) -> Option<*const Value> -where Key: PartialEq -{ - for i in 0..assoc.len() { - if (*assoc)[i].0 == *needle { - return Some(&(*assoc)[i].1); +impl Hash for Str { + fn hash(&self, state: &mut H) { + unsafe { + let mut ptr = self.0; + while *ptr != 0 { + state.write_i8(*ptr); + ptr = ptr.add(1); + } } } - None } #[macro_use] diff --git a/src/hashtable.rs b/src/hashtable.rs new file mode 100644 index 00000000..9b334a98 --- /dev/null +++ b/src/hashtable.rs @@ -0,0 +1,220 @@ +use crate::crust::libc; +use core::hash::{BuildHasher, Hash, Hasher}; +use core::{cmp, mem, ptr}; + +#[derive(Clone, Copy)] +pub struct HashTable { + pub entries: *mut Entry, + pub capacity: usize, + pub count: usize, + pub occupied: *mut u8, + pub hasher_builder: S, +} + +#[derive(Clone, Copy)] +pub struct Entry { + pub key: K, + pub value: V, +} + +#[derive(Clone, Copy)] +pub enum HtEntry { + Occupied(*mut Entry), + Vacant(*mut Entry), +} + +impl HashTable +where + K: Clone + Copy + Hash + Eq, + V: Clone + Copy, + S: BuildHasher, + H: Hasher, +{ + // Must be power of 2 and greater than or equal to 8 + pub const MIN_CAPACITY: usize = 8; + + /// Returns previous value stored by this `key` or `None` + pub unsafe fn insert(ht: *mut Self, key: K, value: V) -> Option { + match Self::find(ht, key) { + HtEntry::Occupied(entry) => Some(mem::replace(&mut (*entry).value, value)), + HtEntry::Vacant(entry) => { + Self::insert_new_key(ht, entry, key, value); + None + } + } + } + + pub unsafe fn get(ht: *const Self, key: K) -> Option<*const V> { + match Self::find(ht, key) { + HtEntry::Occupied(entry) => Some(&(*entry).value), + HtEntry::Vacant(_) => None, + } + } + + pub unsafe fn get_mut(ht: *mut Self, key: K) -> Option<*mut V> { + match Self::find(ht, key) { + HtEntry::Occupied(entry) => Some(&mut (*entry).value), + HtEntry::Vacant(_) => None, + } + } + + pub unsafe fn find(ht: *const Self, key: K) -> HtEntry { + if (*ht).capacity == 0 { + return HtEntry::Vacant(ptr::null_mut()); + } + + let hash = Self::hash_key(ht, key); + let mut index = Self::index_from_hash(hash, (*ht).capacity); + + let mut step = 1; + loop { + let entry = (*ht).entries.add(index); + if Self::is_occupied(ht, index) { + if (*entry).key == key { + return HtEntry::Occupied(entry); + } + } else { + return HtEntry::Vacant(entry); + } + + index = (index + step) & ((*ht).capacity - 1); + step += 1; + } + } + + pub unsafe fn insert_new_key(ht: *mut Self, entry: *mut Entry, key: K, value: V) { + if entry.is_null() { + Self::realloc_rehash(ht); + + // Executes only when capacity was 0 + let hash = Self::hash_key(ht, key); + let index = Self::index_from_hash(hash, (*ht).capacity); + *(*ht).entries.add(index) = Entry { key, value }; + (*ht).count += 1; + } else { + (*entry).key = key; + (*entry).value = value; + (*ht).count += 1; + + // When load factor > 0.75 + if (3 * (*ht).capacity) / 4 < (*ht).count { + Self::realloc_rehash(ht); + } + } + } + + pub unsafe fn realloc_rehash(ht: *mut Self) { + let new_capacity = cmp::max((*ht).capacity << 1, Self::MIN_CAPACITY); + debug_assert!(new_capacity.is_power_of_two()); + + // We need new allocations here, to properly copy entries + let new_entries: *mut Entry = libc::realloc_items(ptr::null_mut(), new_capacity); + let new_occupied: *mut u8 = libc::realloc_items(ptr::null_mut(), new_capacity >> 3); + debug_assert!(!new_entries.is_null()); + debug_assert!(!new_occupied.is_null()); + + // Fill occupied with zeros + ptr::write_bytes(new_occupied, 0, new_capacity >> 3); + + // Rehash all occupoed entries + let buckets_count = (*ht).capacity >> 3; + for i in 0..buckets_count { + let bucket = *(*ht).occupied.add(i); + for j in 0..8 { + if (bucket >> j) & 1 == 1 { + let index = (i << 3) + (7 - j); + let entry = *(*ht).entries.add(index); + let hash = Self::hash_key(ht, entry.key); + let new_index = Self::index_from_hash(hash, new_capacity); + let new_bucket = new_index >> 3; + + *new_entries.add(new_index) = entry; + *new_occupied.add(new_bucket) |= 1 << (7 - (new_index & 7)); + } + } + } + + // free old allocations + libc::free((*ht).entries); + libc::free((*ht).occupied); + + (*ht).capacity = new_capacity; + (*ht).entries = new_entries; + (*ht).occupied = new_occupied; + } + + pub unsafe fn is_occupied(ht: *const Self, index: usize) -> bool { + let bucket = *(*ht).occupied.add(index >> 3); + let sub_index = 7 - (index & 7); + (bucket >> sub_index) & 1 == 1 + } + + pub unsafe fn index_from_hash(hash: u64, capacity: usize) -> usize { + (hash & (capacity as u64 - 1)) as usize + } + + pub unsafe fn hash_key(ht: *const Self, key: K) -> u64 { + let mut hasher = (*ht).hasher_builder.build_hasher(); + key.hash(&mut hasher); + hasher.finish() + } +} + +#[derive(Clone, Copy)] +pub struct DefaultHasher; + +impl BuildHasher for DefaultHasher { + type Hasher = Fnv1aHasher; + + fn build_hasher(&self) -> Self::Hasher { + Fnv1aHasher { hash: Fnv1aHasher::OFFSET } + } +} + +#[derive(Clone, Copy)] +pub struct Fnv1aHasher { + pub hash: u64, +} + +impl Fnv1aHasher { + const OFFSET: u64 = 14695981039346656037; + const PRIME: u64 = 1099511628211; +} + +impl Hasher for Fnv1aHasher { + fn finish(&self) -> u64 { + self.hash + } + + fn write(&mut self, bytes: &[u8]) { + for byte in bytes { + self.hash ^= *byte as u64; + self.hash = self.hash.wrapping_mul(Self::PRIME); + } + } + + fn write_u8(&mut self, i: u8) { + self.hash ^= i as u64; + self.hash = self.hash.wrapping_mul(Self::PRIME); + } + + fn write_u16(&mut self, i: u16) { + self.hash ^= i as u64; + self.hash = self.hash.wrapping_mul(Self::PRIME); + } + + fn write_u32(&mut self, i: u32) { + self.hash ^= i as u64; + self.hash = self.hash.wrapping_mul(Self::PRIME); + } + + fn write_u64(&mut self, i: u64) { + self.hash ^= i; + self.hash = self.hash.wrapping_mul(Self::PRIME); + } + + fn write_usize(&mut self, i: usize) { + self.hash ^= i as u64; + self.hash = self.hash.wrapping_mul(Self::PRIME); + } +} diff --git a/src/ir.rs b/src/ir.rs index 72bb0a22..9ee1defb 100644 --- a/src/ir.rs +++ b/src/ir.rs @@ -1,4 +1,6 @@ use core::ffi::*; +use crate::crust::Str; +use crate::hashtable::HashTable; use crate::lexer::*; use crate::nob::*; @@ -120,7 +122,7 @@ pub struct Program { pub funcs: Array, pub data: Array, pub extrns: Array<*const c_char>, - pub variadics: Array<(*const c_char, Variadic)>, + pub variadics: HashTable, pub globals: Array, pub asm_funcs: Array, } From 83827aa0edd37db90494e1683574620c4cb5652d Mon Sep 17 00:00:00 2001 From: kyon4ik Date: Fri, 1 Aug 2025 00:16:23 +0500 Subject: [PATCH 3/8] Fix realloc_rehash --- src/b.rs | 94 +++++++++++++++++++++++++----------------------- src/hashtable.rs | 73 +++++++++++++++++++++---------------- 2 files changed, 92 insertions(+), 75 deletions(-) diff --git a/src/b.rs b/src/b.rs index 70e7151d..ae69e04c 100644 --- a/src/b.rs +++ b/src/b.rs @@ -46,6 +46,7 @@ use core::slice; use core::cmp; use crust::Str; use hashtable::HashTable; +use hashtable::HtEntry; use nob::*; use flag::*; use crust::libc::*; @@ -130,38 +131,32 @@ pub enum Storage { } #[derive(Clone, Copy)] -pub struct Var { - pub name: *const c_char, +pub struct VarData { pub loc: Loc, pub storage: Storage, } -pub unsafe fn scope_push(vars: *mut Array>) { +pub unsafe fn scope_push(vars: *mut Array>) { if (*vars).count < (*vars).capacity { // Reusing already allocated scopes (*vars).count += 1; - (*da_last_mut(vars).expect("There should be always at least the global scope")).count = 0; + let last_scope = da_last_mut(vars).expect("There should be always at least the global scope"); + HashTable::clear(last_scope); } else { da_append(vars, zeroed()); } } -pub unsafe fn scope_pop(vars: *mut Array>) { +pub unsafe fn scope_pop(vars: *mut Array>) { assert!((*vars).count > 0); (*vars).count -= 1; } -pub unsafe fn find_var_near(vars: *const Array, name: *const c_char) -> *const Var { - for i in 0..(*vars).count { - let var = (*vars).items.add(i); - if strcmp((*var).name, name) == 0 { - return var - } - } - ptr::null() +pub unsafe fn find_var_near(vars: *const HashTable<*const c_char, VarData>, name: *const c_char) -> *const VarData { + HashTable::get(vars, name).unwrap_or(ptr::null()) } -pub unsafe fn find_var_deep(vars: *const Array>, name: *const c_char) -> *const Var { +pub unsafe fn find_var_deep(vars: *const Array>, name: *const c_char) -> *const VarData { let mut i = (*vars).count; while i > 0 { let var = find_var_near((*vars).items.add(i-1), name); @@ -182,7 +177,7 @@ pub unsafe fn declare_var(c: *mut Compiler, name: *const c_char, loc: Loc, stora return bump_error_count(c); } - da_append(scope, Var {name, loc, storage}); + HashTable::insert(scope, name, VarData {loc, storage}); Some(()) } @@ -402,7 +397,7 @@ pub unsafe fn compile_primary_expression(l: *mut Lexer, c: *mut Compiler) -> Opt } Token::CharLit | Token::IntLit => Some((Arg::Literal((*l).int_number), false)), Token::ID => { - let name = arena::strdup(&mut (*c).arena, (*l).string); + let name = intern(&mut (*c).interner, (*l).string); let var_def = find_var_deep(&mut (*c).vars, name); if var_def.is_null() { @@ -654,7 +649,7 @@ pub unsafe fn compile_asm_stmts(l: *mut Lexer, c: *mut Compiler, stmts: *mut Arr get_and_expect_token(l, Token::String)?; match (*l).token { Token::String => { - let line = arena::strdup(&mut (*c).arena, (*l).string); + let line = intern(&mut (*c).interner, (*l).string); let loc = (*l).loc; da_append(stmts, AsmStmt { line, loc }); } @@ -692,7 +687,7 @@ pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { Token::Extrn => { while (*l).token != Token::SemiColon { get_and_expect_token(l, Token::ID)?; - let name = arena::strdup(&mut (*c).arena, (*l).string); + let name = intern(&mut (*c).interner, (*l).string); name_declare_if_not_exists(&mut (*c).program.extrns, name); declare_var(c, name, (*l).loc, Storage::External {name})?; get_and_expect_tokens(l, &[Token::SemiColon, Token::Comma])?; @@ -702,7 +697,7 @@ pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { Token::Auto => { while (*l).token != Token::SemiColon { get_and_expect_token(l, Token::ID)?; - let name = arena::strdup(&mut (*c).arena, (*l).string); + let name = intern(&mut (*c).interner, (*l).string); let index = allocate_auto_var(&mut (*c).auto_vars_ator); declare_var(c, name, (*l).loc, Storage::Auto {index})?; get_and_expect_tokens(l, &[Token::SemiColon, Token::Comma, Token::IntLit, Token::CharLit])?; @@ -785,7 +780,7 @@ pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { } Token::Goto => { get_and_expect_token(l, Token::ID)?; - let name = arena::strdup(&mut (*c).arena, (*l).string); + let name = intern(&mut (*c).interner, (*l).string); let loc = (*l).loc; let addr = (*c).func_body.count; da_append(&mut (*c).func_gotos, Goto {name, loc, addr}); @@ -859,7 +854,7 @@ pub unsafe fn compile_statement(l: *mut Lexer, c: *mut Compiler) -> Option<()> { } _ => { if (*l).token == Token::ID { - let name = arena::strdup(&mut (*c).arena, (*l).string); + let name = intern(&mut (*c).interner, (*l).string); let name_loc = (*l).loc; lexer::get_token(l)?; if (*l).token == Token::Colon { @@ -893,10 +888,28 @@ pub struct Switch { pub cond: usize, } +/// Deduplicates and prolongs strings lifetime +#[derive(Clone, Copy)] +pub struct StringInterner { + pub deduper: HashTable, + pub arena: Arena, +} + +pub unsafe fn intern(interner: *mut StringInterner, string: *const c_char) -> *mut c_char { + match HashTable::find(&(*interner).deduper, Str(string)) { + HtEntry::Occupied(entry) => (*entry).key.0 as *mut c_char, + HtEntry::Vacant(entry) => { + let ptr = arena::strdup(&mut (*interner).arena, string); + HashTable::insert_new_key(&mut (*interner).deduper, entry, Str(ptr), ()); + ptr + } + } +} + #[derive(Clone, Copy)] pub struct Compiler { pub program: Program, - pub vars: Array>, + pub vars: Array>, pub auto_vars_ator: AutoVarsAtor, pub func_body: Array, pub func_goto_labels: Array, @@ -904,17 +917,7 @@ pub struct Compiler { pub used_funcs: Array, pub op_label_count: usize, pub switch_stack: Array, - /// Arena into which the Compiler allocates all the names and - /// objects that need to live for the duration of the - /// compilation. Even if some object/names don't need to live that - /// long (for example, function labels need to live only for the - /// duration of that function compilation), just letting them live - /// longer makes the memory management easier. - /// - /// Basically just dump everything into this arena and if you ever - /// need to reset the state of the Compiler, just reset all its - /// Dynamic Arrays and this Arena. - pub arena: Arena, + pub interner: StringInterner, pub target: Target, pub error_count: usize, pub historical: bool, @@ -947,7 +950,7 @@ pub unsafe fn compile_program(l: *mut Lexer, c: *mut Compiler) -> Option<()> { Token::Variadic => { get_and_expect_token_but_continue(l, c, Token::OParen)?; get_and_expect_token_but_continue(l, c, Token::ID)?; - let func = arena::strdup(&mut (*c).arena, (*l).string); + let func = intern(&mut (*c).interner, (*l).string); let func_loc = (*l).loc; if let Some(existing_variadic) = HashTable::get(&(*c).program.variadics, Str(func)) { // TODO: report all the duplicate variadics maybe? @@ -971,7 +974,7 @@ pub unsafe fn compile_program(l: *mut Lexer, c: *mut Compiler) -> Option<()> { Token::Extrn => { while (*l).token != Token::SemiColon { get_and_expect_token(l, Token::ID)?; - let name = arena::strdup(&mut (*c).arena, (*l).string); + let name = intern(&mut (*c).interner, (*l).string); name_declare_if_not_exists(&mut (*c).program.extrns, name); declare_var(c, name, (*l).loc, Storage::External {name})?; get_and_expect_tokens(l, &[Token::SemiColon, Token::Comma])?; @@ -979,7 +982,7 @@ pub unsafe fn compile_program(l: *mut Lexer, c: *mut Compiler) -> Option<()> { } _ => { expect_token(l, Token::ID)?; - let name = arena::strdup(&mut (*c).arena, (*l).string); + let name = intern(&mut (*c).interner, (*l).string); let name_loc = (*l).loc; declare_var(c, name, name_loc, Storage::External{name})?; @@ -996,7 +999,7 @@ pub unsafe fn compile_program(l: *mut Lexer, c: *mut Compiler) -> Option<()> { (*l).parse_point = saved_point; 'params: loop { get_and_expect_token(l, Token::ID)?; - let name = arena::strdup(&mut (*c).arena, (*l).string); + let name = intern(&mut (*c).interner, (*l).string); let name_loc = (*l).loc; let index = allocate_auto_var(&mut (*c).auto_vars_ator); declare_var(c, name, name_loc, Storage::Auto{index})?; @@ -1074,7 +1077,7 @@ pub unsafe fn compile_program(l: *mut Lexer, c: *mut Compiler) -> Option<()> { Token::IntLit | Token::CharLit => ImmediateValue::Literal((*l).int_number), Token::String => ImmediateValue::DataOffset(compile_string((*l).string, c)), Token::ID => { - let name = arena::strdup(&mut (*c).arena, (*l).string); + let name = intern(&mut (*c).interner, (*l).string); let scope = da_last_mut(&mut (*c).vars).expect("There should be always at least the global scope"); let var = find_var_near(scope, name); if var.is_null() { @@ -1276,12 +1279,12 @@ pub unsafe fn main(mut argc: i32, mut argv: *mut*mut c_char) -> Option<()> { let gen = match target { Target::Gas_x86_64_Linux | Target::Gas_x86_64_Windows | - Target::Gas_x86_64_Darwin => codegen::gas_x86_64::new(&mut c.arena, da_slice(*codegen_args)), + Target::Gas_x86_64_Darwin => codegen::gas_x86_64::new(&mut c.interner.arena, da_slice(*codegen_args)), Target::Gas_AArch64_Linux | - Target::Gas_AArch64_Darwin => codegen::gas_aarch64::new(&mut c.arena, da_slice(*codegen_args)), - Target::Uxn => codegen::uxn::new(&mut c.arena, da_slice(*codegen_args)), - Target::Mos6502_Posix => codegen::mos6502::new(&mut c.arena, da_slice(*codegen_args)), - Target::ILasm_Mono => codegen::ilasm_mono::new(&mut c.arena, da_slice(*codegen_args)), + Target::Gas_AArch64_Darwin => codegen::gas_aarch64::new(&mut c.interner.arena, da_slice(*codegen_args)), + Target::Uxn => codegen::uxn::new(&mut c.interner.arena, da_slice(*codegen_args)), + Target::Mos6502_Posix => codegen::mos6502::new(&mut c.interner.arena, da_slice(*codegen_args)), + Target::ILasm_Mono => codegen::ilasm_mono::new(&mut c.interner.arena, da_slice(*codegen_args)), }?; if input_paths.count == 0 { @@ -1306,8 +1309,9 @@ pub unsafe fn main(mut argc: i32, mut argv: *mut*mut c_char) -> Option<()> { log(Log_Level::ERROR, c!("No standard library path %s found. Please run the compiler from the same folder where %s is located. Or if you don't want to use the standard library pass the -%s flag."), libb_path, libb_path, flag_name(nostdlib)); return None; } - include_path_if_exists(&mut input_paths, arena::sprintf(&mut c.arena, c!("%s/all.b"), libb_path)); - include_path_if_exists(&mut input_paths, arena::sprintf(&mut c.arena, c!("%s/%s.b"), libb_path, *target_name)); + // TODO: this should be stored in separate arena (or temporary allocated) + include_path_if_exists(&mut input_paths, arena::sprintf(&mut c.interner.arena, c!("%s/all.b"), libb_path)); + include_path_if_exists(&mut input_paths, arena::sprintf(&mut c.interner.arena, c!("%s/%s.b"), libb_path, *target_name)); } let mut sb: String_Builder = zeroed(); diff --git a/src/hashtable.rs b/src/hashtable.rs index 9b334a98..f3bc71ab 100644 --- a/src/hashtable.rs +++ b/src/hashtable.rs @@ -31,7 +31,7 @@ where H: Hasher, { // Must be power of 2 and greater than or equal to 8 - pub const MIN_CAPACITY: usize = 8; + pub const MIN_CAPACITY: usize = 32; /// Returns previous value stored by this `key` or `None` pub unsafe fn insert(ht: *mut Self, key: K, value: V) -> Option { @@ -58,11 +58,16 @@ where } } + pub unsafe fn clear(ht: *mut Self) { + (*ht).count = 0; + ptr::write_bytes((*ht).occupied, 0, (*ht).capacity >> 3); + } + pub unsafe fn find(ht: *const Self, key: K) -> HtEntry { if (*ht).capacity == 0 { return HtEntry::Vacant(ptr::null_mut()); } - + let hash = Self::hash_key(ht, key); let mut index = Self::index_from_hash(hash, (*ht).capacity); @@ -86,15 +91,14 @@ where if entry.is_null() { Self::realloc_rehash(ht); - // Executes only when capacity was 0 + // Executes only when capacity was 0 let hash = Self::hash_key(ht, key); let index = Self::index_from_hash(hash, (*ht).capacity); - *(*ht).entries.add(index) = Entry { key, value }; - (*ht).count += 1; + Self::fill_entry(ht, (*ht).entries.add(index), index, key, value); } else { - (*entry).key = key; - (*entry).value = value; - (*ht).count += 1; + let index = entry.offset_from((*ht).entries); + debug_assert!(index >= 0); + Self::fill_entry(ht, entry, index as usize, key, value); // When load factor > 0.75 if (3 * (*ht).capacity) / 4 < (*ht).count { @@ -104,43 +108,50 @@ where } pub unsafe fn realloc_rehash(ht: *mut Self) { - let new_capacity = cmp::max((*ht).capacity << 1, Self::MIN_CAPACITY); - debug_assert!(new_capacity.is_power_of_two()); + let old_entries = (*ht).entries; + let old_occupied = (*ht).occupied; + let old_capacity = (*ht).capacity; + (*ht).capacity = cmp::max(old_capacity << 1, Self::MIN_CAPACITY); + debug_assert!((*ht).capacity.is_power_of_two()); + // We need new allocations here, to properly copy entries - let new_entries: *mut Entry = libc::realloc_items(ptr::null_mut(), new_capacity); - let new_occupied: *mut u8 = libc::realloc_items(ptr::null_mut(), new_capacity >> 3); - debug_assert!(!new_entries.is_null()); - debug_assert!(!new_occupied.is_null()); + (*ht).entries = libc::realloc_items(ptr::null_mut(), (*ht).capacity); + (*ht).occupied = libc::realloc_items(ptr::null_mut(), (*ht).capacity >> 3); + debug_assert!(!(*ht).entries.is_null()); + debug_assert!(!(*ht).occupied.is_null()); // Fill occupied with zeros - ptr::write_bytes(new_occupied, 0, new_capacity >> 3); + ptr::write_bytes((*ht).occupied, 0, (*ht).capacity >> 3); // Rehash all occupoed entries - let buckets_count = (*ht).capacity >> 3; + let buckets_count = old_capacity >> 3; for i in 0..buckets_count { - let bucket = *(*ht).occupied.add(i); + let bucket = *old_occupied.add(i); for j in 0..8 { if (bucket >> j) & 1 == 1 { let index = (i << 3) + (7 - j); - let entry = *(*ht).entries.add(index); - let hash = Self::hash_key(ht, entry.key); - let new_index = Self::index_from_hash(hash, new_capacity); - let new_bucket = new_index >> 3; - - *new_entries.add(new_index) = entry; - *new_occupied.add(new_bucket) |= 1 << (7 - (new_index & 7)); + let entry = *old_entries.add(index); + assert!(Self::insert(ht, entry.key, entry.value).is_none()); } } } // free old allocations - libc::free((*ht).entries); - libc::free((*ht).occupied); + libc::free(old_entries); + libc::free(old_occupied); + } - (*ht).capacity = new_capacity; - (*ht).entries = new_entries; - (*ht).occupied = new_occupied; + pub unsafe fn fill_entry(ht: *mut Self, entry: *mut Entry, index: usize, key: K, value: V) { + *entry = Entry { key, value }; + Self::occupy_index(ht, index); + (*ht).count += 1; + } + + pub unsafe fn occupy_index(ht: *mut Self, index: usize) { + let bucket = (*ht).occupied.add(index >> 3); + let sub_index = 7 - (index & 7); + *bucket |= 1 << sub_index; } pub unsafe fn is_occupied(ht: *const Self, index: usize) -> bool { @@ -167,7 +178,9 @@ impl BuildHasher for DefaultHasher { type Hasher = Fnv1aHasher; fn build_hasher(&self) -> Self::Hasher { - Fnv1aHasher { hash: Fnv1aHasher::OFFSET } + Fnv1aHasher { + hash: Fnv1aHasher::OFFSET, + } } } From 33ec1af47e6e803a560c59e67138bbf719209e59 Mon Sep 17 00:00:00 2001 From: kyon4ik Date: Fri, 1 Aug 2025 18:48:07 +0500 Subject: [PATCH 4/8] Add string literals interning --- src/b.rs | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/src/b.rs b/src/b.rs index ae69e04c..12c876ff 100644 --- a/src/b.rs +++ b/src/b.rs @@ -311,17 +311,26 @@ pub unsafe fn allocate_auto_var(t: *mut AutoVarsAtor) -> usize { pub unsafe fn compile_string(string: *const c_char, c: *mut Compiler) -> usize { - let offset = (*c).program.data.count; - let string_len = strlen(string); - da_append_many(&mut (*c).program.data, slice::from_raw_parts(string as *const u8, string_len)); - // TODO: Strings in B are not NULL-terminated. - // They are terminated with symbol '*e' ('*' is escape character akin to '\' in C) which according to the - // spec is called just "end-of-file" without any elaboration on what its value is. Maybe it had a specific - // value on PDP that was a common knowledge at the time? In any case that breaks compatibility with - // libc. While the language is still in development we gonna terminate it with 0. We will make it - // "spec complaint" later. - da_append(&mut (*c).program.data, 0); // NULL-terminator - offset + // TODO: Don't use second hashtable, which requires changes to the API, returning string address + // instead of data offset + let string = intern(&mut (*c).interner, string); + match HashTable::find(&(*c).string_offset, string) { + HtEntry::Occupied(entry) => (*entry).value, + HtEntry::Vacant(entry) => { + let offset = (*c).program.data.count; + let string_len = strlen(string); + da_append_many(&mut (*c).program.data, slice::from_raw_parts(string as *const u8, string_len)); + // TODO: Strings in B are not NULL-terminated. + // They are terminated with symbol '*e' ('*' is escape character akin to '\' in C) which according to the + // spec is called just "end-of-file" without any elaboration on what its value is. Maybe it had a specific + // value on PDP that was a common knowledge at the time? In any case that breaks compatibility with + // libc. While the language is still in development we gonna terminate it with 0. We will make it + // "spec complaint" later. + da_append(&mut (*c).program.data, 0); // NULL-terminator + HashTable::insert_new_key(&mut (*c).string_offset, entry, string, offset); + offset + }, + } } pub unsafe fn compile_primary_expression(l: *mut Lexer, c: *mut Compiler) -> Option<(Arg, bool)> { @@ -918,6 +927,7 @@ pub struct Compiler { pub op_label_count: usize, pub switch_stack: Array, pub interner: StringInterner, + pub string_offset: HashTable<*const c_char, usize>, pub target: Target, pub error_count: usize, pub historical: bool, From 9e1c92b86fa9b0f4507c93c2ea8c9e084b73a3b4 Mon Sep 17 00:00:00 2001 From: kyon4ik Date: Fri, 1 Aug 2025 19:30:12 +0500 Subject: [PATCH 5/8] Use pointer for variadics --- src/b.rs | 7 +++---- src/codegen/gas_aarch64.rs | 7 +++---- src/ir.rs | 3 +-- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/b.rs b/src/b.rs index 12c876ff..2cbd3597 100644 --- a/src/b.rs +++ b/src/b.rs @@ -45,8 +45,7 @@ use core::ptr; use core::slice; use core::cmp; use crust::Str; -use hashtable::HashTable; -use hashtable::HtEntry; +use hashtable::{HashTable, HtEntry}; use nob::*; use flag::*; use crust::libc::*; @@ -962,7 +961,7 @@ pub unsafe fn compile_program(l: *mut Lexer, c: *mut Compiler) -> Option<()> { get_and_expect_token_but_continue(l, c, Token::ID)?; let func = intern(&mut (*c).interner, (*l).string); let func_loc = (*l).loc; - if let Some(existing_variadic) = HashTable::get(&(*c).program.variadics, Str(func)) { + if let Some(existing_variadic) = HashTable::get(&(*c).program.variadics, func) { // TODO: report all the duplicate variadics maybe? diagf!(func_loc, c!("ERROR: duplicate variadic declaration `%s`\n"), func); diagf!((*existing_variadic).loc, c!("NOTE: the first declaration is located here\n")); @@ -974,7 +973,7 @@ pub unsafe fn compile_program(l: *mut Lexer, c: *mut Compiler) -> Option<()> { diagf!((*l).loc, c!("ERROR: variadic function `%s` cannot have 0 arguments\n"), func); bump_error_count(c)?; } - HashTable::insert(&mut (*c).program.variadics, Str(func), Variadic { + HashTable::insert(&mut (*c).program.variadics, func, Variadic { loc: func_loc, fixed_args: (*l).int_number as usize, }); diff --git a/src/codegen/gas_aarch64.rs b/src/codegen/gas_aarch64.rs index 3fb02757..c8f72aea 100644 --- a/src/codegen/gas_aarch64.rs +++ b/src/codegen/gas_aarch64.rs @@ -1,6 +1,5 @@ use core::ffi::*; use core::mem::zeroed; -use crate::crust::Str; use crate::hashtable::HashTable; use crate::nob::*; use crate::crust::libc::*; @@ -128,7 +127,7 @@ pub unsafe fn load_arg_to_reg(arg: Arg, reg: *const c_char, output: *mut String_ }; } -pub unsafe fn generate_function(name: *const c_char, _name_loc: Loc, params_count: usize, auto_vars_count: usize, os: Os, variadics: *const HashTable, body: *const [OpWithLocation], output: *mut String_Builder) { +pub unsafe fn generate_function(name: *const c_char, _name_loc: Loc, params_count: usize, auto_vars_count: usize, os: Os, variadics: *const HashTable<*const c_char, Variadic>, body: *const [OpWithLocation], output: *mut String_Builder) { let stack_size = align_bytes(auto_vars_count*8, 16); match os { Os::Linux => { @@ -317,7 +316,7 @@ pub unsafe fn generate_function(name: *const c_char, _name_loc: Loc, params_coun let mut fixed_args = 0; match fun { Arg::External(name) | Arg::RefExternal(name) => { - if let Some(variadic) = HashTable::get(variadics, Str(name)) { + if let Some(variadic) = HashTable::get(variadics, name) { fixed_args = (*variadic).fixed_args; } } @@ -396,7 +395,7 @@ pub unsafe fn generate_function(name: *const c_char, _name_loc: Loc, params_coun sb_appendf(output, c!(" ret\n")); } -pub unsafe fn generate_funcs(output: *mut String_Builder, funcs: *const [Func], variadics: *const HashTable, os: Os) { +pub unsafe fn generate_funcs(output: *mut String_Builder, funcs: *const [Func], variadics: *const HashTable<*const c_char, Variadic>, os: Os) { sb_appendf(output, c!(".text\n")); for i in 0..funcs.len() { generate_function((*funcs)[i].name, (*funcs)[i].name_loc, (*funcs)[i].params_count, (*funcs)[i].auto_vars_count, os, variadics, da_slice((*funcs)[i].body), output); diff --git a/src/ir.rs b/src/ir.rs index 9ee1defb..49213dbd 100644 --- a/src/ir.rs +++ b/src/ir.rs @@ -1,5 +1,4 @@ use core::ffi::*; -use crate::crust::Str; use crate::hashtable::HashTable; use crate::lexer::*; use crate::nob::*; @@ -122,7 +121,7 @@ pub struct Program { pub funcs: Array, pub data: Array, pub extrns: Array<*const c_char>, - pub variadics: HashTable, + pub variadics: HashTable<*const c_char, Variadic>, pub globals: Array, pub asm_funcs: Array, } From f69f8b4e4340dd7b80e674d9cce9032ab9163a84 Mon Sep 17 00:00:00 2001 From: kyon4ik Date: Mon, 4 Aug 2025 00:02:56 +0500 Subject: [PATCH 6/8] Document crust::Str --- src/crust.rs | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/src/crust.rs b/src/crust.rs index 34ba18fb..6262838c 100644 --- a/src/crust.rs +++ b/src/crust.rs @@ -1,6 +1,7 @@ // This is a module that facilitates Crust-style programming - https://github.com/tsoding/crust use crate::crust::libc::*; use core::hash::{Hash, Hasher}; +use core::cmp::Ordering; use core::panic::PanicInfo; use core::ffi::*; @@ -39,8 +40,15 @@ pub unsafe fn slice_contains(slice: *const [Value], needle: *c false } +/// This is just a zero-cost wrapper around null-terminated C-string. +/// It would be nice to use `core::ffi::CStr` here, but it has two downsides: +/// 1. Overhead on construction from pointer +/// 2. It is a fat pointer (slice), which means it consumes two times more memory +/// +/// It is useful when you want to pass a `*... c_char` to a function or a struct +/// constraint by `Eq`, `Ord` or `Hash` traits and act it as a C-string. #[repr(transparent)] -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Eq, Debug)] pub struct Str(pub *const c_char); impl PartialEq for Str { @@ -49,7 +57,29 @@ impl PartialEq for Str { } } -impl Eq for Str {} +impl PartialOrd for Str { + fn partial_cmp(&self, other: &Self) -> Option { + unsafe { + Some(match strcmp(self.0, other.0) { + 0 => Ordering::Equal, + 1.. => Ordering::Greater, + _ => Ordering::Less, + }) + } + } +} + +impl Ord for Str { + fn cmp(&self, other: &Self) -> Ordering { + unsafe { + match strcmp(self.0, other.0) { + 0 => Ordering::Equal, + 1.. => Ordering::Greater, + _ => Ordering::Less, + } + } + } +} impl Hash for Str { fn hash(&self, state: &mut H) { From 600fcaad558f2d2b38479deea14b84d44fc9807a Mon Sep 17 00:00:00 2001 From: kyon4ik Date: Mon, 4 Aug 2025 14:37:53 +0500 Subject: [PATCH 7/8] Cleanup --- src/b.rs | 4 +-- src/crust.rs | 23 ++++++++--------- src/hashtable.rs | 65 ++++++++++++++++++++++-------------------------- 3 files changed, 43 insertions(+), 49 deletions(-) diff --git a/src/b.rs b/src/b.rs index 8f2b2167..db5c3165 100644 --- a/src/b.rs +++ b/src/b.rs @@ -330,7 +330,7 @@ pub unsafe fn compile_string(string: *const c_char, c: *mut Compiler) -> usize { // libc. While the language is still in development we gonna terminate it with 0. We will make it // "spec complaint" later. da_append(&mut (*c).program.data, 0); // NULL-terminator - HashTable::insert_new_key(&mut (*c).string_offset, entry, string, offset); + HashTable::insert_new(&mut (*c).string_offset, entry, string, offset); offset }, } @@ -919,7 +919,7 @@ pub unsafe fn intern(interner: *mut StringInterner, string: *const c_char) -> *m HtEntry::Occupied(entry) => (*entry).key.0 as *mut c_char, HtEntry::Vacant(entry) => { let ptr = arena::strdup(&mut (*interner).arena, string); - HashTable::insert_new_key(&mut (*interner).deduper, entry, Str(ptr), ()); + HashTable::insert_new(&mut (*interner).deduper, entry, Str(ptr), ()); ptr } } diff --git a/src/crust.rs b/src/crust.rs index 6262838c..81254e91 100644 --- a/src/crust.rs +++ b/src/crust.rs @@ -59,13 +59,7 @@ impl PartialEq for Str { impl PartialOrd for Str { fn partial_cmp(&self, other: &Self) -> Option { - unsafe { - Some(match strcmp(self.0, other.0) { - 0 => Ordering::Equal, - 1.. => Ordering::Greater, - _ => Ordering::Less, - }) - } + Some(self.cmp(other)) } } @@ -84,11 +78,9 @@ impl Ord for Str { impl Hash for Str { fn hash(&self, state: &mut H) { unsafe { - let mut ptr = self.0; - while *ptr != 0 { - state.write_i8(*ptr); - ptr = ptr.add(1); - } + let len = strlen(self.0); + let slice = core::slice::from_raw_parts(self.0 as *const u8, len); + state.write(slice); } } } @@ -131,6 +123,13 @@ pub mod libc { pub fn qsort(base: *mut c_void, nmemb: usize, size: usize, compar: unsafe extern "C" fn(*const c_void, *const c_void) -> c_int); } + pub unsafe fn alloc_items(count: usize) -> *mut T { + extern "C" { + fn malloc(size: usize) -> *mut c_void; + } + malloc(size_of::() * count) as *mut T + } + // count is the amount of items, not bytes pub unsafe fn realloc_items(ptr: *mut T, count: usize) -> *mut T { extern "C" { diff --git a/src/hashtable.rs b/src/hashtable.rs index f3bc71ab..cbda9c2b 100644 --- a/src/hashtable.rs +++ b/src/hashtable.rs @@ -2,6 +2,8 @@ use crate::crust::libc; use core::hash::{BuildHasher, Hash, Hasher}; use core::{cmp, mem, ptr}; +/// General purpose hashtable, that accepts any kind of key and value types. +/// Current implementation uses open addressing and quadratic probing to minimize hash collisions. #[derive(Clone, Copy)] pub struct HashTable { pub entries: *mut Entry, @@ -38,7 +40,7 @@ where match Self::find(ht, key) { HtEntry::Occupied(entry) => Some(mem::replace(&mut (*entry).value, value)), HtEntry::Vacant(entry) => { - Self::insert_new_key(ht, entry, key, value); + Self::insert_new(ht, entry, key, value); None } } @@ -87,7 +89,7 @@ where } } - pub unsafe fn insert_new_key(ht: *mut Self, entry: *mut Entry, key: K, value: V) { + pub unsafe fn insert_new(ht: *mut Self, entry: *mut Entry, key: K, value: V) { if entry.is_null() { Self::realloc_rehash(ht); @@ -116,28 +118,28 @@ where debug_assert!((*ht).capacity.is_power_of_two()); // We need new allocations here, to properly copy entries - (*ht).entries = libc::realloc_items(ptr::null_mut(), (*ht).capacity); - (*ht).occupied = libc::realloc_items(ptr::null_mut(), (*ht).capacity >> 3); + (*ht).entries = libc::alloc_items((*ht).capacity); + (*ht).occupied = libc::alloc_items((*ht).capacity >> 3); debug_assert!(!(*ht).entries.is_null()); debug_assert!(!(*ht).occupied.is_null()); // Fill occupied with zeros ptr::write_bytes((*ht).occupied, 0, (*ht).capacity >> 3); - // Rehash all occupoed entries + // Iterate over all occupied entries and rehash them let buckets_count = old_capacity >> 3; for i in 0..buckets_count { let bucket = *old_occupied.add(i); for j in 0..8 { if (bucket >> j) & 1 == 1 { - let index = (i << 3) + (7 - j); + let index = (i << 3) + j; let entry = *old_entries.add(index); - assert!(Self::insert(ht, entry.key, entry.value).is_none()); + let new_entry = Self::find_vacant(ht, entry.key); + Self::insert_new(ht, new_entry, entry.key, entry.value); } } } - // free old allocations libc::free(old_entries); libc::free(old_occupied); } @@ -150,13 +152,13 @@ where pub unsafe fn occupy_index(ht: *mut Self, index: usize) { let bucket = (*ht).occupied.add(index >> 3); - let sub_index = 7 - (index & 7); + let sub_index = index & 7; *bucket |= 1 << sub_index; } pub unsafe fn is_occupied(ht: *const Self, index: usize) -> bool { let bucket = *(*ht).occupied.add(index >> 3); - let sub_index = 7 - (index & 7); + let sub_index = index & 7; (bucket >> sub_index) & 1 == 1 } @@ -169,6 +171,24 @@ where key.hash(&mut hasher); hasher.finish() } + + + // This function is only for internal usage to speed up rehashing + unsafe fn find_vacant(ht: *mut Self, key: K) -> *mut Entry { + let hash = Self::hash_key(ht, key); + let mut index = Self::index_from_hash(hash, (*ht).capacity); + + let mut step = 1; + loop { + let entry = (*ht).entries.add(index); + if !Self::is_occupied(ht, index) { + return entry; + } + + index = (index + step) & ((*ht).capacity - 1); + step += 1; + } + } } #[derive(Clone, Copy)] @@ -205,29 +225,4 @@ impl Hasher for Fnv1aHasher { self.hash = self.hash.wrapping_mul(Self::PRIME); } } - - fn write_u8(&mut self, i: u8) { - self.hash ^= i as u64; - self.hash = self.hash.wrapping_mul(Self::PRIME); - } - - fn write_u16(&mut self, i: u16) { - self.hash ^= i as u64; - self.hash = self.hash.wrapping_mul(Self::PRIME); - } - - fn write_u32(&mut self, i: u32) { - self.hash ^= i as u64; - self.hash = self.hash.wrapping_mul(Self::PRIME); - } - - fn write_u64(&mut self, i: u64) { - self.hash ^= i; - self.hash = self.hash.wrapping_mul(Self::PRIME); - } - - fn write_usize(&mut self, i: usize) { - self.hash ^= i as u64; - self.hash = self.hash.wrapping_mul(Self::PRIME); - } } From 151320adea1d602958cb8ed98531be0aa950f62f Mon Sep 17 00:00:00 2001 From: kyon4ik Date: Mon, 4 Aug 2025 22:20:07 +0500 Subject: [PATCH 8/8] Add hashtable to btest --- src/btest.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/btest.rs b/src/btest.rs index 2d8fc4f9..d5c7d1d9 100644 --- a/src/btest.rs +++ b/src/btest.rs @@ -20,6 +20,7 @@ pub mod lexer; pub mod codegen; pub mod shlex; pub mod params; +pub mod hashtable; use core::ffi::*; use core::cmp;