feat: use HashMap for create_shared_string to fix O(N²) performance (#8958)

* feat: use HashMap for create_shared_string to fix O(N²) performance

* refactor: clean up no_std binary_search_by with direct slice comparison
This commit is contained in:
statxc
2026-03-11 03:56:34 +02:00
committed by GitHub
parent 8aa7084f01
commit de3b97355d
2 changed files with 115 additions and 12 deletions

View File

@@ -24,6 +24,9 @@ use core::marker::PhantomData;
use core::ops::{Add, AddAssign, Deref, DerefMut, Index, IndexMut, Sub, SubAssign};
use core::ptr::write_bytes;
#[cfg(feature = "std")]
use std::collections::HashMap;
use crate::endian_scalar::emplace_scalar;
use crate::primitives::*;
use crate::push::{Push, PushAlignment};
@@ -139,6 +142,9 @@ pub struct FlatBufferBuilder<'fbb, A: Allocator = DefaultAllocator> {
min_align: usize,
force_defaults: bool,
#[cfg(feature = "std")]
strings_pool: HashMap<String, WIPOffset<&'fbb str>>,
#[cfg(not(feature = "std"))]
strings_pool: Vec<WIPOffset<&'fbb str>>,
_phantom: PhantomData<&'fbb ()>,
@@ -197,6 +203,9 @@ impl<'fbb, A: Allocator> FlatBufferBuilder<'fbb, A> {
min_align: 0,
force_defaults: false,
#[cfg(feature = "std")]
strings_pool: HashMap::new(),
#[cfg(not(feature = "std"))]
strings_pool: Vec::new(),
_phantom: PhantomData,
@@ -343,6 +352,31 @@ impl<'fbb, A: Allocator> FlatBufferBuilder<'fbb, A> {
WIPOffset::new(o.value())
}
/// Writes `s` into the buffer as a utf8 string, reusing the earlier offset
/// when the same contents were already written through this method.
///
/// Previously written strings are remembered in a `HashMap` keyed by their
/// contents, so both the duplicate check and the registration of a new
/// string are O(1) amortized.
///
/// Asserts that no table or vector is currently under construction.
#[cfg(feature = "std")]
#[inline]
pub fn create_shared_string<'a: 'b, 'b>(&'a mut self, s: &'b str) -> WIPOffset<&'fbb str> {
    self.assert_not_nested(
        "create_shared_string can not be called when a table or vector is under construction",
    );

    // Copy the pooled offset out first so the map is no longer borrowed when
    // the miss branch has to mutate `self`.
    let pooled = self.strings_pool.get(s).copied();
    match pooled {
        Some(existing) => existing,
        None => {
            // First occurrence: serialize the bytes, then remember where
            // they landed under an owned copy of the contents.
            let written = self.create_byte_string(s.as_bytes());
            let fresh = WIPOffset::new(written.value());
            self.strings_pool.insert(String::from(s), fresh);
            fresh
        }
    }
}
/// Create a utf8 string, and de-duplicate if already created.
///
/// Uses a sorted Vec with binary search to track previously written
/// strings when in `no_std` mode.
#[cfg(not(feature = "std"))]
#[inline]
pub fn create_shared_string<'a: 'b, 'b>(&'a mut self, s: &'b str) -> WIPOffset<&'fbb str> {
self.assert_not_nested(
@@ -355,19 +389,15 @@ impl<'fbb, A: Allocator> FlatBufferBuilder<'fbb, A> {
let found = self.strings_pool.binary_search_by(|offset| {
let ptr = offset.value() as usize;
// Slice of the buffer starting at the stored string's 4-byte length prefix
let str_memory = &buf[buf.len() - ptr..];
// Gets the size of the written string from buffer
let size =
u32::from_le_bytes([str_memory[0], str_memory[1], str_memory[2], str_memory[3]])
as usize;
// Size of the string size
let string_size: usize = 4;
// Fetches actual string bytes from index of string after string size
// to the size of string plus string size
let iter = str_memory[string_size..size + string_size].iter();
// Compares bytes of fetched string and current writable string
iter.cloned().cmp(s.bytes())
let size = u32::from_le_bytes([
str_memory[0],
str_memory[1],
str_memory[2],
str_memory[3],
]) as usize;
let stored = &str_memory[4..4 + size];
stored.cmp(s.as_bytes())
});
match found {

View File

@@ -3224,4 +3224,77 @@ fn test_shared_strings() {
assert_eq!(string_vector.get(1), "foo");
}
#[test]
fn test_shared_strings_pool_deduplication() {
    // Exercises the pool behind create_shared_string: distinct strings get
    // distinct offsets, repeated strings get the original offset back, reset
    // drops the pool, and the resulting buffer is readable.
    let mut builder = flatbuffers::FlatBufferBuilder::with_capacity(1024);

    // Insert multiple unique strings and verify each gets a distinct offset.
    let animals = ["cat", "dog", "bird", "fish", "snake"];
    let offsets: Vec<_> = animals
        .iter()
        .map(|s| builder.create_shared_string(s))
        .collect();
    for i in 0..offsets.len() {
        for j in (i + 1)..offsets.len() {
            assert_ne!(
                offsets[i].value(),
                offsets[j].value(),
                "unique strings '{}' and '{}' must have different offsets",
                animals[i],
                animals[j],
            );
        }
    }

    // Re-insert the same strings and verify they return the original offsets.
    for (i, s) in animals.iter().enumerate() {
        let offset = builder.create_shared_string(s);
        assert_eq!(
            offset.value(),
            offsets[i].value(),
            "duplicate string '{}' must return the same offset",
            s,
        );
    }

    // Verify that reset clears the pool. Before the reset "snake" was written
    // after four other strings; as the first string of a cleared builder it
    // must land at a different offset, so getting the old offset back would
    // mean the pool survived the reset and handed out a stale entry.
    let stale_snake = offsets[animals.len() - 1].value();
    builder.reset();
    let snake = builder.create_shared_string("snake");
    assert_ne!(
        snake.value(),
        stale_snake,
        "reset must clear the pool instead of returning a pre-reset offset",
    );

    // Deduplication itself must keep working on the cleared pool.
    let a = builder.create_shared_string("cat");
    let b = builder.create_shared_string("cat");
    assert_eq!(a.value(), b.value(), "same string after reset must still deduplicate");

    // Verify that shared strings produce a valid, readable buffer.
    builder.reset();
    let shared_name = builder.create_shared_string("goblin");
    let shared_name_dup = builder.create_shared_string("goblin");
    assert_eq!(shared_name.value(), shared_name_dup.value());

    let enemy = my_game::example::Monster::create(
        &mut builder,
        &my_game::example::MonsterArgs {
            name: Some(shared_name),
            ..Default::default()
        },
    );

    // The pool must still resolve "goblin" after a table has been written.
    let main_name = builder.create_shared_string("goblin");
    assert_eq!(main_name.value(), shared_name.value());

    let monster = my_game::example::Monster::create(
        &mut builder,
        &my_game::example::MonsterArgs {
            name: Some(main_name),
            enemy: Some(enemy),
            ..Default::default()
        },
    );
    builder.finish(monster, None);

    // Both tables point at the single shared string and must read it back.
    let m = my_game::example::root_as_monster(builder.finished_data()).unwrap();
    assert_eq!(m.name(), "goblin");
    assert_eq!(m.enemy().unwrap().name(), "goblin");
}
}