Store vtables sorted in Rust builder (#6765)

* benchmark many vtables

* Rust: Store written_table rev-positions sorted.

The previous implementation was slow if there were too many tables.

Asymototically when inserting the n^th vtable: The old implementation
took O(n) lookup steps and O(1) insertion. The new implementation is
O(log n) lookup and O(n) insertion. This might be improved further by
using a balanced btree.

Benchmarking, create_many_tables is 7.5x faster (on my laptop):

// Simple vector cache
test create_many_tables ... bench: 728,875 ns/iter (+/- 12,279) = 44 MB/s

// Sorted vector cache
test create_many_tables ... bench: 97,843 ns/iter (+/- 4,430) = 334 MB/s

* Fix lints

Co-authored-by: Casper Neo <cneo@google.com>
This commit is contained in:
Casper
2021-08-03 15:31:45 -04:00
committed by GitHub
parent c39fc9dd9c
commit 35e2cac6eb
4 changed files with 51 additions and 60 deletions

View File

@@ -142,7 +142,7 @@ fn blackbox<T>(t: T) -> T {
#[inline(always)]
fn traverse_serialized_example_with_generated_code(bytes: &[u8]) {
let m = my_game::example::get_root_as_monster(bytes);
let m = unsafe { my_game::example::root_as_monster_unchecked(bytes) };
blackbox(m.hp());
blackbox(m.mana());
blackbox(m.name());
@@ -172,7 +172,7 @@ fn traverse_serialized_example_with_generated_code(bytes: &[u8]) {
}
fn create_string_10(bench: &mut Bencher) {
let builder = &mut flatbuffers::FlatBufferBuilder::new_with_capacity(1 << 20);
let builder = &mut flatbuffers::FlatBufferBuilder::with_capacity(1 << 20);
let mut i = 0;
bench.iter(|| {
builder.create_string("foobarbaz"); // zero-terminated -> 10 bytes
@@ -187,7 +187,7 @@ fn create_string_10(bench: &mut Bencher) {
}
fn create_string_100(bench: &mut Bencher) {
let builder = &mut flatbuffers::FlatBufferBuilder::new_with_capacity(1 << 20);
let builder = &mut flatbuffers::FlatBufferBuilder::with_capacity(1 << 20);
let s_owned = (0..99).map(|_| "x").collect::<String>();
let s: &str = &s_owned;
@@ -205,7 +205,7 @@ fn create_string_100(bench: &mut Bencher) {
}
fn create_byte_vector_100_naive(bench: &mut Bencher) {
let builder = &mut flatbuffers::FlatBufferBuilder::new_with_capacity(1 << 20);
let builder = &mut flatbuffers::FlatBufferBuilder::with_capacity(1 << 20);
let v_owned = (0u8..100).map(|i| i).collect::<Vec<u8>>();
let v: &[u8] = &v_owned;
@@ -223,7 +223,7 @@ fn create_byte_vector_100_naive(bench: &mut Bencher) {
}
fn create_byte_vector_100_optimal(bench: &mut Bencher) {
let builder = &mut flatbuffers::FlatBufferBuilder::new_with_capacity(1 << 20);
let builder = &mut flatbuffers::FlatBufferBuilder::with_capacity(1 << 20);
let v_owned = (0u8..100).map(|i| i).collect::<Vec<u8>>();
let v: &[u8] = &v_owned;
@@ -240,6 +240,24 @@ fn create_byte_vector_100_optimal(bench: &mut Bencher) {
bench.bytes = v.len() as u64;
}
fn create_many_tables(bench: &mut Bencher) {
let builder = &mut flatbuffers::FlatBufferBuilder::with_capacity(1 << 20);
// We test vtable overhead by making many unique tables of up to 16 fields of u8s.
bench.iter(|| {
for i in 0..(1u16 << 10) {
let t = builder.start_table();
for j in 0..15 {
if i & (1 << j) == 1 {
builder.push_slot_always(i * 2, 42u8);
}
}
builder.end_table(t);
}
builder.reset();
});
bench.bytes = 1 << 15;
}
benchmark_group!(
benches,
create_byte_vector_100_naive,
@@ -247,5 +265,6 @@ benchmark_group!(
traverse_canonical_buffer,
create_canonical_buffer_then_reset,
create_string_10,
create_string_100
create_string_100,
create_many_tables,
);

View File

@@ -354,7 +354,7 @@ fn test_object_api_reads_correctly() -> Result<(), &'static str>{
// Disabled due to Windows CI limitations.
// #[test]
// fn builder_initializes_with_maximum_buffer_size() {
// flatbuffers::FlatBufferBuilder::new_with_capacity(flatbuffers::FLATBUFFERS_MAX_BUFFER_SIZE);
// flatbuffers::FlatBufferBuilder::with_capacity(flatbuffers::FLATBUFFERS_MAX_BUFFER_SIZE);
// }
#[should_panic]