[Java][FlexBuffers] Optimize Map access (#5735)

The original implementation of map access is very naive:
- Encodes the String key to a UTF-8 byte[]
- Creates a new KeyVector
- Performs a binary search to find the key
- Returns the value

So on every access to the Map there were useless allocations of Keys and a KeyVector,
plus a complete encoding of the search key, which for most comparisons is wasteful.

This change completely removes the use of KeyVector and computes the key
positions on the spot. Besides that, it compares keys codepoint-by-codepoint,
avoiding unnecessary allocations and reducing the amount of encoding in most cases.

Some benchmarks result in a 2.75x speedup.
This commit is contained in:
Paulo Pinheiro
2020-03-30 22:46:42 +02:00
committed by GitHub
parent d9fecc3327
commit 925fab6b15
3 changed files with 210 additions and 9 deletions

View File

@@ -18,9 +18,13 @@ package com.google.flatbuffers;
import java.nio.ByteBuffer;
import static java.lang.Character.MAX_SURROGATE;
import static java.lang.Character.MIN_SURROGATE;
import static java.lang.Character.MIN_HIGH_SURROGATE;
import static java.lang.Character.MIN_LOW_SURROGATE;
import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT;
import static java.lang.Character.isSurrogatePair;
import static java.lang.Character.toCodePoint;
public abstract class Utf8 {
@@ -73,6 +77,56 @@ public abstract class Utf8 {
DEFAULT = instance;
}
/**
 * Encodes the single code point that begins at index {@code start} of {@code in} as UTF-8,
 * writing the resulting bytes to the front of {@code out}.
 *
 * <p>A code point is either one {@code char} or a high/low surrogate pair occupying the two
 * chars at {@code start} and {@code start + 1}.
 *
 * @param in CharSequence holding the code point to encode
 * @param start index in {@code in} of the first char of the code point
 * @param out destination array; must have room for 4 bytes (checked with an {@code assert},
 *     so only verified when assertions are enabled)
 * @return the number of bytes written to {@code out} (1 to 4), or 0 when {@code start} is at
 *     or past the end of {@code in}
 * @throws UnpairedSurrogateException if the char at {@code start} is a surrogate that is not
 *     the high half of a valid surrogate pair completed by the next char
 */
public static int encodeUtf8CodePoint(CharSequence in, int start, byte[] out) {
  // A single code point encodes to at most 4 UTF-8 bytes, so 4 slots always suffice.
  assert out.length >= 4;

  final int length = in.length();
  if (start >= length) {
    // Nothing left to encode.
    return 0;
  }

  final char lead = in.charAt(start);

  // One byte (0xxx xxxx): plain ASCII.
  if (lead < 0x80) {
    out[0] = (byte) lead;
    return 1;
  }

  // Two bytes (110x xxxx 10xx xxxx).
  if (lead < 0x800) {
    out[0] = (byte) (0xC0 | (lead >>> 6));
    out[1] = (byte) (0x80 | (lead & 0x3F));
    return 2;
  }

  // Three bytes (1110 xxxx 10xx xxxx 10xx xxxx): any remaining char that is not a surrogate.
  // The largest single-char code point is 0xFFFF, i.e. 16 bits.
  final boolean leadIsSurrogate = lead >= MIN_SURROGATE && lead <= MAX_SURROGATE;
  if (!leadIsSurrogate) {
    out[0] = (byte) (0xE0 | (lead >>> 12));
    out[1] = (byte) (0x80 | ((lead >>> 6) & 0x3F));
    out[2] = (byte) (0x80 | (lead & 0x3F));
    return 3;
  }

  // Four bytes (1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx): a surrogate pair. The smallest code
  // point a pair can represent is 0x10000 (17 bits), which needs four UTF-8 bytes.
  final int trailIndex = start + 1;
  if (trailIndex == length) {
    // The sequence ends right after the surrogate; there is no second half to pair with.
    throw new UnpairedSurrogateException(start, length);
  }
  final char trail = in.charAt(trailIndex);
  if (!isSurrogatePair(lead, trail)) {
    throw new UnpairedSurrogateException(start, length);
  }

  final int codePoint = toCodePoint(lead, trail);
  out[0] = (byte) (0xF0 | (codePoint >>> 18));
  out[1] = (byte) (0x80 | ((codePoint >>> 12) & 0x3F));
  out[2] = (byte) (0x80 | ((codePoint >>> 6) & 0x3F));
  out[3] = (byte) (0x80 | (codePoint & 0x3F));
  return 4;
}
/**
* Utility methods for decoding bytes into {@link String}. Callers are responsible for extracting
* bytes (possibly using Unsafe methods), and checking remaining bytes. All other UTF-8 validity