How To Store Random Integers From The Set Of 32-bit Integers In A Compact Data Structure To Rapidly Check Membership?
Solution 1:
You wrote:
One approach I am thinking that might work that fits these constraints is a sort of tree, like a B+tree. [...] I think it would be like a key-only B+tree (no values, and hence no leaves). But not entirely sure what this would look like (in JavaScript, if I had to pick a language).
Indeed, a key-only B(+)-tree would be a possible solution. It solves the problem of the wasted space you get from a sparse array solution. It is also a solution that works well when the data type happens to be string or float instead of number, which would be a problem for the sparse array solution.
The code I post below is similar to another B+ tree related answer I posted earlier on another question of yours, but which was not a search tree.
The idea is to use a B+ tree, taking into account the extra requirement about array sizes that are powers of 2. The actual data keys are found in the bottom level of the tree. The internal nodes repeat the smallest key that its subtree contains. This can be used for deciding which child to choose when walking from the root down the tree, in search for a bottom-level location where a key should be found or should be inserted.
The tree is kept compact by distributing children/keys to neighbors when a node gets full, only splitting a node when there is no room in the neighbors either. When a key is deleted, the affected node will merge with a neighbor when possible. Each node (except the root) is guaranteed to have at least half the use of the maximum (32) node capacity -- either for storing children or keys (at the bottom level).
Keys are located starting the search at the root. A child is selected using binary search so that the key of that child is the greatest key that is less or equal to the key were are locating. If the key is not found at the bottom level, we have at least the location where it should be, and it could be inserted there.
Implementation
classNode {
constructor(capacity) {
// Mimic fixed-size array (avoid accidentally growing it)this.children = Object.seal(Array(capacity).fill(null));
this.childCount = 0; // Number of used slots in children arraythis.key = null; // Property for supporting a search// Maintain back-link to parent.this.parent = null;
// Per level in the tree, maintain a doubly linked listthis.prev = this.next = null;
}
setCapacity(capacity) {
if (capacity < 1) return;
// Here we make a new array, and copy the data into itlet children = Object.seal(Array(capacity).fill(null));
for (let i = 0; i < this.childCount; i++) children[i] = this.children[i];
this.children = children;
}
isLeaf() {
return !(this.children[0] instanceofNode);
}
index() {
returnthis.parent.children.indexOf(this);
}
updateKey() {
this.key = this.isLeaf() ? this.children[0] : this.children[0].key;
for (let node = this.parent; node; node = node.parent) {
node.key = node.children[0].key;
}
}
wipe(start, end) {
this.children.copyWithin(start, end, this.childCount);
for (let i = this.childCount - end + start; i < this.childCount; i++) {
this.children[i] = null;
}
this.childCount -= end - start;
// Reduce allocated size if possibleif (this.childCount * 2 <= this.children.length) this.setCapacity(this.children.length / 2);
// Update key if first item changedif (start === 0) this.updateKey();
}
moveFrom(neighbor, target, start, count=1) {
// Note: `start` can have two meanings:// if neighbor is null, it is the key/Node to move to the target// if neighbor is a Node, it is the index from where keys(s)/Node(s) have to be moved to the target// Make room in target nodeif (this.childCount + count > this.children.length) this.setCapacity(this.children.length * 2);
this.children.copyWithin(target + count, target, Math.max(target + count, this.childCount));
this.childCount += count;
if (neighbor !== null) {
// Copy the childrenfor (let i = 0; i < count; i++) {
this.children[target + i] = neighbor.children[start + i];
}
// Remove the original references
neighbor.wipe(start, start + count);
} else {
this.children[target] = start; // start is key to insert
}
// Set parent link(s)if (!this.isLeaf()) {
for (let i = 0; i < count; i++) {
this.children[target + i].parent = this;
}
}
// Update key if first item changedif (target === 0) this.updateKey();
}
moveToNext(count) {
this.next.moveFrom(this, 0, this.childCount - count, count);
}
moveFromNext(count) {
this.moveFrom(this.next, this.childCount, 0, count);
}
basicRemove(index) {
if (!this.isLeaf()) {
// Take node out of the level's linked listlet prev = this.children[index].prev;
let next = this.children[index].next;
if (prev) prev.next = next;
if (next) next.prev = prev;
}
this.wipe(index, index + 1);
}
basicInsert(index, value) { // value can be a key or a Nodethis.moveFrom(null, index, value);
if (value instanceofNode) {
// Insert node in the level's linked listif (index > 0) {
value.prev = this.children[index-1];
value.next = value.prev.next;
} elseif (this.childCount > 1) {
value.next = this.children[1];
value.prev = value.next.prev;
}
if (value.prev) value.prev.next = value;
if (value.next) value.next.prev = value;
}
}
pairWithSmallest() {
returnthis.prev && (!this.next || this.next.childCount > this.prev.childCount)
? [this.prev, this] : [this, this.next];
}
toString() {
return"[" + this.children.map(v => v??"-").join() + "]";
}
}
classTree {
constructor(nodeCapacity=32) {
this.nodeCapacity = nodeCapacity;
this.root = newNode(1);
this.first = this.root; // Head of doubly linked list at bottom level
}
locate(key) {
let node = this.root;
while (!node.isLeaf()) {
// Binary search among keyslet low = 0;
let high = node.childCount;
while (low < high) {
let index = (low + high) >> 1;
if (key >= node.children[index].key) {
low = index + 1;
} else {
high = index;
}
}
if (low) low--;
node = node.children[low];
}
// Binary search in leaflet low = 0;
let high = node.childCount;
while (low < high) {
let index = (low + high) >> 1;
if (key > node.children[index]) {
low = index + 1;
} else {
high = index;
}
}
return [node, low];
}
has(key) {
let [node, index] = this.locate(key);
return index < node.childCount && node.children[index] === key;
}
remove(key) {
let [node, index] = this.locate(key);
if (index >= node.childCount || node.children[index] !== key) return; // not foundwhile (true) {
node.basicRemove(index);
// Exit when node's fill ratio is fineif (!node.parent || node.childCount * 2 > this.nodeCapacity) return;
// Node has potentially too few children, we should either merge or redistributelet [left, right] = node.pairWithSmallest();
if (!left || !right) { // A node with no siblings? Must become the root!this.root = node;
node.parent = null;
return;
}
let sumCount = left.childCount + right.childCount;
let childCount = sumCount >> 1;
// Check whether to merge or to redistributeif (sumCount > this.nodeCapacity) { // redistribute// Move some data from the bigger to the smaller nodelet shift = childCount - node.childCount;
if (!shift) { // Boundary case: when a redistribution would bring no improvementconsole.assert(node.childCount * 2 === this.nodeCapacity && sumCount === this.nodeCapacity + 1);
return;
}
if (node === left) { // move some children from right to left
left.moveFromNext(shift);
} else { // move some children from left to right
left.moveToNext(shift);
}
return;
}
// Merge:// Move all data from the right to the left
left.moveFromNext(right.childCount);
// Prepare to delete right node
node = right.parent;
index = right.index();
}
}
insert(value) { // value is a keylet [node, index] = this.locate(value);
if (index < node.childCount && node[index] === value) return; // already presentwhile (node.childCount === this.nodeCapacity) { // No room hereif (index === 0 && node.prev && node.prev.childCount < this.nodeCapacity) {
return node.prev.basicInsert(node.prev.childCount, value);
}
// Check whether we can redistribute (to avoid a split)if (node !== this.root) {
let [left, right] = node.pairWithSmallest();
let joinedIndex = left === node ? index : left.childCount + index;
let sumCount = left.childCount + right.childCount + 1;
if (sumCount <= 2 * this.nodeCapacity) { // redistributelet childCount = sumCount >> 1;
if (node === right) { // redistribute to the leftlet insertInLeft = joinedIndex < childCount;
left.moveFromNext(childCount - left.childCount - +insertInLeft);
} else { // redistribute to the rightlet insertInRight = index >= sumCount - childCount;
left.moveToNext(childCount - right.childCount - +insertInRight);
}
if (joinedIndex > left.childCount ||
joinedIndex === left.childCount && left.childCount > right.childCount) {
right.basicInsert(joinedIndex - left.childCount, value);
} else {
left.basicInsert(joinedIndex, value);
}
return;
}
}
// Cannot redistribute: split nodelet childCount = node.childCount >> 1;
// Create a new node that will later become the right sibling of this nodelet sibling = newNode(childCount);
// Move half of node node's data to it
sibling.moveFrom(node, 0, childCount, childCount);
// Insert the value in either the current node or the new oneif (index > node.childCount) {
sibling.basicInsert(index - node.childCount, value);
} else {
node.basicInsert(index, value);
}
// Is this the root? if (!node.parent) {
// ...then first create a parent, which is the new rootthis.root = newNode(2);
this.root.basicInsert(0, node);
}
// Prepare for inserting the sibling node into the tree
index = node.index() + 1;
node = node.parent;
value = sibling; // value is now a Node
}
node.basicInsert(index, value);
}
/* Below this point: these methods are optional */
* [Symbol.iterator]() { // Make tree iterablelet i = 0;
for (let node = this.first; node; node = node.next) {
for (let i = 0; i < node.childCount; i++) yield node.children[i];
}
}
print() {
console.log(this.root && this.root.toString());
}
verify() {
// Raise an error when the tree violates one of the required propertiesif (!this.root) return; // An empty tree is fine.if (this.root.parent) throw"root should not have a parent";
// Perform a breadth first traversallet q = [this.root];
while (q.length) {
if (q[0].isLeaf() && this.first !== q[0]) throw"this.first is not pointing to first leaf";
let level = [];
let last = null;
for (let parent of q) {
if (!(parent instanceofNode)) throw"parent is not instance of Node";
if (parent.children.length > this.nodeCapacity) throw"node's children array is too large";
if (parent.childCount > 0 && parent.childCount * 2 <= parent.children.length) throw"node's fill ratio is too low";
for (let i = parent.childCount; i < parent.children.length; i++) {
if (parent.children[i] !== null) throw"child beyond childCount should be null but is not";
}
if (parent.isLeaf()) {
if (parent.children[0] !== parent.key) throw"key does not match with first child value";
for (let value of parent.children.slice(0, parent.childCount)) {
if (value === null) throw"leaf has a null as value";
if (value instanceofNode) throw"leaf has a Node as value";
}
} else {
if (parent.children[0].key !== parent.key) throw"key does not match with first child's key";
for (let node of parent.children.slice(0, parent.childCount)) {
if (node === null) throw"internal node has a null as value";
if (!(node instanceofNode)) throw"internal node has a non-Node as value";
if (node.parent !== parent) throw"wrong parent";
if (node.prev !== last) throw"prev link incorrect";
if (last && last.next !== node) throw"next link incorrect";
if (last && last.children.length + node.children.length <= this.nodeCapacity) {
throw"two consecutive siblings have a total number of children that is too small";
}
if (node.childCount * 2 < this.nodeCapacity) {
throw"internal node is too small: " + node;
}
level.push(node);
last = node;
}
}
}
if (last && last.next) throw"last node in level has a next reference";
q = level;
}
}
test(count=100, option=3) {
// option:// 0 = always insert & delete at left side (offset 0)// 1 = always insert & delete at right side// 2 = always insert & delete at middle// 3 = insert & delete at random offsets// Create array to perform the same operations on it as on the treelet max = count*2;
let arr = [];
// Perform a series of insertionsfor (let i = 0; i < count; i++) {
// Choose random valuelet key = Math.floor(Math.random() * max);
// Perform same insertion in array and tree
arr.push(key);
this.insert(key);
// Verify tree consistency and propertiesthis.verify();
// Verify the order of keys in the array is the same as in the treeif (arr.sort((a,b) => a-b)+"" !== [...this]+"") throw i + ": tree not same as array";
}
// Perform a series of has-callsfor (let i = 0; i < count; i++) {
// Choose random update indexlet key = Math.floor(Math.random() * max);
// Perform same insertion in array and treelet has = arr.includes(key);
if (has !== this.has(key)) {
throw"has() returns inconsistent result";
}
if (!has) {
this.remove(key); // should not alter the tree// Verify the order of keys in the array is the same as in the treeif (arr+"" !== [...this]+"") throw i + ": tree not same as array";
}
}
// Perform a series of deletionsfor (let i = arr.length - 1; i >= 0; i--) {
// Choose random deletion key let key = arr[Math.floor(Math.random() * i)];
// Perform same deletion in array and tree
arr.splice(arr.indexOf(key), 1);
this.remove(key);
// Verify tree consistency and propertiesthis.verify();
// Verify the order of keys in the array is the same as in the treeif (arr+"" !== [...this]+"") throw"tree not same as array";
}
}
}
// Perform 1000 insertions, 1000 updates, and 1000 deletions on a tree with node capacity of 8newTree(8).test(1000);
console.log("all tests completed");
Supported data types
The data type of the keys can be anything for which comparison operators work. So for strings or numbers it will work. In JavaScript it will also work for objects that have implemented either a valueOf
or toString
method. For instance, this is already the case for native Date
objects. JavaScript will first try the valueOf
method and in absence the toString
method, which is also defined on the Object prototype.
Considerations
I stored one key
in each node, as this is an in-memory solution. Historically, B(+)-trees are used for disk storage, where it is important to keep the number of block reads low. A standard B(+)-tree implementation stores the keys one level higher, in the parent, as an array. That way the search does not have to load each child record, while searching for the right one to choose.
Also, it then does not need to store the key of its first child, as we really only need the keys that separate two consecutive children.
Solution 2:
Here is my suggested approach.
Every node should be a 32 entry array. If the first entry is null
or an array, it is a 32-way split of the whole search space. Otherwise it is descending list of the entries in this block. Fill in the non-entries with -1.
This means that in no more than 5 lookups we get to the block that either has or doesn't have our value. We then do a linear scan. A binary search of a sorted list naively seems like it would be more efficient, but in fact it involves a series of hard to predict branches which CPUs hate. In a low level language (which I hope yours is), it is faster to avoid the pipeline stall and do a linear search.
Here is an implementation in JavaScript.
classIntegerSet {
constructor() {
this.make_node = function () {
let node = newArray();
for (let i = 0; i < 32; i++) {
node[i] = -1;
}
return node;
};
this.root = this.make_node();
this.lookup = function (target) {
let node = this.root;
let size = 2**32;
while (Array.isArray(node[0])) {
size /= 32;
let idx = Math.floor(target / size);
node = node[idx];
target = target % size;
}
for (let i = 0; i < 32; i++) {
if (target == node[i]) {
returntrue;
}
// On average this lets us break out 1/4 of the way through.elseif (node[i] < target) {
returnfalse;
}
}
returnfalse;
};
this.insert = function (target, node) {
node = node || this.root;
let size = 2**32;
while (Array.isArray(node[0])) {
size /= 32;
let idx = Math.floor(target / size);
node = node[idx];
target = target % size;
}
for (let i = 0; i < 32; i++) {
if (target == node[i]) {
returnfalse;
}
elseif (node[i] < target) {
// copy elements along, in order.let value = node[i];
for (let j = i; j < 31; j++) {
node[j] = target;
target = value;
value = node[j+1];
if (target == -1) {
break;
}
}
node[31] = target; // just in case the block is fullif (-1 == target) {
returntrue;
}
else {
break; // we inserted and filled the node.
}
}
}
// If we get here, we need to split the node.let node_copy = this.make_node();
for (let i = 0; i < 32; i++) {
node_copy[i] = node[i];
node[i] = this.make_node();
}
size /= 32;
let idx = Math.floor(target / size);
node[idx][0] = target % size;
for (let i = 0; i < 32; i++) {
let value = node_copy[i];
idx = Math.floor(value / size);
this.insert(value % size, node[idx]);
}
};
this.remove = function (target) {
let node = this.root;
let size = 2**32;
while (Array.isArray(node[0])) {
size /= 32;
let idx = Math.floor(target / size);
node = node[idx];
target = target % size;
}
for (let i = 0; i < 32; i++) {
if (target == node[i]) {
for (let j = i; j < 31; j++) {
node[j] = node[j+1];
if (node[j] == -1) {
break;
}
}
node[31] = -1;
returntrue;
}
}
returnfalse;
};
}
}
let foo = newIntegerSet();
for (let i = 0; i < 300; i++) {
foo.insert(Math.floor(Math.random() * 2**32));
}
console.log(foo.root);
console.log(foo.lookup(5));
console.log(foo.remove(5));
console.log(foo.insert(5));
console.log(foo.root);
console.log(foo.lookup(5));
console.log(foo.remove(5));
Here are the characteristics of this data structure.
A read requires no more than 5 lookups to get to a block, then a scan with at most 64 if statements (which are well-predicted to always be false). If you insert a lot of random elements, the leaves are on average half-full, and on average we only do 1/4 of the scan.
The fact that there are 32 children of each internal node mean that the internal nodes only add a few percent overhead to the structure.
If you insert and then delete, performance remains good but we never will reclaim memory.
The following is an implementation that is similar to the previous but uses arrays of different size, and allows you to reclaim space. It aims to make 60-70% of the space actually data. But this comes, obviously, with penalties on insert/remove.
classIntegerSet {
constructor() {
this.make_node = function (size) {
letnode=newArray();
for (leti=0; i < size; i++) {
node[i] = -1;
}
node.child_size = 0;
return node;
};
this.root = this.make_node(2);
this.lookup = function (target) {
letnode=this.root;
letsize=2**32;
while (Array.isArray(node[0])) {
size /= 32;
letidx= Math.floor(target / size);
node = node[idx];
target = target % size;
}
leti=0;
while (target < node[i]) {
i++;
}
returntarget== node[i];
};
this.insert = function (target, node, parent_node, size) {
node = node || this.root;
if (size == null) {
size = 2**32;
}
letidx=null;
letupdate_parent=false;
while (Array.isArray(node[0])) {
size /= 32;
parent_node = node;
update_parent = true;
idx = Math.floor(target / size);
node = node[idx];
target = target % size;
}
leti=0;
while (target < node[i]) {
i++;
}
if (target == node[i]) {
returntrue; // already had it.
}
else {
// copy along, in order.letvalue= node[i];
while (-1 < target) {
value = node[i]
node[i] = target;
i++;
target = value;
}
if (update_parent) {
parent_node.child_size++;
}
if (i == node.length) {
letnew_node=null;
if (node.length < 32) {
// Double node size.
new_node = this.make_node(node.length * 2);
for (letj=0; j < i; j++) {
new_node[j] = node[j];
}
}
else {
// Need to split.
new_node = this.make_node(node.length);
for (letj=0; j < node.length; j++) {
new_node[j] = this.make_node(2);
}
new_node.child_size = node.length;
size /= 32;
for (letj=0; j < 32; j++) {
target = node[j];
letnew_idx= Math.floor(target / size);
target = target % size;
this.insert(target, new_node[new_idx], parent_node, size);
}
}
if (parent_node != null) {
parent_node[idx] = new_node;
}
else {
this.root = new_node;
}
}
}
};
this.remove = function (target) {
letnode=this.root;
letgrandparent_node=null;
letparent_node=null;
letsize=2**32;
letidx=null;
letparent_idx=null;
while (Array.isArray(node[0])) {
size /= 32;
parent_idx = idx;
idx = Math.floor(target / size);
grandparent_node = parent_node;
parent_node = node;
node = node[idx];
target = target % size;
}
leti=0;
while (target < node[i]) {
i++;
}
if (node[i] == target) {
// now delete along.while (-1 < node[i]) {
node[i] = node[i+1];
i++;
}
if (parent_node != null) {
parent_node.child_size--;
// Reconsolidate if we save about 1/4 of memory.if (parent_node.child_size < 24) {
letnew_node=this.make_node(32);
letnew_id=0;
for (leti=31; -1 < i; i--) {
letold_node= parent_node[i];
letj=0;
while (-1 < old_node[j]) {
new_node[new_id] = size * i + old_node[j];
new_id++;
j++;
}
}
if (grandparent_node == null) {
this.root = new_node;
}
else {
grandparent_node[parent_idx] = new_node;
grandparent_node.child_size += parent_node.child_size - 33;
}
returntrue;
}
}
// Reconsolidate if we save about 2/3 of memory.if (i < node.length / 3 && 2 < node.length) {
letnew_node=this.make_node(node.length/2);
for (letj=0; j < i; j++) {
new_node[j] = node[j];
}
if (parent_node != null) {
parent_node[idx] = new_node;
}
else {
this.root = new_node;
}
}
returntrue;
}
else {
returnfalse;
}
};
}
}
letfoo=newIntegerSet();
letkeep=newArray;
for (leti=0; i < 5; i++) {
keep.push(Math.floor(Math.random() * 2**32));
}
letremove=newArray;
for (leti=0; i < 300; i++) {
remove.push(Math.floor(Math.random() * 2**32));
}
for (leti=0; i < keep.length; i++) {
foo.insert(keep[i]);
}
for (leti=0; i < remove.length; i++) {
foo.insert(remove[i]);
}
for (leti=0; i < remove.length; i++) {
foo.remove(remove[i]);
}
console.log(foo.root);
for (leti=0; i < keep.length; i++) {
console.log(keep[i]);
console.log(foo.lookup(keep[i]));
console.log(foo.remove(keep[i]));
console.log(foo.lookup(keep[i]));
console.log(foo.remove(keep[i]));
}
Post a Comment for "How To Store Random Integers From The Set Of 32-bit Integers In A Compact Data Structure To Rapidly Check Membership?"