If you ever have encountered issue with database performance, you might have heard about indexing. Indexing is a way to improve the performance of a database by reducing the number of disk accesses required when a query is processed. It is a data structure technique which is used to quickly locate and access the data in a database. Indexes are created using a few database columns.
Let’s illustrate this with an example. Suppose you have a table called users with the following columns:
from dataclasses import dataclass@dataclassclass User:id: int name: str age: int country: str planet: str galaxy: strdef__str__(self):returnf'{self.id}\t{self.name}\t{self.age}\t{self.country}\t{self.planet}\t{self.galaxy}'first_names = ['Agus', 'Budi', 'Charlie', 'Dina', 'Erika', 'Fandi', 'Gina', 'Hani', 'Ika', 'Joni']last_names = ['Wijaya', 'Mulyana', 'Setiawan', 'Susanto', 'Gusti', 'Saputra', 'Surya', 'Andini', 'Wijayanti', 'Saputri']ages =range(20, 70)countries = ['Indonesia', 'Malaysia', 'Singapore', 'Thailand', 'Vietnam', 'Laos', 'Myanmar', 'Brunei', 'Philippines', 'Timor Leste']planets = ['Mercury', 'Venus', 'Earth', 'Mars', 'Jupiter', 'Saturn', 'Uranus', 'Neptunus', 'Pluto']galaxies = ['Milky Way', 'Andromeda', 'Triangulum', 'Messier 81', 'Messier 82', 'Centaurus A', 'Messier 87', 'Messier 49', 'Messier 51', 'Messier 101']def generate_random_users():import random users = []id=0for fn in first_names:for ln in last_names:for age in ages:for country in countries:for planet in planets:for galaxy in galaxies: users.append(User(id, f'{fn}{ln}', age, country, planet, galaxy))id+=1return usersusers = generate_random_users()print('Size: ', len(users))print('id\tname\tage\tcountry')print('--------------------------------')for user in users[:20]:print(str(user))
Size: 4500000
id name age country
--------------------------------
0 Agus Wijaya 20 Indonesia Mercury Milky Way
1 Agus Wijaya 20 Indonesia Mercury Andromeda
2 Agus Wijaya 20 Indonesia Mercury Triangulum
3 Agus Wijaya 20 Indonesia Mercury Messier 81
4 Agus Wijaya 20 Indonesia Mercury Messier 82
5 Agus Wijaya 20 Indonesia Mercury Centaurus A
6 Agus Wijaya 20 Indonesia Mercury Messier 87
7 Agus Wijaya 20 Indonesia Mercury Messier 49
8 Agus Wijaya 20 Indonesia Mercury Messier 51
9 Agus Wijaya 20 Indonesia Mercury Messier 101
10 Agus Wijaya 20 Indonesia Venus Milky Way
11 Agus Wijaya 20 Indonesia Venus Andromeda
12 Agus Wijaya 20 Indonesia Venus Triangulum
13 Agus Wijaya 20 Indonesia Venus Messier 81
14 Agus Wijaya 20 Indonesia Venus Messier 82
15 Agus Wijaya 20 Indonesia Venus Centaurus A
16 Agus Wijaya 20 Indonesia Venus Messier 87
17 Agus Wijaya 20 Indonesia Venus Messier 49
18 Agus Wijaya 20 Indonesia Venus Messier 51
19 Agus Wijaya 20 Indonesia Venus Messier 101
Now, what’s the detail of the user with id 5? To find this, the database will have to scan through all the rows in the table.
This is called a full table scan. If the table has a million rows, it will have to scan through all the million rows to find the user with id 5. This is a very slow process.
def find_user_by_id(id):for user in users:if user.id==id:return user%time find_user_by_id(500_000)
CPU times: user 12.2 ms, sys: 9.06 ms, total: 21.2 ms
Wall time: 20 ms
We can of course make this faster by storing in a HashMap:
def build_id_index():return {user.id: user for user in users}def find_user_by_id(id_index, id):return id_index[id] # O(1)%time users_by_id = build_id_index()# take first 20 users_by_idprint('id\tname\tage\tcountry')print('--------------------------------')ctr =0for user in users_by_id.items():print(user)if ctr ==5:break ctr +=1
CPU times: user 157 ms, sys: 317 ms, total: 474 ms
Wall time: 531 ms
id name age country
--------------------------------
(0, User(id=0, name='Agus Wijaya', age=20, country='Indonesia', planet='Mercury', galaxy='Milky Way'))
(1, User(id=1, name='Agus Wijaya', age=20, country='Indonesia', planet='Mercury', galaxy='Andromeda'))
(2, User(id=2, name='Agus Wijaya', age=20, country='Indonesia', planet='Mercury', galaxy='Triangulum'))
(3, User(id=3, name='Agus Wijaya', age=20, country='Indonesia', planet='Mercury', galaxy='Messier 81'))
(4, User(id=4, name='Agus Wijaya', age=20, country='Indonesia', planet='Mercury', galaxy='Messier 82'))
(5, User(id=5, name='Agus Wijaya', age=20, country='Indonesia', planet='Mercury', galaxy='Centaurus A'))
It takes longer to build the HashMap, but once it’s built, we can find the user with id 5 in constant time (O(1)).
Building index is equivalent to this SQL:
CREATEINDEXidON users (id);-- usually primary key is indexed by default
That’s a magic script that will make your database queries run (much) faster*
Other Column Index
How if now we want to find all the users with age 25? We can’t do that with the current index. We can only find the user with id 5.
Unless we build another HashMap with age as the key.
from collections import defaultdictdef build_age_index(): age_index = defaultdict(list)for user in users: age_index[user.age].append(user)return age_indexdef find_users_by_age(age_index, age):return age_index[age]%time age_index = build_age_index()%time find_users_by_age(age_index, 50)[:10]
CPU times: user 147 ms, sys: 30.5 ms, total: 177 ms
Wall time: 180 ms
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 2.62 µs
Building the index takes longer, so - for demo purpose - let’s just build the index for the first 1000 users:
from sys import setrecursionlimitdef build_age_index_bst(): age_index = BST(None, None, users[0].age, users[0])for user in users[1:10_000]: age_index.insert(user.age, user)return age_indexdef find_users_by_age_bst(age_index, age):return age_index.find(age)def find_users_by_age_bst_between(age_index, age1, age2):return age_index.find_between(age1, age2)%time age_index_bst = build_age_index_bst()%time print(find_users_by_age_bst(age_index_bst, 20))selected_users =%time find_users_by_age_bst_between(age_index_bst, 20, 30)for user in selected_users[:10]:print(user)
CPU times: user 83 µs, sys: 0 ns, total: 83 µs
Wall time: 85.1 µs
43 Agus Wijaya 20 Indonesia Jupiter Messier 81
CPU times: user 11 µs, sys: 0 ns, total: 11 µs
Wall time: 14.1 µs
CPU times: user 14 µs, sys: 0 ns, total: 14 µs
Wall time: 15.3 µs
43 Agus Wijaya 20 Indonesia Jupiter Messier 81
1 Agus Wijaya 20 Indonesia Mercury Andromeda
28 Agus Wijaya 20 Indonesia Earth Messier 51
14 Agus Wijaya 20 Indonesia Venus Messier 82
36 Agus Wijaya 20 Indonesia Mars Messier 87
12 Agus Wijaya 20 Indonesia Venus Triangulum
0 Agus Wijaya 20 Indonesia Mercury Milky Way
27 Agus Wijaya 20 Indonesia Earth Messier 49
47 Agus Wijaya 20 Indonesia Jupiter Messier 49
7 Agus Wijaya 20 Indonesia Mercury Messier 49
It takes longer to build the index, but when the index is built, we can do range queries.
It’s equivalent to this SQL:
SELECT*FROM users WHERE age >25AND age <30;
The Problem of BST
The problem with BST is that it’s not always balanced. If we insert the data in sorted order, the tree will be skewed:
# random dataimport randomrandom.seed(0)users = generate_random_users()[:50]random.shuffle(users)bst = BST(None, None, users[0].id, users[0])for d in users[1:]: bst.insert(d.id, d)draw(bst, 0)
├──43 -> (43 Agus Wijaya 20 Indonesia Jupiter Messier 81)
├──1 -> (1 Agus Wijaya 20 Indonesia Mercury Andromeda)
├──0 -> (0 Agus Wijaya 20 Indonesia Mercury Milky Way)
├──28 -> (28 Agus Wijaya 20 Indonesia Earth Messier 51)
├──14 -> (14 Agus Wijaya 20 Indonesia Venus Messier 82)
├──12 -> (12 Agus Wijaya 20 Indonesia Venus Triangulum)
├──7 -> (7 Agus Wijaya 20 Indonesia Mercury Messier 49)
├──5 -> (5 Agus Wijaya 20 Indonesia Mercury Centaurus A)
├──3 -> (3 Agus Wijaya 20 Indonesia Mercury Messier 81)
├──2 -> (2 Agus Wijaya 20 Indonesia Mercury Triangulum)
├──4 -> (4 Agus Wijaya 20 Indonesia Mercury Messier 82)
├──6 -> (6 Agus Wijaya 20 Indonesia Mercury Messier 87)
├──11 -> (11 Agus Wijaya 20 Indonesia Venus Andromeda)
├──10 -> (10 Agus Wijaya 20 Indonesia Venus Milky Way)
├──9 -> (9 Agus Wijaya 20 Indonesia Mercury Messier 101)
├──8 -> (8 Agus Wijaya 20 Indonesia Mercury Messier 51)
├──13 -> (13 Agus Wijaya 20 Indonesia Venus Messier 81)
├──27 -> (27 Agus Wijaya 20 Indonesia Earth Messier 49)
├──20 -> (20 Agus Wijaya 20 Indonesia Earth Milky Way)
├──15 -> (15 Agus Wijaya 20 Indonesia Venus Centaurus A)
├──17 -> (17 Agus Wijaya 20 Indonesia Venus Messier 49)
├──16 -> (16 Agus Wijaya 20 Indonesia Venus Messier 87)
├──18 -> (18 Agus Wijaya 20 Indonesia Venus Messier 51)
├──19 -> (19 Agus Wijaya 20 Indonesia Venus Messier 101)
├──23 -> (23 Agus Wijaya 20 Indonesia Earth Messier 81)
├──21 -> (21 Agus Wijaya 20 Indonesia Earth Andromeda)
├──22 -> (22 Agus Wijaya 20 Indonesia Earth Triangulum)
├──25 -> (25 Agus Wijaya 20 Indonesia Earth Centaurus A)
├──24 -> (24 Agus Wijaya 20 Indonesia Earth Messier 82)
├──26 -> (26 Agus Wijaya 20 Indonesia Earth Messier 87)
├──36 -> (36 Agus Wijaya 20 Indonesia Mars Messier 87)
├──33 -> (33 Agus Wijaya 20 Indonesia Mars Messier 81)
├──29 -> (29 Agus Wijaya 20 Indonesia Earth Messier 101)
├──30 -> (30 Agus Wijaya 20 Indonesia Mars Milky Way)
├──31 -> (31 Agus Wijaya 20 Indonesia Mars Andromeda)
├──32 -> (32 Agus Wijaya 20 Indonesia Mars Triangulum)
├──34 -> (34 Agus Wijaya 20 Indonesia Mars Messier 82)
├──35 -> (35 Agus Wijaya 20 Indonesia Mars Centaurus A)
├──38 -> (38 Agus Wijaya 20 Indonesia Mars Messier 51)
├──37 -> (37 Agus Wijaya 20 Indonesia Mars Messier 49)
├──40 -> (40 Agus Wijaya 20 Indonesia Jupiter Milky Way)
├──39 -> (39 Agus Wijaya 20 Indonesia Mars Messier 101)
├──41 -> (41 Agus Wijaya 20 Indonesia Jupiter Andromeda)
├──42 -> (42 Agus Wijaya 20 Indonesia Jupiter Triangulum)
├──47 -> (47 Agus Wijaya 20 Indonesia Jupiter Messier 49)
├──46 -> (46 Agus Wijaya 20 Indonesia Jupiter Messier 87)
├──45 -> (45 Agus Wijaya 20 Indonesia Jupiter Centaurus A)
├──44 -> (44 Agus Wijaya 20 Indonesia Jupiter Messier 82)
├──49 -> (49 Agus Wijaya 20 Indonesia Jupiter Messier 101)
├──48 -> (48 Agus Wijaya 20 Indonesia Jupiter Messier 51)
But if we insert the data in sorted order, the tree will be skewed:
users = generate_random_users()[:50]bst = BST(None, None, users[0].id, users[0])for d in users[1:]: bst.insert(d.id, d)draw(bst, 0)
├──0 -> (0 Agus Wijaya 20 Indonesia Mercury Milky Way)
├──1 -> (1 Agus Wijaya 20 Indonesia Mercury Andromeda)
├──2 -> (2 Agus Wijaya 20 Indonesia Mercury Triangulum)
├──3 -> (3 Agus Wijaya 20 Indonesia Mercury Messier 81)
├──4 -> (4 Agus Wijaya 20 Indonesia Mercury Messier 82)
├──5 -> (5 Agus Wijaya 20 Indonesia Mercury Centaurus A)
├──6 -> (6 Agus Wijaya 20 Indonesia Mercury Messier 87)
├──7 -> (7 Agus Wijaya 20 Indonesia Mercury Messier 49)
├──8 -> (8 Agus Wijaya 20 Indonesia Mercury Messier 51)
├──9 -> (9 Agus Wijaya 20 Indonesia Mercury Messier 101)
├──10 -> (10 Agus Wijaya 20 Indonesia Venus Milky Way)
├──11 -> (11 Agus Wijaya 20 Indonesia Venus Andromeda)
├──12 -> (12 Agus Wijaya 20 Indonesia Venus Triangulum)
├──13 -> (13 Agus Wijaya 20 Indonesia Venus Messier 81)
├──14 -> (14 Agus Wijaya 20 Indonesia Venus Messier 82)
├──15 -> (15 Agus Wijaya 20 Indonesia Venus Centaurus A)
├──16 -> (16 Agus Wijaya 20 Indonesia Venus Messier 87)
├──17 -> (17 Agus Wijaya 20 Indonesia Venus Messier 49)
├──18 -> (18 Agus Wijaya 20 Indonesia Venus Messier 51)
├──19 -> (19 Agus Wijaya 20 Indonesia Venus Messier 101)
├──20 -> (20 Agus Wijaya 20 Indonesia Earth Milky Way)
├──21 -> (21 Agus Wijaya 20 Indonesia Earth Andromeda)
├──22 -> (22 Agus Wijaya 20 Indonesia Earth Triangulum)
├──23 -> (23 Agus Wijaya 20 Indonesia Earth Messier 81)
├──24 -> (24 Agus Wijaya 20 Indonesia Earth Messier 82)
├──25 -> (25 Agus Wijaya 20 Indonesia Earth Centaurus A)
├──26 -> (26 Agus Wijaya 20 Indonesia Earth Messier 87)
├──27 -> (27 Agus Wijaya 20 Indonesia Earth Messier 49)
├──28 -> (28 Agus Wijaya 20 Indonesia Earth Messier 51)
├──29 -> (29 Agus Wijaya 20 Indonesia Earth Messier 101)
├──30 -> (30 Agus Wijaya 20 Indonesia Mars Milky Way)
├──31 -> (31 Agus Wijaya 20 Indonesia Mars Andromeda)
├──32 -> (32 Agus Wijaya 20 Indonesia Mars Triangulum)
├──33 -> (33 Agus Wijaya 20 Indonesia Mars Messier 81)
├──34 -> (34 Agus Wijaya 20 Indonesia Mars Messier 82)
├──35 -> (35 Agus Wijaya 20 Indonesia Mars Centaurus A)
├──36 -> (36 Agus Wijaya 20 Indonesia Mars Messier 87)
├──37 -> (37 Agus Wijaya 20 Indonesia Mars Messier 49)
├──38 -> (38 Agus Wijaya 20 Indonesia Mars Messier 51)
├──39 -> (39 Agus Wijaya 20 Indonesia Mars Messier 101)
├──40 -> (40 Agus Wijaya 20 Indonesia Jupiter Milky Way)
├──41 -> (41 Agus Wijaya 20 Indonesia Jupiter Andromeda)
├──42 -> (42 Agus Wijaya 20 Indonesia Jupiter Triangulum)
├──43 -> (43 Agus Wijaya 20 Indonesia Jupiter Messier 81)
├──44 -> (44 Agus Wijaya 20 Indonesia Jupiter Messier 82)
├──45 -> (45 Agus Wijaya 20 Indonesia Jupiter Centaurus A)
├──46 -> (46 Agus Wijaya 20 Indonesia Jupiter Messier 87)
├──47 -> (47 Agus Wijaya 20 Indonesia Jupiter Messier 49)
├──48 -> (48 Agus Wijaya 20 Indonesia Jupiter Messier 51)
├──49 -> (49 Agus Wijaya 20 Indonesia Jupiter Messier 101)
That’s a very unbalanced tree. It’s equivalent to a linked list.
Balanced Tree
There are many types of balanced tree, such as AVL tree, red-black tree, etc.
We won’t go into details of each type of balanced tree, but let’s just see how they look like:
Disclaimer: code below is GPT generated without being double checked
class BTreeNode:def__init__(self, leaf=False):self.leaf = leafself.keys = []self.child = []class BTree:def__init__(self, t):self.root = BTreeNode(True)self.t = tdef insert(self, k): root =self.rootiflen(root.keys) == (2*self.t) -1: temp = BTreeNode()self.root = temp temp.child.insert(0, root)self.split_child(temp, 0)self.insert_non_full(temp, k)else:self.insert_non_full(root, k)def insert_non_full(self, x, k): i =len(x.keys) -1if x.leaf: x.keys.append((None, None))while i >=0and k < x.keys[i][0]: x.keys[i +1] = x.keys[i] i -=1 x.keys[i +1] = (k, None)else:while i >=0and k < x.keys[i][0]: i -=1 i +=1iflen(x.child[i].keys) == (2*self.t) -1:self.split_child(x, i)if k > x.keys[i][0]: i +=1self.insert_non_full(x.child[i], k)def split_child(self, x, i): t =self.t y = x.child[i] z = BTreeNode(y.leaf) x.child.insert(i +1, z) x.keys.insert(i, y.keys[t -1]) z.keys = y.keys[t: (2* t) -1] y.keys = y.keys[0: t -1]ifnot y.leaf: z.child = y.child[t: 2* t] y.child = y.child[0: t]def traverse(self):self._traverse(self.root)def _traverse(self, node): i =0while i <len(node.keys):ifnot node.leaf:self._traverse(node.child[i])print(node.keys[i][0], end=' ') i +=1ifnot node.leaf:self._traverse(node.child[i])def draw_tree(self):""" This method prints the keys at each level of the B-Tree, with a bounding box around each node. """self._draw_tree(self.root, 0)def _draw_tree(self, node, level):if node: indent =' '* (level *4) # Indentation for levels keys_str =' '.join(str(key[0]) for key in node.keys) node_representation =f"[{keys_str}]"print(indent + node_representation)for child in node.child:self._draw_tree(child, level +1)
b = BTree(2) # A B-Tree with min degree 3b.insert(10)b.insert(20)b.insert(5)b.insert(6)b.insert(12)b.insert(30)b.insert(7)b.insert(17)b.insert(18)b.insert(19)b.insert(21)b.insert(22)b.traverse()
Disclaimer: code below is GPT generated without being double checked
class BPlusTreeNode:def__init__(self, leaf=False):self.leaf = leafself.keys = []self.children = []self.next=None# Pointer to the next node, used in leaf nodesclass BPlusTree:def__init__(self, t):self.root = BPlusTreeNode(leaf=True)self.t = tdef insert(self, key): root =self.rootiflen(root.keys) == (2*self.t) -1: temp = BPlusTreeNode()self.root = temp temp.children.append(root)self.split_child(temp, 0)self._insert_non_full(temp, key)else:self._insert_non_full(root, key)def _insert_non_full(self, node, key): i =len(node.keys) -1if node.leaf: node.keys.append(None)while i >=0and key < node.keys[i]: node.keys[i +1] = node.keys[i] i -=1 node.keys[i +1] = keyelse:while i >=0and key < node.keys[i]: i -=1 i +=1iflen(node.children[i].keys) == (2*self.t) -1:self.split_child(node, i)if key > node.keys[i]: i +=1self._insert_non_full(node.children[i], key)def split_child(self, parent, i): t =self.t node = parent.children[i] new_node = BPlusTreeNode(leaf=node.leaf) parent.children.insert(i +1, new_node) new_node.keys = node.keys[t:] node.keys = node.keys[:t]ifnot node.leaf: new_node.children = node.children[t:] node.children = node.children[:t]if node.leaf: new_node.next= node.next node.next= new_node parent.keys.insert(i, new_node.keys[0])def traverse(self): node =self.rootwhilenot node.leaf: node = node.children[0]while node:for key in node.keys:print(key, end=' ') node = node.nextdef draw_tree(self):""" This method prints the keys at each level of the B+Tree, with a bounding box around each node. """self._draw_tree(self.root, 0)def _draw_tree(self, node, level):if node: indent =' '* (level *4) # Indentation for levels keys_str =' '.join(str(key) for key in node.keys) node_representation =f"[{keys_str}]"print(indent + node_representation)if node.leaf and node.next:print(indent +' '*4+"|--> Next at same level: ["+' '.join(str(key) for key in node.next.keys) +"]")for child in node.children:self._draw_tree(child, level +1)
bpt = BPlusTree(3) # A B-Tree with min degree 3bpt.insert(10)bpt.insert(20)bpt.insert(5)bpt.insert(6)bpt.insert(12)bpt.insert(30)bpt.insert(7)bpt.insert(17)bpt.insert(18)bpt.insert(19)bpt.insert(21)bpt.insert(22)bpt.traverse()
5 6 7 10 12 17 18 19 20 21 22 30
bpt.draw_tree()
[12 20]
[5 6 7 10]
|--> Next at same level: [12 17 18 19]
[12 17 18 19]
|--> Next at same level: [20 21 22 30]
[20 21 22 30]