Recently, I've been looking at a problem where a stream of records is joined with enrichment data stored in a Pandas dataframe. For each record being processed, a lookup is performed and a single record is selected from the enrichment data based on a key. Essentially, this is a SQL join operation, performed record by record because one of the datasets couldn't fit into memory.
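To make the access pattern concrete, here is a minimal sketch of that per-record join; the record stream, the "key" field, and the enrichment mapping are hypothetical stand-ins, not the actual pipeline:

def enrich_records(records, enrichment):
    """Join each incoming record with its enrichment row, selected by key."""
    for record in records:
        extra = enrichment[record["key"]]  # one keyed lookup per record, like a SQL join
        yield {**record, **extra}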
Having had experience with dataframes being slow (particularly when iterating through rows), I investigated whether the lookup would be faster if the enrichment data was stored in a Python dict as opposed to a dataframe.

In my experiments, I was able to get a 70 times speed improvement using a dict over a dataframe, even when indexing the dataframe.
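As a practical note, pandas can turn an indexed dataframe into exactly this kind of dict with to_dict; a minimal sketch, assuming df is already indexed by the lookup key:

rows = df.to_dict(orient="index")  # {index value -> {column name -> value}}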
The Python code used in the experiments is here:
# Parameters
num_entries = 1000000  # Number of rows of data to generate
num_columns = 10       # Number of columns in each row
num_lookups = 10000    # Number of test lookups to perform

# Import the required libraries
from faker import Faker
import pandas as pd
import random
import timeit

fake = Faker()

def generate_value():
    """Generate a fake value for a single cell."""
    return fake.sentence()

def generate_row(num_columns):
    """Generate a single row of data."""
    return {f"column-{idx}": generate_value() for idx in range(num_columns)}

def generate_id(idx):
    """Generate an ID (used in the lookup)."""
    return f"ID-{idx}"

def generate_dataframe(num_entries, num_columns):
    """Generate the dataset as both a dataframe and a dict keyed by ID."""
    rows_for_df = []
    rows = {}
    for idx in range(num_entries):
        data = generate_row(num_columns)
        rows[generate_id(idx)] = data
        data["id"] = generate_id(idx)
        rows_for_df.append(data)
    return pd.DataFrame(rows_for_df), rows

# Generate the dataset
df, rows = generate_dataframe(num_entries, num_columns)

# Index the dataframe by the 'id' column for a faster lookup
df = df.set_index("id")

print(f"Generated {len(rows)} rows of data")

def perform_lookup_on_dicts(rows, num_lookups, max_id):
    """Perform num_lookups random lookups against the dict."""
    for _ in range(num_lookups):
        idx = random.randint(0, max_id - 1)
        data = rows[generate_id(idx)]

def perform_lookup_on_dataframe(df, num_lookups, max_id):
    """Perform num_lookups random lookups against the indexed dataframe."""
    for _ in range(num_lookups):
        idx = random.randint(0, max_id - 1)
        data = df.loc[generate_id(idx)]

# Time both approaches (IPython/Jupyter magics)
%timeit perform_lookup_on_dicts(rows, num_lookups, num_entries)
%timeit perform_lookup_on_dataframe(df, num_lookups, num_entries)
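The %timeit lines above assume an IPython or Jupyter session. Outside a notebook, the already-imported timeit module can drive the same comparison; a minimal sketch (the repetition count of 10 is an arbitrary choice):

dict_time = timeit.timeit(
    lambda: perform_lookup_on_dicts(rows, num_lookups, num_entries), number=10)
df_time = timeit.timeit(
    lambda: perform_lookup_on_dataframe(df, num_lookups, num_entries), number=10)
print(f"dict:      {dict_time / 10:.4f} s per run")
print(f"dataframe: {df_time / 10:.4f} s per run")
print(f"speedup:   {df_time / dict_time:.0f}x")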