Exploratory Data Analysis
Data organisation
Folder structure:
    data/
        iris-dataset/
            dataset.db
        mnist/
            train-test-seed-1.db
            train-test-seed-7.db
            images/
        coco/
            subset-1.db
            coco-full.db
            images/
            annotations/
Each db can have tables like:

idx | split | class | ... | path | label |
---|---|---|---|---|---|
0 | train | cat | ... | images/cat-1.png | 1 |
1 | val | dog | ... | images/dog-1.png | 2 |
... | | | | | |
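For example, a minimal sketch of building such a metadata table as a pandas DataFrame; the rows below are illustrative literals, whereas in practice they would be collected by scanning the images/ folders:

```python
import pandas as pd

# Hypothetical rows mirroring the table above.
rows = [
    {"idx": 0, "split": "train", "class": "cat", "path": "images/cat-1.png", "label": 1},
    {"idx": 1, "split": "val", "class": "dog", "path": "images/dog-1.png", "label": 2},
]
df = pd.DataFrame(rows)
print(df)
```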
Advantages of using a DataFrame:
- dataset is 'cached', avoiding surprises with random sampling
- does not need to be indexed each time, so loads faster
- stores any extra metadata needed per row (e.g. mean, std per image)
- easy to define __getitem__(...) to create a torch.utils.data.Dataset (see the sketch after this list)
- allows for interactive exploration
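As referenced in the list above, here is a minimal sketch of wrapping such a DataFrame in a PyTorch Dataset. The column names path and label follow the example table; the class name DataFrameDataset, the root argument, and the PIL-based image decoding are assumptions for illustration, not part of the original notes:

```python
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from PIL import Image
from torch.utils.data import Dataset


class DataFrameDataset(Dataset):
    """Wrap a metadata DataFrame so row i yields (image_tensor, label)."""

    def __init__(self, df: pd.DataFrame, root: Path):
        self.df = df.reset_index(drop=True)
        self.root = root  # directory the 'path' column is relative to

    def __len__(self) -> int:
        return len(self.df)

    def __getitem__(self, i: int):
        row = self.df.iloc[i]
        # Hypothetical image loading; swap in whatever decoding the dataset needs.
        img = Image.open(self.root / row["path"]).convert("RGB")
        # HWC uint8 -> CHW float tensor in [0, 1]
        x = torch.from_numpy(np.array(img)).permute(2, 0, 1).float() / 255.0
        return x, int(row["label"])
```

Filtering the DataFrame on the split column before constructing the dataset gives separate train/val datasets without touching the files on disk.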
Helpers to write/read sqlite3 db files
import sqlite3
from pathlib import Path
from typing import Literal

import pandas as pd


def save_db(
    df: pd.DataFrame,
    db_path: Path | str,
    table: str,
    if_exists: Literal["fail", "replace", "append"] = "replace",
    index: bool = False,
    verbose: bool = False,
) -> None:
    """Save DataFrame to sqlite3 db as specified table.

    NOTE: default behaviour is to replace the table if it exists."""
    if len(df.columns) == 0:
        print(f"Cannot save empty df ->\n{df}")
        return
    conn = sqlite3.connect(str(db_path))
    df.to_sql(table, conn, if_exists=if_exists, index=index)
    conn.close()
    if verbose:
        print(df)
        print(df.columns)
        print(f"Saved to => {table} => {db_path}")


def load_db(
    db_path: Path | str,
    table: str,
    verbose: bool = False,
) -> pd.DataFrame:
    """Read sqlite3 db, return all rows from specified table."""
    conn = sqlite3.connect(str(db_path))
    df = pd.read_sql_query(f"SELECT * FROM '{table}'", conn)
    conn.close()
    if verbose:
        print(df)
        print(df.columns)
        print(f"Loaded from => {table} => {db_path}")
    return df
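A quick round-trip usage sketch of these helpers; the db path follows the folder layout above, and the table name "metadata" is an assumed name for illustration:

```python
from pathlib import Path

import pandas as pd

df = pd.DataFrame(
    {
        "idx": [0, 1],
        "split": ["train", "val"],
        "class": ["cat", "dog"],
        "path": ["images/cat-1.png", "images/dog-1.png"],
        "label": [1, 2],
    }
)

# Any db path from the layout above works; the parent folder must exist first.
db_path = Path("data/iris-dataset/dataset.db")
db_path.parent.mkdir(parents=True, exist_ok=True)

save_db(df, db_path, table="metadata", verbose=True)
df_back = load_db(db_path, table="metadata")
print(df_back)
```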