Polars datamodule

PyTorch Lightning DataModule for loading a dataset using Polars.

PolarsDataModule #

Bases: LightningDataModule

PyTorch Lightning DataModule for loading a dataset using Polars.

Source code in src/data/polars_datamodule.py
class PolarsDataModule(LightningDataModule):
    """PyTorch Lightning DataModule for loading dataset using Polars."""

    def __init__(
        self, data_path: str, output_column: str, batch_size: int = 32, num_workers: int = 0, test_size: float = 0.2
    ) -> None:
        """Initialize the PolarsDataModule.

        Args:
            data_path: Path to the dataset.
            output_column: Column name that contains the labels.
            batch_size: Batch size for the dataloaders.
            num_workers: Number of workers for the dataloaders.
            test_size: Fraction of the dataset to be used for validation.
        """
        super().__init__()
        self.data_path = data_path
        self.output_column = output_column
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.test_size = test_size
        self.df = None  # Will hold the loaded Polars DataFrame

    def setup(self, stage: str = "") -> None:
        """Load and split the dataset into train and validation sets."""
        # Load dataset using Polars
        self.df = pl.read_csv(self.data_path)

        # Split the data into train and validation sets
        train_df, val_df = train_test_split(self.df, test_size=self.test_size, random_state=42)

        self.train_dataset = PolarsDataset(pl.DataFrame(train_df), output_column=self.output_column)
        self.val_dataset = PolarsDataset(pl.DataFrame(val_df), output_column=self.output_column)

    def train_dataloader(self) -> DataLoader:
        """Create and return the train dataloader."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=True)

    def val_dataloader(self) -> DataLoader:
        """Create and return the validation dataloader."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.num_workers)
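
A minimal usage sketch. The CSV path, the label column name, and MyModel are placeholders for your own data and LightningModule, and the Trainer import is assumed to come from pytorch_lightning (newer setups may use lightning.pytorch instead):

from pytorch_lightning import Trainer  # assumption: your project may import from lightning.pytorch

from src.data.polars_datamodule import PolarsDataModule

datamodule = PolarsDataModule(
    data_path="data/train.csv",   # hypothetical CSV path
    output_column="target",       # hypothetical label column
    batch_size=64,
    num_workers=4,
)

model = MyModel()                 # placeholder: any LightningModule
trainer = Trainer(max_epochs=10)
trainer.fit(model, datamodule=datamodule)

Lightning calls setup() and the two dataloader methods itself during fit, so nothing else is required from the caller.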

__init__(data_path, output_column, batch_size=32, num_workers=0, test_size=0.2) #

Initialize the PolarsDataModule.

Parameters:

Name           Type   Description                                          Default
data_path      str    Path to the dataset.                                 required
output_column  str    Column name that contains the labels.                required
batch_size     int    Batch size for the dataloaders.                      32
num_workers    int    Number of workers for the dataloaders.               0
test_size      float  Fraction of the dataset to be used for validation.   0.2
Source code in src/data/polars_datamodule.py
def __init__(
    self, data_path: str, output_column: str, batch_size: int = 32, num_workers: int = 0, test_size: float = 0.2
) -> None:
    """Initialize the PolarsDataModule.

    Args:
        data_path: Path to the dataset.
        output_column: Column name that contains the labels.
        batch_size: Batch size for the dataloaders.
        num_workers: Number of workers for the dataloaders.
        test_size: Fraction of the dataset to be used for validation.
    """
    super().__init__()
    self.data_path = data_path
    self.output_column = output_column
    self.batch_size = batch_size
    self.num_workers = num_workers
    self.test_size = test_size
    self.df = None  # Will hold the loaded Polars DataFrame

setup(stage='') #

Load and split the dataset into train and validation sets.

Source code in src/data/polars_datamodule.py
def setup(self, stage: str = "") -> None:
    """Load and split the dataset into train and validation sets."""
    # Load dataset using Polars
    self.df = pl.read_csv(self.data_path)

    # Split the data into train and validation sets
    train_df, val_df = train_test_split(self.df, test_size=self.test_size, random_state=42)

    self.train_dataset = PolarsDataset(pl.DataFrame(train_df), output_column=self.output_column)
    self.val_dataset = PolarsDataset(pl.DataFrame(val_df), output_column=self.output_column)
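
setup() reads the full CSV into memory and hands it to scikit-learn's train_test_split with a fixed random_state of 42, so the split is reproducible and roughly test_size of the rows (20% by default) land in the validation set. A quick sketch of calling it directly, with a placeholder path and column name:

from src.data.polars_datamodule import PolarsDataModule

dm = PolarsDataModule(data_path="data/train.csv", output_column="target", test_size=0.2)
dm.setup()
print(len(dm.train_dataset), len(dm.val_dataset))  # e.g. 800 and 200 for a 1000-row CSV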

train_dataloader() #

Create and return the train dataloader.

Source code in src/data/polars_datamodule.py
def train_dataloader(self) -> DataLoader:
    """Create and return the train dataloader."""
    return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=True)

val_dataloader() #

Create and return the validation dataloader.

Source code in src/data/polars_datamodule.py
def val_dataloader(self) -> DataLoader:
    """Create and return the validation dataloader."""
    return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.num_workers)
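
Both dataloaders yield (features, labels) batches via PyTorch's default collate function; only the training loader shuffles. A short sketch of pulling one batch, assuming the dm instance from the setup() sketch above:

train_loader = dm.train_dataloader()

features, labels = next(iter(train_loader))
print(features.shape, features.dtype)  # (batch_size, n_feature_columns), torch.float32
print(labels.shape, labels.dtype)      # (batch_size,), torch.int64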

PolarsDataset #

Bases: Dataset

Custom PyTorch Dataset wrapping a Polars DataFrame.

Source code in src/data/polars_datamodule.py
class PolarsDataset(Dataset):
    """Custom PyTorch Dataset wrapping a Polars DataFrame."""

    def __init__(self, df: pl.DataFrame, output_column: str) -> None:
        """Initialize the PolarsDataset."""
        self.df = df
        self.output_column = output_column

    def __len__(self) -> int:
        """Return the number of rows in the dataset."""
        return self.df.shape[0]

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        """Return the features and label for the given index."""
        row = self.df.row(idx, named=True)  # fetch the row as a {column: value} dict
        features = torch.tensor([val for col, val in row.items() if col != self.output_column], dtype=torch.float32)
        label = torch.tensor(row[self.output_column], dtype=torch.long)
        return features, label
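
A self-contained sketch of using PolarsDataset on its own, assuming it is imported from src.data.polars_datamodule and given a small in-memory frame whose label column is named "label":

import polars as pl

from src.data.polars_datamodule import PolarsDataset

df = pl.DataFrame({
    "x1": [0.1, 0.5, 0.9],
    "x2": [1.0, 2.0, 3.0],
    "label": [0, 1, 0],
})

dataset = PolarsDataset(df, output_column="label")
print(len(dataset))             # 3
features, label = dataset[0]
print(features)                 # tensor([0.1000, 1.0000])
print(label)                    # tensor(0)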

__getitem__(idx) #

Return the features and label for the given index.

Source code in src/data/polars_datamodule.py
def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
    """Return the features and label for the given index."""
    row = self.df.row(idx, named=True)  # fetch the row as a {column: value} dict
    features = torch.tensor([val for col, val in row.items() if col != self.output_column], dtype=torch.float32)
    label = torch.tensor(row[self.output_column], dtype=torch.long)
    return features, label

__init__(df, output_column) #

Initialize the PolarsDataset.

Source code in src/data/polars_datamodule.py
def __init__(self, df: pl.DataFrame, output_column: str) -> None:
    """Initialize the PolarsDataset."""
    self.df = df
    self.output_column = output_column

__len__() #

Return the number of rows in the dataset.

Source code in src/data/polars_datamodule.py
def __len__(self) -> int:
    """Return the number of rows in the dataset."""
    return self.df.shape[0]